gitextract_ft07ahjp/

├── .clang-format
├── .compatibility
├── .coveragerc
├── .cuda_ext.json
├── .github/
│   ├── CODEOWNERS
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug-report.yml
│   │   ├── config.yml
│   │   ├── documentation.yml
│   │   ├── feature_request.yml
│   │   └── proposal.yml
│   ├── pull_request_template.md
│   └── workflows/
│       ├── README.md
│       ├── build_on_pr.yml
│       ├── build_on_schedule.yml
│       ├── close_inactive.yml
│       ├── compatiblity_test_on_dispatch.yml
│       ├── compatiblity_test_on_pr.yml
│       ├── compatiblity_test_on_schedule.yml
│       ├── cuda_ext_check_before_merge.yml
│       ├── doc_build_on_schedule_after_release.yml
│       ├── doc_check_on_pr.yml
│       ├── doc_test_on_pr.yml
│       ├── doc_test_on_schedule.yml
│       ├── draft_github_release_post_after_merge.yml
│       ├── example_check_on_dispatch.yml
│       ├── example_check_on_pr.yml
│       ├── example_check_on_schedule.yml
│       ├── release_docker_after_publish.yml
│       ├── release_nightly_on_schedule.yml
│       ├── release_pypi_after_merge.yml
│       ├── release_test_pypi_before_merge.yml
│       ├── report_leaderboard_to_lark.yml
│       ├── report_test_coverage.yml
│       ├── run_chatgpt_examples.yml
│       ├── run_chatgpt_unit_tests.yml
│       ├── run_colossalqa_unit_tests.yml
│       ├── scripts/
│       │   ├── check_doc_i18n.py
│       │   ├── example_checks/
│       │   │   ├── check_dispatch_inputs.py
│       │   │   ├── check_example_weekly.py
│       │   │   └── detect_changed_example.py
│       │   ├── generate_leaderboard_and_send_to_lark.py
│       │   ├── generate_release_draft.py
│       │   ├── send_message_to_lark.py
│       │   └── update_setup_for_nightly.py
│       ├── submodule.yml
│       └── translate_comment.yml
├── .gitignore
├── .gitmodules
├── .isort.cfg
├── .pre-commit-config.yaml
├── CHANGE_LOG.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── applications/
│   ├── Colossal-LLaMA/
│   │   ├── README.md
│   │   ├── colossal_llama/
│   │   │   ├── __init__.py
│   │   │   ├── dataset/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── conversation.py
│   │   │   │   ├── dummy_dataset.py
│   │   │   │   ├── loader.py
│   │   │   │   └── spliced_and_tokenized_dataset.py
│   │   │   ├── model/
│   │   │   │   └── init_model.py
│   │   │   ├── tokenizer/
│   │   │   │   └── init_tokenizer.py
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── ckpt_io.py
│   │   │       ├── froze.py
│   │   │       ├── neftune_patch.py
│   │   │       ├── stream_chat_patch.py
│   │   │       └── utils.py
│   │   ├── dataset/
│   │   │   ├── prepare_pretrain_dataset.py
│   │   │   └── prepare_sft_dataset.py
│   │   ├── docs/
│   │   │   ├── example_13b.md
│   │   │   └── example_7b.md
│   │   ├── hostfile.example
│   │   ├── inference/
│   │   │   ├── inference_example.py
│   │   │   └── stream_chat_example.py
│   │   ├── requirements.txt
│   │   ├── setup.py
│   │   ├── train.example.sh
│   │   ├── train.py
│   │   ├── train_sft.example.sh
│   │   └── version.txt
│   ├── ColossalChat/
│   │   ├── .gitignore
│   │   ├── LICENSE
│   │   ├── README.md
│   │   ├── benchmarks/
│   │   │   ├── Opt.json
│   │   │   ├── README.md
│   │   │   ├── benchmark_dpo.sh
│   │   │   ├── benchmark_kto.sh
│   │   │   ├── benchmark_memory_consumption.txt
│   │   │   ├── benchmark_orpo.sh
│   │   │   ├── benchmark_performance_summarization.txt
│   │   │   ├── benchmark_ppo.py
│   │   │   ├── benchmark_ppo.sh
│   │   │   ├── benchmark_sft.sh
│   │   │   ├── benchmark_simpo.sh
│   │   │   ├── data_preparation.sh
│   │   │   ├── dummy_dataset.py
│   │   │   ├── prepare_dummy_test_dataset.py
│   │   │   └── ray/
│   │   │       ├── 1mmt_dummy.py
│   │   │       └── mmmt_dummy.py
│   │   ├── coati/
│   │   │   ├── __init__.py
│   │   │   ├── dataset/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── conversation.py
│   │   │   │   ├── loader.py
│   │   │   │   ├── tokenization_utils.py
│   │   │   │   └── utils.py
│   │   │   ├── distributed/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── comm.py
│   │   │   │   ├── consumer.py
│   │   │   │   ├── grpo_consumer.py
│   │   │   │   ├── inference_backend.py
│   │   │   │   ├── launch.py
│   │   │   │   ├── launch_zero_bubble.py
│   │   │   │   ├── loss.py
│   │   │   │   ├── producer.py
│   │   │   │   ├── profiling_utils.py
│   │   │   │   ├── reward/
│   │   │   │   │   ├── code_reward/
│   │   │   │   │   │   ├── testing_util.py
│   │   │   │   │   │   └── utils.py
│   │   │   │   │   ├── reward_fn.py
│   │   │   │   │   ├── reward_utils.py
│   │   │   │   │   └── verifiable_reward.py
│   │   │   │   ├── utils.py
│   │   │   │   └── zero_bubble/
│   │   │   │       ├── README.md
│   │   │   │       ├── __init__.py
│   │   │   │       ├── consumer.py
│   │   │   │       ├── distributor.py
│   │   │   │       ├── grpo_consumer.py
│   │   │   │       ├── producer.py
│   │   │   │       └── requirements.txt
│   │   │   ├── experience_buffer/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── naive.py
│   │   │   │   └── utils.py
│   │   │   ├── experience_maker/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   └── naive.py
│   │   │   ├── models/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── critic.py
│   │   │   │   ├── generation.py
│   │   │   │   ├── lora.py
│   │   │   │   ├── loss.py
│   │   │   │   ├── reward_model.py
│   │   │   │   ├── rlvr_reward_model.py
│   │   │   │   └── utils.py
│   │   │   ├── quant/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── llama_gptq/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── loader.py
│   │   │   │   │   ├── model_utils.py
│   │   │   │   │   └── quant.py
│   │   │   │   └── utils.py
│   │   │   ├── ray/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── callbacks/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   └── performance_evaluator.py
│   │   │   │   ├── detached_replay_buffer.py
│   │   │   │   ├── detached_trainer_base.py
│   │   │   │   ├── detached_trainer_ppo.py
│   │   │   │   ├── experience_maker_holder.py
│   │   │   │   ├── lora_constructor.py
│   │   │   │   └── utils.py
│   │   │   ├── trainer/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── callbacks/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   └── performance_evaluator.py
│   │   │   │   ├── dpo.py
│   │   │   │   ├── grpo.py
│   │   │   │   ├── kto.py
│   │   │   │   ├── orpo.py
│   │   │   │   ├── ppo.py
│   │   │   │   ├── rm.py
│   │   │   │   ├── sft.py
│   │   │   │   └── utils.py
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── accumulative_meter.py
│   │   │       ├── ckpt_io.py
│   │   │       └── reward_score/
│   │   │           ├── __init__.py
│   │   │           ├── competition.py
│   │   │           ├── gsm8k.py
│   │   │           └── utils.py
│   │   ├── conversation_template/
│   │   │   ├── 01-ai_Yi-1.5-9B-Chat.json
│   │   │   ├── MiniCPM-2b.json
│   │   │   ├── Qwen_Qwen1.5-110B-Chat.json
│   │   │   ├── Qwen_Qwen1.5-32B-Chat.json
│   │   │   ├── Qwen_Qwen2.5-3B.json
│   │   │   ├── THUDM_chatglm2-6b.json
│   │   │   ├── THUDM_chatglm3-6b.json
│   │   │   ├── baichuan-inc_Baichuan2-13B-Chat.json
│   │   │   ├── colossal-llama2.json
│   │   │   ├── deepseek-ai_DeepSeek-V2-Lite.json
│   │   │   ├── llama2.json
│   │   │   ├── microsoft_phi-2.json
│   │   │   ├── mistralai_Mixtral-8x7B-Instruct-v0.1.json
│   │   │   └── tiny-llama.json
│   │   ├── examples/
│   │   │   ├── README.md
│   │   │   ├── community/
│   │   │   │   ├── README.md
│   │   │   │   ├── peft/
│   │   │   │   │   ├── README.md
│   │   │   │   │   ├── easy_dataset.py
│   │   │   │   │   ├── easy_models.py
│   │   │   │   │   ├── train_peft_prompts.py
│   │   │   │   │   └── train_peft_sft.py
│   │   │   │   └── ray/
│   │   │   │       ├── README.md
│   │   │   │       ├── ray_job_script.py
│   │   │   │       └── train_prompts_on_ray.py
│   │   │   ├── data_preparation_scripts/
│   │   │   │   ├── prepare_dataset.py
│   │   │   │   ├── prepare_kto_dataset.sh
│   │   │   │   ├── prepare_preference_dataset.sh
│   │   │   │   ├── prepare_prompt_dataset.sh
│   │   │   │   └── prepare_sft_dataset.sh
│   │   │   ├── inference/
│   │   │   │   ├── chatio.py
│   │   │   │   ├── inference.py
│   │   │   │   └── web_chatbot/
│   │   │   │       ├── README.md
│   │   │   │       ├── locustfile.py
│   │   │   │       ├── requirements.txt
│   │   │   │       ├── server.py
│   │   │   │       └── utils.py
│   │   │   ├── requirements.txt
│   │   │   └── training_scripts/
│   │   │       ├── hostfile
│   │   │       ├── lora_config.json
│   │   │       ├── lora_finetune.py
│   │   │       ├── lora_sft_data.jsonl
│   │   │       ├── train_dpo.py
│   │   │       ├── train_dpo.sh
│   │   │       ├── train_grpo.py
│   │   │       ├── train_grpo.sh
│   │   │       ├── train_kto.py
│   │   │       ├── train_kto.sh
│   │   │       ├── train_orpo.py
│   │   │       ├── train_orpo.sh
│   │   │       ├── train_ppo.py
│   │   │       ├── train_ppo.sh
│   │   │       ├── train_rm.py
│   │   │       ├── train_rm.sh
│   │   │       ├── train_sft.py
│   │   │       └── train_sft.sh
│   │   ├── profiling.sh
│   │   ├── pytest.ini
│   │   ├── rl_example.py
│   │   ├── rl_example_zero_bubble.py
│   │   ├── setup.py
│   │   ├── start_code_verifier.py
│   │   ├── tests/
│   │   │   ├── __init__.py
│   │   │   ├── generate_dummy_datasets_for_testing.py
│   │   │   ├── llama.json
│   │   │   ├── opt.json
│   │   │   ├── prepare_test_env.sh
│   │   │   ├── test_data/
│   │   │   │   ├── dpo/
│   │   │   │   │   └── test_dpo_data.jsonl
│   │   │   │   ├── kto/
│   │   │   │   │   └── test_kto_data.jsonl
│   │   │   │   └── sft/
│   │   │   │       └── test_sft_data.jsonl
│   │   │   ├── test_data_preparation.sh
│   │   │   ├── test_lora.py
│   │   │   ├── test_templating.sh
│   │   │   ├── test_train.sh
│   │   │   └── verify_chat_data.py
│   │   └── visualization.py
│   ├── ColossalEval/
│   │   ├── README.md
│   │   ├── colossal_eval/
│   │   │   ├── __init__.py
│   │   │   ├── dataset/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── agieval.py
│   │   │   │   ├── base.py
│   │   │   │   ├── ceval.py
│   │   │   │   ├── cmmlu.py
│   │   │   │   ├── colossalai.py
│   │   │   │   ├── cvalues.py
│   │   │   │   ├── gaokaobench.py
│   │   │   │   ├── gsm.py
│   │   │   │   ├── longbench.py
│   │   │   │   ├── mmlu.py
│   │   │   │   ├── mtbench.py
│   │   │   │   ├── safetybench_en.py
│   │   │   │   └── safetybench_zh.py
│   │   │   ├── evaluate/
│   │   │   │   ├── GPT Evaluation.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── dataset_evaluator/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── dataset_evaluator.py
│   │   │   │   │   ├── gpt_judge.py
│   │   │   │   │   └── metrics.py
│   │   │   │   ├── evaluator.py
│   │   │   │   ├── gpt_evaluate.py
│   │   │   │   └── utils.py
│   │   │   ├── models/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── chatglm.py
│   │   │   │   ├── huggingface.py
│   │   │   │   └── vllm.py
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── conversation.py
│   │   │       └── utilities.py
│   │   ├── configs/
│   │   │   └── gpt_evaluation/
│   │   │       ├── config/
│   │   │       │   ├── config_cn.json
│   │   │       │   └── config_en.json
│   │   │       ├── data/
│   │   │       │   ├── eval_cn_examples.json
│   │   │       │   └── eval_en_examples.json
│   │   │       └── prompt/
│   │   │           ├── battle_prompt/
│   │   │           │   ├── battle_prompt_cn.json
│   │   │           │   └── battle_prompt_en.json
│   │   │           └── evaluation_prompt/
│   │   │               ├── evaluation_prompt_cn.json
│   │   │               └── evaluation_prompt_en.json
│   │   ├── examples/
│   │   │   ├── dataset_evaluation/
│   │   │   │   ├── config/
│   │   │   │   │   ├── evaluation/
│   │   │   │   │   │   └── config.json
│   │   │   │   │   └── inference/
│   │   │   │   │       └── config.json
│   │   │   │   ├── eval_dataset.py
│   │   │   │   ├── eval_dataset.sh
│   │   │   │   ├── inference.py
│   │   │   │   └── inference.sh
│   │   │   └── gpt_evaluation/
│   │   │       ├── config/
│   │   │       │   ├── evaluation/
│   │   │       │   │   └── config.json
│   │   │       │   └── inference/
│   │   │       │       └── config.json
│   │   │       ├── eval.py
│   │   │       ├── eval.sh
│   │   │       ├── inference.py
│   │   │       └── inference.sh
│   │   ├── requirements.txt
│   │   └── setup.py
│   ├── ColossalMoE/
│   │   ├── README.md
│   │   ├── infer.py
│   │   ├── infer.sh
│   │   ├── requirements.txt
│   │   ├── setup.py
│   │   ├── tests/
│   │   │   └── __init__.py
│   │   ├── train.py
│   │   ├── train.sh
│   │   ├── utils.py
│   │   └── version.txt
│   ├── ColossalQA/
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── colossalqa/
│   │   │   ├── __init__.py
│   │   │   ├── chain/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── memory/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── summary.py
│   │   │   │   └── retrieval_qa/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── base.py
│   │   │   │       ├── load_chain.py
│   │   │   │       └── stuff.py
│   │   │   ├── data_loader/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── document_loader.py
│   │   │   │   └── table_dataloader.py
│   │   │   ├── local/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── colossalcloud_llm.py
│   │   │   │   ├── llm.py
│   │   │   │   ├── pangu_llm.py
│   │   │   │   └── utils.py
│   │   │   ├── memory.py
│   │   │   ├── mylogging.py
│   │   │   ├── prompt/
│   │   │   │   ├── README.md
│   │   │   │   └── prompt.py
│   │   │   ├── retrieval_conversation_en.py
│   │   │   ├── retrieval_conversation_universal.py
│   │   │   ├── retrieval_conversation_zh.py
│   │   │   ├── retriever.py
│   │   │   ├── text_splitter/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── chinese_text_splitter.py
│   │   │   │   └── utils.py
│   │   │   └── utils.py
│   │   ├── data/
│   │   │   ├── data_sample/
│   │   │   │   ├── companies.txt
│   │   │   │   ├── companies_zh.txt
│   │   │   │   ├── csv_organization_100.csv
│   │   │   │   ├── custom_service.json
│   │   │   │   ├── custom_service_classification.json
│   │   │   │   ├── custom_service_preprocessed.json
│   │   │   │   └── luchen_zh.txt
│   │   │   └── tests/
│   │   │       ├── 64KB.json
│   │   │       ├── companies.csv
│   │   │       ├── test.html
│   │   │       ├── test.md
│   │   │       └── test.txt
│   │   ├── examples/
│   │   │   ├── conversation_agent_chatgpt.py
│   │   │   ├── retrieval_conversation_chatgpt.py
│   │   │   ├── retrieval_conversation_en.py
│   │   │   ├── retrieval_conversation_en_customer_service.py
│   │   │   ├── retrieval_conversation_universal.py
│   │   │   ├── retrieval_conversation_zh.py
│   │   │   ├── retrieval_intent_classification_zh_customer_service.py
│   │   │   └── webui_demo/
│   │   │       ├── RAG_ChatBot.py
│   │   │       ├── README.md
│   │   │       ├── config.py
│   │   │       ├── requirements.txt
│   │   │       ├── server.py
│   │   │       ├── utils.py
│   │   │       └── webui.py
│   │   ├── pytest.ini
│   │   ├── requirements.txt
│   │   ├── setup.py
│   │   ├── tests/
│   │   │   ├── __init__.py
│   │   │   ├── test_document_loader.py
│   │   │   ├── test_memory.py
│   │   │   ├── test_retrieval_qa.py
│   │   │   └── test_text_splitter.py
│   │   └── version.txt
│   └── README.md
├── colossalai/
│   ├── _C/
│   │   └── __init__.py
│   ├── __init__.py
│   ├── _analyzer/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── _subclasses/
│   │   │   ├── __init__.py
│   │   │   ├── _meta_registration.py
│   │   │   ├── _monkey_patch.py
│   │   │   ├── flop_tensor.py
│   │   │   └── meta_tensor.py
│   │   ├── envs.py
│   │   └── fx/
│   │       ├── __init__.py
│   │       ├── codegen.py
│   │       ├── graph_module.py
│   │       ├── node_util.py
│   │       ├── passes/
│   │       │   ├── __init__.py
│   │       │   ├── graph_profile.py
│   │       │   └── shape_prop.py
│   │       ├── symbolic_profile.py
│   │       └── tracer/
│   │           ├── __init__.py
│   │           ├── bias_addition.py
│   │           ├── custom_leaf_module.py
│   │           ├── proxy.py
│   │           ├── symbolic_trace.py
│   │           └── tracer.py
│   ├── accelerator/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── api.py
│   │   ├── base_accelerator.py
│   │   ├── cpu_accelerator.py
│   │   ├── cuda_accelerator.py
│   │   └── npu_accelerator.py
│   ├── amp/
│   │   ├── __init__.py
│   │   └── naive_amp/
│   │       ├── __init__.py
│   │       ├── grad_scaler/
│   │       │   ├── __init__.py
│   │       │   ├── base_grad_scaler.py
│   │       │   ├── constant_grad_scaler.py
│   │       │   └── dynamic_grad_scaler.py
│   │       ├── mixed_precision_mixin/
│   │       │   ├── __init__.py
│   │       │   ├── base.py
│   │       │   ├── bf16.py
│   │       │   └── fp16.py
│   │       └── mixed_precision_optimizer.py
│   ├── auto_parallel/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── checkpoint/
│   │   │   ├── __init__.py
│   │   │   ├── build_c_ext.py
│   │   │   ├── ckpt_solver_base.py
│   │   │   ├── ckpt_solver_chen.py
│   │   │   ├── ckpt_solver_rotor.c
│   │   │   ├── ckpt_solver_rotor.py
│   │   │   └── operation.py
│   │   ├── meta_profiler/
│   │   │   ├── __init__.py
│   │   │   ├── constants.py
│   │   │   ├── meta_registry/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── activation.py
│   │   │   │   ├── binary_elementwise_ops.py
│   │   │   │   ├── conv.py
│   │   │   │   ├── embedding.py
│   │   │   │   ├── linear.py
│   │   │   │   ├── non_spmd.py
│   │   │   │   ├── norm.py
│   │   │   │   ├── pooling.py
│   │   │   │   ├── tensor.py
│   │   │   │   └── where.py
│   │   │   ├── registry.py
│   │   │   └── shard_metainfo.py
│   │   ├── offload/
│   │   │   ├── __init__.py
│   │   │   ├── amp_optimizer.py
│   │   │   ├── base_offload_module.py
│   │   │   ├── mem_optimize.py
│   │   │   ├── region.py
│   │   │   ├── region_manager.py
│   │   │   ├── runtime.py
│   │   │   ├── solver.py
│   │   │   ├── training_simulator.py
│   │   │   └── util.py
│   │   ├── passes/
│   │   │   ├── __init__.py
│   │   │   ├── comm_metainfo_pass.py
│   │   │   ├── constants.py
│   │   │   ├── meta_info_prop.py
│   │   │   ├── runtime_apply_pass.py
│   │   │   └── runtime_preparation_pass.py
│   │   ├── pipeline_shard/
│   │   │   └── __init__.py
│   │   └── tensor_shard/
│   │       ├── __init__.py
│   │       ├── constants.py
│   │       ├── initialize.py
│   │       ├── node_handler/
│   │       │   ├── __init__.py
│   │       │   ├── addmm_handler.py
│   │       │   ├── batch_norm_handler.py
│   │       │   ├── binary_elementwise_handler.py
│   │       │   ├── bmm_handler.py
│   │       │   ├── conv_handler.py
│   │       │   ├── default_reshape_handler.py
│   │       │   ├── embedding_handler.py
│   │       │   ├── getattr_handler.py
│   │       │   ├── getitem_handler.py
│   │       │   ├── layer_norm_handler.py
│   │       │   ├── linear_handler.py
│   │       │   ├── matmul_handler.py
│   │       │   ├── node_handler.py
│   │       │   ├── normal_pooling_handler.py
│   │       │   ├── output_handler.py
│   │       │   ├── permute_handler.py
│   │       │   ├── placeholder_handler.py
│   │       │   ├── registry.py
│   │       │   ├── softmax_handler.py
│   │       │   ├── split_handler.py
│   │       │   ├── strategy/
│   │       │   │   ├── __init__.py
│   │       │   │   ├── batch_norm_generator.py
│   │       │   │   ├── binary_elementwise_generator.py
│   │       │   │   ├── conv_strategy_generator.py
│   │       │   │   ├── embedding_generator.py
│   │       │   │   ├── getattr_generator.py
│   │       │   │   ├── getitem_generator.py
│   │       │   │   ├── layer_norm_generator.py
│   │       │   │   ├── matmul_strategy_generator.py
│   │       │   │   ├── normal_pooling_generator.py
│   │       │   │   ├── output_generator.py
│   │       │   │   ├── placeholder_generator.py
│   │       │   │   ├── reshape_generator.py
│   │       │   │   ├── softmax_generator.py
│   │       │   │   ├── strategy_generator.py
│   │       │   │   ├── sum_generator.py
│   │       │   │   ├── tensor_constructor_generator.py
│   │       │   │   ├── unary_elementwise_generator.py
│   │       │   │   └── where_generator.py
│   │       │   ├── sum_handler.py
│   │       │   ├── tensor_constructor_handler.py
│   │       │   ├── transpose_handler.py
│   │       │   ├── unary_elementwise_handler.py
│   │       │   ├── view_handler.py
│   │       │   └── where_handler.py
│   │       ├── options.py
│   │       ├── sharding_strategy.py
│   │       ├── solver/
│   │       │   ├── __init__.py
│   │       │   ├── cost_graph.py
│   │       │   ├── graph_analysis.py
│   │       │   ├── solver.py
│   │       │   └── strategies_constructor.py
│   │       └── utils/
│   │           ├── __init__.py
│   │           ├── broadcast.py
│   │           ├── factory.py
│   │           ├── misc.py
│   │           ├── reshape.py
│   │           └── sharding.py
│   ├── autochunk/
│   │   ├── autochunk_codegen.py
│   │   ├── estimate_memory.py
│   │   ├── reorder_graph.py
│   │   ├── search_chunk.py
│   │   ├── select_chunk.py
│   │   ├── trace_flow.py
│   │   ├── trace_indice.py
│   │   └── utils.py
│   ├── booster/
│   │   ├── __init__.py
│   │   ├── accelerator.py
│   │   ├── booster.py
│   │   ├── mixed_precision/
│   │   │   ├── __init__.py
│   │   │   ├── bf16.py
│   │   │   ├── fp16_apex.py
│   │   │   ├── fp16_naive.py
│   │   │   ├── fp16_torch.py
│   │   │   ├── fp8.py
│   │   │   └── mixed_precision_base.py
│   │   └── plugin/
│   │       ├── __init__.py
│   │       ├── dp_plugin_base.py
│   │       ├── gemini_plugin.py
│   │       ├── hybrid_parallel_plugin.py
│   │       ├── low_level_zero_plugin.py
│   │       ├── moe_hybrid_parallel_plugin.py
│   │       ├── plugin_base.py
│   │       ├── pp_plugin_base.py
│   │       ├── torch_ddp_plugin.py
│   │       └── torch_fsdp_plugin.py
│   ├── checkpoint_io/
│   │   ├── __init__.py
│   │   ├── checkpoint_io_base.py
│   │   ├── general_checkpoint_io.py
│   │   ├── hybrid_parallel_checkpoint_io.py
│   │   ├── index_file.py
│   │   ├── moe_checkpoint.py
│   │   └── utils.py
│   ├── cli/
│   │   ├── __init__.py
│   │   ├── check/
│   │   │   ├── __init__.py
│   │   │   └── check_installation.py
│   │   ├── cli.py
│   │   └── launcher/
│   │       ├── __init__.py
│   │       ├── hostinfo.py
│   │       ├── multinode_runner.py
│   │       └── run.py
│   ├── cluster/
│   │   ├── __init__.py
│   │   ├── device_mesh_manager.py
│   │   ├── dist_coordinator.py
│   │   ├── process_group_manager.py
│   │   └── process_group_mesh.py
│   ├── context/
│   │   ├── __init__.py
│   │   ├── config.py
│   │   └── singleton_meta.py
│   ├── device/
│   │   ├── __init__.py
│   │   ├── alpha_beta_profiler.py
│   │   ├── calc_pipeline_strategy.py
│   │   └── device_mesh.py
│   ├── fx/
│   │   ├── __init__.py
│   │   ├── _compatibility.py
│   │   ├── _meta_regist_12.py
│   │   ├── _meta_regist_13.py
│   │   ├── codegen/
│   │   │   ├── __init__.py
│   │   │   └── activation_checkpoint_codegen.py
│   │   ├── graph_module.py
│   │   ├── passes/
│   │   │   ├── __init__.py
│   │   │   ├── adding_split_node_pass.py
│   │   │   ├── concrete_info_prop.py
│   │   │   ├── experimental/
│   │   │   │   └── adding_shape_consistency_pass.py
│   │   │   ├── meta_info_prop.py
│   │   │   ├── passes_for_gpt2_test.py
│   │   │   ├── shard_1d_pass.py
│   │   │   ├── split_module.py
│   │   │   └── utils.py
│   │   ├── profiler/
│   │   │   ├── __init__.py
│   │   │   ├── constants.py
│   │   │   ├── dataflow.py
│   │   │   ├── experimental/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── constants.py
│   │   │   │   ├── profiler.py
│   │   │   │   ├── profiler_function/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── activation_function.py
│   │   │   │   │   ├── arithmetic.py
│   │   │   │   │   ├── embedding.py
│   │   │   │   │   ├── linear.py
│   │   │   │   │   ├── normalization.py
│   │   │   │   │   ├── pooling.py
│   │   │   │   │   ├── python_ops.py
│   │   │   │   │   └── torch_ops.py
│   │   │   │   ├── profiler_module/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── activation_function.py
│   │   │   │   │   ├── attention.py
│   │   │   │   │   ├── convolution.py
│   │   │   │   │   ├── dropout.py
│   │   │   │   │   ├── embedding.py
│   │   │   │   │   ├── linear.py
│   │   │   │   │   ├── normalization.py
│   │   │   │   │   ├── pooling.py
│   │   │   │   │   ├── rnn.py
│   │   │   │   │   └── torch_op.py
│   │   │   │   ├── registry.py
│   │   │   │   └── shard_utils.py
│   │   │   ├── memory_utils.py
│   │   │   ├── opcount.py
│   │   │   ├── profiler.py
│   │   │   ├── shard_utils.py
│   │   │   └── tensor.py
│   │   ├── proxy.py
│   │   └── tracer/
│   │       ├── __init__.py
│   │       ├── _meta_trace.py
│   │       ├── _symbolic_trace.py
│   │       ├── _tracer_utils.py
│   │       ├── bias_addition_patch/
│   │       │   ├── __init__.py
│   │       │   ├── patched_bias_addition_function/
│   │       │   │   ├── __init__.py
│   │       │   │   ├── addbmm.py
│   │       │   │   ├── addmm.py
│   │       │   │   ├── bias_addition_function.py
│   │       │   │   └── linear.py
│   │       │   └── patched_bias_addition_module/
│   │       │       ├── __init__.py
│   │       │       ├── bias_addition_module.py
│   │       │       ├── conv.py
│   │       │       └── linear.py
│   │       ├── experimental.py
│   │       ├── meta_patch/
│   │       │   ├── __init__.py
│   │       │   ├── patched_function/
│   │       │   │   ├── __init__.py
│   │       │   │   ├── activation_function.py
│   │       │   │   ├── arithmetic.py
│   │       │   │   ├── convolution.py
│   │       │   │   ├── embedding.py
│   │       │   │   ├── normalization.py
│   │       │   │   ├── python_ops.py
│   │       │   │   └── torch_ops.py
│   │       │   └── patched_module/
│   │       │       ├── __init__.py
│   │       │       ├── activation_function.py
│   │       │       ├── convolution.py
│   │       │       ├── embedding.py
│   │       │       ├── linear.py
│   │       │       ├── normalization.py
│   │       │       ├── pooling.py
│   │       │       └── rnn.py
│   │       ├── registry.py
│   │       └── tracer.py
│   ├── inference/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── batch_bucket.py
│   │   ├── config.py
│   │   ├── core/
│   │   │   ├── __init__.py
│   │   │   ├── async_engine.py
│   │   │   ├── base_engine.py
│   │   │   ├── diffusion_engine.py
│   │   │   ├── engine.py
│   │   │   ├── llm_engine.py
│   │   │   ├── plugin.py
│   │   │   ├── request_handler.py
│   │   │   └── rpc_engine.py
│   │   ├── executor/
│   │   │   ├── __init__.py
│   │   │   └── rpc_worker.py
│   │   ├── flash_decoding_utils.py
│   │   ├── graph_runner.py
│   │   ├── kv_cache/
│   │   │   ├── __init__.py
│   │   │   ├── block_cache.py
│   │   │   └── kvcache_manager.py
│   │   ├── logit_processors.py
│   │   ├── modeling/
│   │   │   ├── __init__.py
│   │   │   ├── backends/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── attention_backend.py
│   │   │   │   └── pre_attention_backend.py
│   │   │   ├── layers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── attention.py
│   │   │   │   ├── baichuan_tp_linear.py
│   │   │   │   ├── diffusion.py
│   │   │   │   └── distrifusion.py
│   │   │   ├── models/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── glide_llama.py
│   │   │   │   ├── nopadding_baichuan.py
│   │   │   │   ├── nopadding_llama.py
│   │   │   │   ├── pixart_alpha.py
│   │   │   │   └── stablediffusion3.py
│   │   │   └── policy/
│   │   │       ├── __init__.py
│   │   │       ├── glide_llama.py
│   │   │       ├── nopadding_baichuan.py
│   │   │       ├── nopadding_llama.py
│   │   │       ├── pixart_alpha.py
│   │   │       └── stablediffusion3.py
│   │   ├── sampler.py
│   │   ├── server/
│   │   │   ├── __init__.py
│   │   │   ├── api_server.py
│   │   │   ├── chat_service.py
│   │   │   ├── completion_service.py
│   │   │   └── utils.py
│   │   ├── spec/
│   │   │   ├── __init__.py
│   │   │   ├── drafter.py
│   │   │   └── struct.py
│   │   ├── struct.py
│   │   └── utils.py
│   ├── initialize.py
│   ├── interface/
│   │   ├── __init__.py
│   │   ├── model.py
│   │   ├── optimizer.py
│   │   └── pretrained.py
│   ├── kernel/
│   │   ├── __init__.py
│   │   ├── jit/
│   │   │   ├── __init__.py
│   │   │   ├── bias_dropout_add.py
│   │   │   ├── bias_gelu.py
│   │   │   └── option.py
│   │   ├── kernel_loader.py
│   │   └── triton/
│   │       ├── __init__.py
│   │       ├── context_attn_unpad.py
│   │       ├── flash_decoding.py
│   │       ├── fused_rotary_embedding.py
│   │       ├── kvcache_copy.py
│   │       ├── llama_act_combine_kernel.py
│   │       ├── no_pad_rotary_embedding.py
│   │       ├── qkv_matmul_kernel.py
│   │       ├── rms_layernorm.py
│   │       ├── rotary_cache_copy.py
│   │       └── softmax.py
│   ├── lazy/
│   │   ├── __init__.py
│   │   ├── construction.py
│   │   ├── lazy_init.py
│   │   └── pretrained.py
│   ├── legacy/
│   │   ├── __init__.py
│   │   ├── amp/
│   │   │   ├── __init__.py
│   │   │   ├── amp_type.py
│   │   │   ├── apex_amp/
│   │   │   │   ├── __init__.py
│   │   │   │   └── apex_amp.py
│   │   │   ├── naive_amp/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _fp16_optimizer.py
│   │   │   │   ├── _utils.py
│   │   │   │   └── naive_amp.py
│   │   │   └── torch_amp/
│   │   │       ├── __init__.py
│   │   │       ├── _grad_scaler.py
│   │   │       └── torch_amp.py
│   │   ├── builder/
│   │   │   ├── __init__.py
│   │   │   └── builder.py
│   │   ├── communication/
│   │   │   ├── __init__.py
│   │   │   ├── collective.py
│   │   │   ├── p2p.py
│   │   │   ├── p2p_v2.py
│   │   │   ├── ring.py
│   │   │   └── utils.py
│   │   ├── constants.py
│   │   ├── context/
│   │   │   ├── __init__.py
│   │   │   ├── parallel_context.py
│   │   │   ├── parallel_mode.py
│   │   │   ├── process_group_initializer/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── initializer_1d.py
│   │   │   │   ├── initializer_2d.py
│   │   │   │   ├── initializer_2p5d.py
│   │   │   │   ├── initializer_3d.py
│   │   │   │   ├── initializer_data.py
│   │   │   │   ├── initializer_model.py
│   │   │   │   ├── initializer_pipeline.py
│   │   │   │   ├── initializer_sequence.py
│   │   │   │   ├── initializer_tensor.py
│   │   │   │   └── process_group_initializer.py
│   │   │   └── random/
│   │   │       ├── __init__.py
│   │   │       ├── _helper.py
│   │   │       └── seed_manager.py
│   │   ├── core.py
│   │   ├── engine/
│   │   │   ├── __init__.py
│   │   │   ├── _base_engine.py
│   │   │   ├── gradient_accumulation/
│   │   │   │   ├── __init__.py
│   │   │   │   └── _gradient_accumulation.py
│   │   │   ├── gradient_handler/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _base_gradient_handler.py
│   │   │   │   ├── _data_parallel_gradient_handler.py
│   │   │   │   ├── _moe_gradient_handler.py
│   │   │   │   ├── _pipeline_parallel_gradient_handler.py
│   │   │   │   ├── _sequence_parallel_gradient_handler.py
│   │   │   │   ├── _zero_gradient_handler.py
│   │   │   │   └── utils.py
│   │   │   └── schedule/
│   │   │       ├── __init__.py
│   │   │       ├── _base_schedule.py
│   │   │       ├── _non_pipeline_schedule.py
│   │   │       ├── _pipeline_schedule.py
│   │   │       └── _pipeline_schedule_v2.py
│   │   ├── global_variables.py
│   │   ├── inference/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── async_engine.py
│   │   │   ├── async_manager.py
│   │   │   ├── dynamic_batching/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── get_tokenizer.py
│   │   │   │   ├── infer_batch.py
│   │   │   │   ├── io_struct.py
│   │   │   │   ├── ray_dist_init.py
│   │   │   │   ├── ray_init_config.py
│   │   │   │   ├── req_queue.py
│   │   │   │   ├── sampling_params.py
│   │   │   │   └── stats.py
│   │   │   ├── hybridengine/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── engine.py
│   │   │   │   ├── modeling/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── _utils.py
│   │   │   │   │   └── llama.py
│   │   │   │   └── polices/
│   │   │   │       ├── __init__.py
│   │   │   │       └── llama.py
│   │   │   ├── manager.py
│   │   │   ├── pipeline/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── benchmark/
│   │   │   │   │   ├── benchmark.py
│   │   │   │   │   └── run.sh
│   │   │   │   └── microbatch_manager.py
│   │   │   ├── quant/
│   │   │   │   ├── gptq/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── cai_gptq/
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── cai_quant_linear.py
│   │   │   │   │       └── gptq_op.py
│   │   │   │   └── smoothquant/
│   │   │   │       ├── __init__.py
│   │   │   │       └── models/
│   │   │   │           ├── __init__.py
│   │   │   │           ├── base_model.py
│   │   │   │           ├── linear.py
│   │   │   │           └── llama.py
│   │   │   ├── serving/
│   │   │   │   ├── ray_serve/
│   │   │   │   │   ├── Colossal_Inference_rayserve.py
│   │   │   │   │   ├── README.md
│   │   │   │   │   ├── send_request.py
│   │   │   │   │   └── send_requests.py
│   │   │   │   ├── test_ci.sh
│   │   │   │   └── torch_serve/
│   │   │   │       ├── Colossal_Inference_Handler.py
│   │   │   │       ├── README.md
│   │   │   │       ├── config.properties
│   │   │   │       ├── docker/
│   │   │   │       │   └── Dockerfile
│   │   │   │       ├── model-config.yaml
│   │   │   │       └── sample_text.txt
│   │   │   └── tensor_parallel/
│   │   │       ├── __init__.py
│   │   │       ├── batch_infer_state.py
│   │   │       ├── engine.py
│   │   │       ├── kvcache_manager.py
│   │   │       ├── modeling/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── _utils.py
│   │   │       │   ├── bloom.py
│   │   │       │   ├── chatglm2.py
│   │   │       │   └── llama.py
│   │   │       └── policies/
│   │   │           ├── __init__.py
│   │   │           ├── bloom.py
│   │   │           ├── chatglm2.py
│   │   │           └── llama.py
│   │   ├── initialize.py
│   │   ├── moe/
│   │   │   ├── layer/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── experts.py
│   │   │   │   ├── layers.py
│   │   │   │   └── routers.py
│   │   │   ├── load_balance.py
│   │   │   ├── manager.py
│   │   │   ├── openmoe/
│   │   │   │   ├── README.md
│   │   │   │   ├── benchmark/
│   │   │   │   │   ├── benchmark_cai.py
│   │   │   │   │   ├── benchmark_cai.sh
│   │   │   │   │   ├── benchmark_cai_dist.sh
│   │   │   │   │   ├── benchmark_fsdp.py
│   │   │   │   │   ├── benchmark_fsdp.sh
│   │   │   │   │   ├── hostfile.txt
│   │   │   │   │   └── utils.py
│   │   │   │   ├── infer.py
│   │   │   │   ├── infer.sh
│   │   │   │   ├── model/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── convert_openmoe_ckpt.py
│   │   │   │   │   ├── convert_openmoe_ckpt.sh
│   │   │   │   │   ├── modeling_openmoe.py
│   │   │   │   │   ├── openmoe_8b_config.json
│   │   │   │   │   ├── openmoe_base_config.json
│   │   │   │   │   └── openmoe_policy.py
│   │   │   │   ├── requirements.txt
│   │   │   │   ├── test_ci.sh
│   │   │   │   ├── train.py
│   │   │   │   └── train.sh
│   │   │   └── utils.py
│   │   ├── nn/
│   │   │   ├── __init__.py
│   │   │   ├── _ops/
│   │   │   │   ├── __init__.py
│   │   │   │   └── _utils.py
│   │   │   ├── layer/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base_layer.py
│   │   │   │   ├── colossalai_layer/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── _utils.py
│   │   │   │   │   ├── dropout.py
│   │   │   │   │   ├── embedding.py
│   │   │   │   │   ├── linear.py
│   │   │   │   │   └── normalization.py
│   │   │   │   ├── parallel_1d/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── _operation.py
│   │   │   │   │   ├── _utils.py
│   │   │   │   │   └── layers.py
│   │   │   │   ├── parallel_2d/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── _operation.py
│   │   │   │   │   ├── _utils.py
│   │   │   │   │   └── layers.py
│   │   │   │   ├── parallel_2p5d/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── _operation.py
│   │   │   │   │   ├── _utils.py
│   │   │   │   │   └── layers.py
│   │   │   │   ├── parallel_3d/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── _operation.py
│   │   │   │   │   ├── _utils.py
│   │   │   │   │   └── layers.py
│   │   │   │   ├── parallel_sequence/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── _operation.py
│   │   │   │   │   ├── _utils.py
│   │   │   │   │   └── layers.py
│   │   │   │   ├── utils/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── common.py
│   │   │   │   ├── vanilla/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── layers.py
│   │   │   │   └── wrapper/
│   │   │   │       ├── __init__.py
│   │   │   │       └── pipeline_wrapper.py
│   │   │   ├── loss/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── loss_1d.py
│   │   │   │   ├── loss_2d.py
│   │   │   │   ├── loss_2p5d.py
│   │   │   │   └── loss_3d.py
│   │   │   ├── metric/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _utils.py
│   │   │   │   ├── accuracy_2d.py
│   │   │   │   ├── accuracy_2p5d.py
│   │   │   │   └── accuracy_3d.py
│   │   │   └── parallel/
│   │   │       ├── __init__.py
│   │   │       ├── data_parallel.py
│   │   │       ├── layers/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── cache_embedding/
│   │   │       │   │   ├── __init__.py
│   │   │       │   │   ├── base_embedding.py
│   │   │       │   │   ├── cache_mgr.py
│   │   │       │   │   ├── cached_embedding.py
│   │   │       │   │   ├── copyer.py
│   │   │       │   │   ├── embedding_config.py
│   │   │       │   │   ├── parallel_cached_embedding.py
│   │   │       │   │   ├── parallel_cached_embedding_tablewise.py
│   │   │       │   │   └── parallel_cached_embedding_tablewise_split_cache.py
│   │   │       │   ├── colo_module.py
│   │   │       │   ├── embedding.py
│   │   │       │   ├── linear.py
│   │   │       │   └── module_utils.py
│   │   │       └── reducer.py
│   │   ├── pipeline/
│   │   │   ├── __init__.py
│   │   │   ├── layer_spec.py
│   │   │   ├── middleware/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── adaptor/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── fx.py
│   │   │   │   └── topo.py
│   │   │   ├── pipelinable.py
│   │   │   ├── pipeline_process_group.py
│   │   │   ├── rpc/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _pipeline_base.py
│   │   │   │   ├── _pipeline_schedule.py
│   │   │   │   └── utils.py
│   │   │   └── utils.py
│   │   ├── registry/
│   │   │   ├── __init__.py
│   │   │   └── registry.py
│   │   ├── tensor/
│   │   │   ├── __init__.py
│   │   │   ├── compute_spec.py
│   │   │   ├── const.py
│   │   │   ├── dist_spec_mgr.py
│   │   │   ├── distspec.py
│   │   │   ├── op_wrapper.py
│   │   │   ├── process_group.py
│   │   │   └── tensor_spec.py
│   │   ├── trainer/
│   │   │   ├── __init__.py
│   │   │   ├── _trainer.py
│   │   │   └── hooks/
│   │   │       ├── __init__.py
│   │   │       ├── _base_hook.py
│   │   │       ├── _checkpoint_hook.py
│   │   │       ├── _commons_.py
│   │   │       ├── _log_hook.py
│   │   │       ├── _lr_scheduler_hook.py
│   │   │       └── _metric_hook.py
│   │   ├── utils/
│   │   │   ├── __init__.py
│   │   │   ├── activation_checkpoint.py
│   │   │   ├── checkpoint/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── module_checkpoint.py
│   │   │   │   └── utils.py
│   │   │   ├── checkpointing.py
│   │   │   ├── common.py
│   │   │   ├── data_sampler/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base_sampler.py
│   │   │   │   └── data_parallel_sampler.py
│   │   │   ├── memory.py
│   │   │   └── profiler/
│   │   │       ├── __init__.py
│   │   │       ├── extention.py
│   │   │       ├── legacy/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── comm_profiler.py
│   │   │       │   ├── pcie_profiler.py
│   │   │       │   └── prof_utils.py
│   │   │       ├── profiler.py
│   │   │       └── stateful_tensor_mem_extention.py
│   │   └── zero/
│   │       ├── __init__.py
│   │       ├── gemini/
│   │       │   ├── __init__.py
│   │       │   ├── colo_init_context.py
│   │       │   ├── gemini_context.py
│   │       │   ├── ophooks/
│   │       │   │   ├── __init__.py
│   │       │   │   ├── _shard_grad_ophook.py
│   │       │   │   ├── _shard_param_ophook.py
│   │       │   │   ├── runtime_mem_tracer_hook.py
│   │       │   │   └── utils.py
│   │       │   ├── paramhooks/
│   │       │   │   ├── __init__.py
│   │       │   │   └── _param_hookmgr.py
│   │       │   ├── stateful_tensor.py
│   │       │   ├── stateful_tensor_mgr.py
│   │       │   ├── tensor_placement_policy.py
│   │       │   └── tensor_utils.py
│   │       ├── init_ctx/
│   │       │   ├── __init__.py
│   │       │   └── init_context.py
│   │       ├── shard_utils/
│   │       │   ├── __init__.py
│   │       │   ├── base_shard_strategy.py
│   │       │   ├── bucket_tensor_shard_strategy.py
│   │       │   ├── commons.py
│   │       │   └── tensor_shard_strategy.py
│   │       ├── sharded_model/
│   │       │   ├── __init__.py
│   │       │   ├── _utils.py
│   │       │   ├── reduce_scatter.py
│   │       │   ├── sharded_model_v2.py
│   │       │   ├── utils.py
│   │       │   └── zero_hook.py
│   │       ├── sharded_optim/
│   │       │   ├── __init__.py
│   │       │   └── sharded_optim_v2.py
│   │       └── sharded_param/
│   │           ├── __init__.py
│   │           ├── sharded_param.py
│   │           └── sharded_tensor.py
│   ├── logging/
│   │   ├── __init__.py
│   │   └── logger.py
│   ├── moe/
│   │   ├── __init__.py
│   │   └── _operation.py
│   ├── nn/
│   │   ├── __init__.py
│   │   ├── init.py
│   │   ├── layer/
│   │   │   ├── __init__.py
│   │   │   ├── layernorm.py
│   │   │   ├── scaled_softmax.py
│   │   │   └── utils.py
│   │   ├── loss/
│   │   │   └── __init__.py
│   │   ├── lr_scheduler/
│   │   │   ├── __init__.py
│   │   │   ├── cosine.py
│   │   │   ├── delayed.py
│   │   │   ├── linear.py
│   │   │   ├── multistep.py
│   │   │   ├── onecycle.py
│   │   │   ├── poly.py
│   │   │   └── torch.py
│   │   └── optimizer/
│   │       ├── README.md
│   │       ├── __init__.py
│   │       ├── adafactor.py
│   │       ├── came.py
│   │       ├── cpu_adam.py
│   │       ├── distributed_adafactor.py
│   │       ├── distributed_came.py
│   │       ├── distributed_galore.py
│   │       ├── distributed_lamb.py
│   │       ├── fused_adam.py
│   │       ├── fused_lamb.py
│   │       ├── fused_sgd.py
│   │       ├── galore.py
│   │       ├── hybrid_adam.py
│   │       ├── lamb.py
│   │       ├── lars.py
│   │       └── nvme_optimizer.py
│   ├── pipeline/
│   │   ├── __init__.py
│   │   ├── p2p.py
│   │   ├── schedule/
│   │   │   ├── __init__.py
│   │   │   ├── _utils.py
│   │   │   ├── base.py
│   │   │   ├── generate.py
│   │   │   ├── interleaved_pp.py
│   │   │   ├── one_f_one_b.py
│   │   │   ├── v_schedule.py
│   │   │   └── zero_bubble_pp.py
│   │   ├── stage_manager.py
│   │   └── weight_grad_store.py
│   ├── quantization/
│   │   ├── __init__.py
│   │   ├── bnb.py
│   │   ├── bnb_config.py
│   │   ├── fp8.py
│   │   ├── fp8_config.py
│   │   ├── fp8_hook.py
│   │   └── utils.py
│   ├── shardformer/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── _utils.py
│   │   ├── examples/
│   │   │   ├── convergence_benchmark.py
│   │   │   ├── convergence_benchmark.sh
│   │   │   ├── data.py
│   │   │   └── performance_benchmark.py
│   │   ├── layer/
│   │   │   ├── __init__.py
│   │   │   ├── _operation.py
│   │   │   ├── attn.py
│   │   │   ├── dropout.py
│   │   │   ├── embedding.py
│   │   │   ├── linear.py
│   │   │   ├── loss.py
│   │   │   ├── normalization.py
│   │   │   ├── parallel_module.py
│   │   │   ├── qkv_fused_linear.py
│   │   │   └── utils.py
│   │   ├── modeling/
│   │   │   ├── __init__.py
│   │   │   ├── bert.py
│   │   │   ├── blip2.py
│   │   │   ├── bloom.py
│   │   │   ├── chatglm2.py
│   │   │   ├── chatglm2_6b/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── configuration_chatglm.py
│   │   │   │   └── modeling_chatglm.py
│   │   │   ├── command.py
│   │   │   ├── deepseek.py
│   │   │   ├── deepseek_v3.py
│   │   │   ├── falcon.py
│   │   │   ├── gpt2.py
│   │   │   ├── gptj.py
│   │   │   ├── jit.py
│   │   │   ├── llama.py
│   │   │   ├── mistral.py
│   │   │   ├── mixtral.py
│   │   │   ├── opt.py
│   │   │   ├── qwen2.py
│   │   │   ├── qwen3.py
│   │   │   ├── sam.py
│   │   │   ├── t5.py
│   │   │   ├── vit.py
│   │   │   └── whisper.py
│   │   ├── policies/
│   │   │   ├── __init__.py
│   │   │   ├── auto_policy.py
│   │   │   ├── base_policy.py
│   │   │   ├── bert.py
│   │   │   ├── blip2.py
│   │   │   ├── bloom.py
│   │   │   ├── chatglm2.py
│   │   │   ├── command.py
│   │   │   ├── deepseek.py
│   │   │   ├── deepseek_v3.py
│   │   │   ├── falcon.py
│   │   │   ├── gpt2.py
│   │   │   ├── gptj.py
│   │   │   ├── llama.py
│   │   │   ├── mistral.py
│   │   │   ├── mixtral.py
│   │   │   ├── opt.py
│   │   │   ├── qwen2.py
│   │   │   ├── qwen3.py
│   │   │   ├── sam.py
│   │   │   ├── t5.py
│   │   │   ├── vit.py
│   │   │   └── whisper.py
│   │   └── shard/
│   │       ├── __init__.py
│   │       ├── grad_ckpt_config.py
│   │       ├── shard_config.py
│   │       ├── sharder.py
│   │       ├── shardformer.py
│   │       └── utils.py
│   ├── tensor/
│   │   ├── __init__.py
│   │   ├── colo_parameter.py
│   │   ├── colo_tensor.py
│   │   ├── comm_spec.py
│   │   ├── d_tensor/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── api.py
│   │   │   ├── comm_spec.py
│   │   │   ├── layout.py
│   │   │   ├── layout_converter.py
│   │   │   ├── misc.py
│   │   │   ├── sharding_spec.py
│   │   │   └── utils.py
│   │   ├── moe_tensor/
│   │   │   ├── __init__.py
│   │   │   ├── api.py
│   │   │   └── moe_info.py
│   │   ├── padded_tensor/
│   │   │   ├── __init__.py
│   │   │   └── api.py
│   │   ├── param_op_hook.py
│   │   ├── shape_consistency.py
│   │   ├── sharding_spec.py
│   │   └── utils.py
│   ├── testing/
│   │   ├── __init__.py
│   │   ├── comparison.py
│   │   ├── pytest_wrapper.py
│   │   ├── random.py
│   │   └── utils.py
│   ├── utils/
│   │   ├── __init__.py
│   │   ├── common.py
│   │   ├── memory.py
│   │   ├── model/
│   │   │   ├── __init__.py
│   │   │   └── utils.py
│   │   ├── multi_tensor_apply/
│   │   │   ├── __init__.py
│   │   │   └── multi_tensor_apply.py
│   │   ├── rank_recorder/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   └── rank_recorder.py
│   │   ├── safetensors.py
│   │   ├── tensor_detector/
│   │   │   ├── __init__.py
│   │   │   ├── readme.md
│   │   │   └── tensor_detector.py
│   │   └── timer.py
│   └── zero/
│       ├── __init__.py
│       ├── gemini/
│       │   ├── __init__.py
│       │   ├── chunk/
│       │   │   ├── __init__.py
│       │   │   ├── chunk.py
│       │   │   ├── manager.py
│       │   │   ├── search_utils.py
│       │   │   └── utils.py
│       │   ├── gemini_ddp.py
│       │   ├── gemini_hook.py
│       │   ├── gemini_mgr.py
│       │   ├── gemini_optimizer.py
│       │   ├── memory_tracer/
│       │   │   ├── __init__.py
│       │   │   ├── chunk_memstats_collector.py
│       │   │   ├── memory_monitor.py
│       │   │   ├── memory_stats.py
│       │   │   ├── memstats_collector.py
│       │   │   ├── param_runtime_order.py
│       │   │   ├── runtime_mem_tracer.py
│       │   │   ├── static_memstats_collector.py
│       │   │   └── utils.py
│       │   ├── placement_policy.py
│       │   └── utils.py
│       ├── low_level/
│       │   ├── __init__.py
│       │   ├── _utils.py
│       │   ├── bookkeeping/
│       │   │   ├── __init__.py
│       │   │   ├── base_store.py
│       │   │   ├── bucket_store.py
│       │   │   ├── gradient_store.py
│       │   │   └── tensor_bucket.py
│       │   ├── low_level_optim.py
│       │   ├── readme.md
│       │   └── zero_hook.py
│       └── wrapper.py
├── docker/
│   └── Dockerfile
├── docs/
│   ├── README-zh-Hans.md
│   ├── README.md
│   ├── REFERENCE.md
│   ├── conda-doc-test-deps.yml
│   ├── requirements-doc-test.txt
│   ├── sidebars.json
│   ├── source/
│   │   ├── en/
│   │   │   ├── Colossal-Auto/
│   │   │   │   ├── feature/
│   │   │   │   │   ├── auto_checkpoint.md
│   │   │   │   │   ├── device_mesh.md
│   │   │   │   │   ├── layout_converting_management.md
│   │   │   │   │   └── tracer.md
│   │   │   │   └── get_started/
│   │   │   │       ├── installation.md
│   │   │   │       ├── introduction.md
│   │   │   │       └── run_demo.md
│   │   │   ├── advanced_tutorials/
│   │   │   │   ├── integrate_mixture_of_experts_into_your_model.md
│   │   │   │   ├── meet_gemini.md
│   │   │   │   ├── opt_service.md
│   │   │   │   ├── train_gpt_using_hybrid_parallelism.md
│   │   │   │   └── train_vit_with_hybrid_parallelism.md
│   │   │   ├── basics/
│   │   │   │   ├── booster_api.md
│   │   │   │   ├── booster_checkpoint.md
│   │   │   │   ├── booster_plugins.md
│   │   │   │   ├── command_line_tool.md
│   │   │   │   └── launch_colossalai.md
│   │   │   ├── concepts/
│   │   │   │   ├── colossalai_overview.md
│   │   │   │   ├── distributed_training.md
│   │   │   │   └── paradigms_of_parallelism.md
│   │   │   ├── features/
│   │   │   │   ├── 1D_tensor_parallel.md
│   │   │   │   ├── 2D_tensor_parallel.md
│   │   │   │   ├── 2p5D_tensor_parallel.md
│   │   │   │   ├── 3D_tensor_parallel.md
│   │   │   │   ├── cluster_utils.md
│   │   │   │   ├── distributed_optimizers.md
│   │   │   │   ├── gradient_accumulation_with_booster.md
│   │   │   │   ├── gradient_clipping_with_booster.md
│   │   │   │   ├── lazy_init.md
│   │   │   │   ├── mixed_precision_training_with_booster.md
│   │   │   │   ├── nvme_offload.md
│   │   │   │   ├── pipeline_parallel.md
│   │   │   │   ├── sequence_parallelism.md
│   │   │   │   ├── shardformer.md
│   │   │   │   ├── zero_with_chunk.md
│   │   │   │   └── zerobubble_pipeline_parallelism.md
│   │   │   ├── get_started/
│   │   │   │   ├── bonus.md
│   │   │   │   ├── installation.md
│   │   │   │   ├── reading_roadmap.md
│   │   │   │   └── run_demo.md
│   │   │   └── sidebar_category_translation.json
│   │   └── zh-Hans/
│   │       ├── Colossal-Auto/
│   │       │   ├── feature/
│   │       │   │   ├── auto_checkpoint.md
│   │       │   │   ├── device_mesh.md
│   │       │   │   ├── layout_converting_management.md
│   │       │   │   └── tracer.md
│   │       │   └── get_started/
│   │       │       ├── installation.md
│   │       │       ├── introduction.md
│   │       │       └── run_demo.md
│   │       ├── advanced_tutorials/
│   │       │   ├── integrate_mixture_of_experts_into_your_model.md
│   │       │   ├── meet_gemini.md
│   │       │   ├── opt_service.md
│   │       │   ├── train_gpt_using_hybrid_parallelism.md
│   │       │   └── train_vit_with_hybrid_parallelism.md
│   │       ├── basics/
│   │       │   ├── booster_api.md
│   │       │   ├── booster_checkpoint.md
│   │       │   ├── booster_plugins.md
│   │       │   ├── command_line_tool.md
│   │       │   └── launch_colossalai.md
│   │       ├── concepts/
│   │       │   ├── colossalai_overview.md
│   │       │   ├── distributed_training.md
│   │       │   └── paradigms_of_parallelism.md
│   │       ├── features/
│   │       │   ├── 1D_tensor_parallel.md
│   │       │   ├── 2D_tensor_parallel.md
│   │       │   ├── 2p5D_tensor_parallel.md
│   │       │   ├── 3D_tensor_parallel.md
│   │       │   ├── cluster_utils.md
│   │       │   ├── distributed_optimizers.md
│   │       │   ├── gradient_accumulation_with_booster.md
│   │       │   ├── gradient_clipping_with_booster.md
│   │       │   ├── lazy_init.md
│   │       │   ├── mixed_precision_training_with_booster.md
│   │       │   ├── nvme_offload.md
│   │       │   ├── pipeline_parallel.md
│   │       │   ├── sequence_parallelism.md
│   │       │   ├── shardformer.md
│   │       │   ├── zero_with_chunk.md
│   │       │   └── zerobubble_pipeline_parallelism.md
│   │       ├── get_started/
│   │       │   ├── bonus.md
│   │       │   ├── installation.md
│   │       │   ├── reading_roadmap.md
│   │       │   └── run_demo.md
│   │       └── sidebar_category_translation.json
│   └── versions.json
├── examples/
│   ├── README.md
│   ├── __init__.py
│   ├── community/
│   │   ├── README.md
│   │   ├── fp8/
│   │   │   └── mnist/
│   │   │       ├── README.md
│   │   │       └── main.py
│   │   └── roberta/
│   │       ├── README.md
│   │       ├── preprocessing/
│   │       │   ├── Makefile
│   │       │   ├── README.md
│   │       │   ├── get_mask.py
│   │       │   ├── mask.cpp
│   │       │   ├── sentence_split.py
│   │       │   └── tokenize_mask.py
│   │       ├── pretraining/
│   │       │   ├── README.md
│   │       │   ├── arguments.py
│   │       │   ├── bert_dataset_provider.py
│   │       │   ├── evaluation.py
│   │       │   ├── hostfile
│   │       │   ├── loss.py
│   │       │   ├── model/
│   │       │   │   ├── bert.py
│   │       │   │   └── deberta_v2.py
│   │       │   ├── nvidia_bert_dataset_provider.py
│   │       │   ├── pretrain_utils.py
│   │       │   ├── run_pretrain.sh
│   │       │   ├── run_pretrain_resume.sh
│   │       │   ├── run_pretraining.py
│   │       │   └── utils/
│   │       │       ├── WandbLog.py
│   │       │       ├── exp_util.py
│   │       │       ├── global_vars.py
│   │       │       └── logger.py
│   │       ├── requirements.txt
│   │       └── test_ci.sh
│   ├── images/
│   │   ├── diffusion/
│   │   │   ├── LICENSE
│   │   │   ├── README.md
│   │   │   ├── configs/
│   │   │   │   ├── Inference/
│   │   │   │   │   ├── v2-inference-v.yaml
│   │   │   │   │   ├── v2-inference.yaml
│   │   │   │   │   ├── v2-inpainting-inference.yaml
│   │   │   │   │   ├── v2-midas-inference.yaml
│   │   │   │   │   └── x4-upscaling.yaml
│   │   │   │   ├── Teyvat/
│   │   │   │   │   ├── README.md
│   │   │   │   │   └── train_colossalai_teyvat.yaml
│   │   │   │   ├── train_colossalai.yaml
│   │   │   │   ├── train_colossalai_cifar10.yaml
│   │   │   │   └── train_ddp.yaml
│   │   │   ├── docker/
│   │   │   │   └── Dockerfile
│   │   │   ├── environment.yaml
│   │   │   ├── ldm/
│   │   │   │   ├── data/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cifar10.py
│   │   │   │   │   ├── imagenet.py
│   │   │   │   │   ├── lsun.py
│   │   │   │   │   └── teyvat.py
│   │   │   │   ├── lr_scheduler.py
│   │   │   │   ├── models/
│   │   │   │   │   ├── autoencoder.py
│   │   │   │   │   └── diffusion/
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── classifier.py
│   │   │   │   │       ├── ddim.py
│   │   │   │   │       ├── ddpm.py
│   │   │   │   │       ├── dpm_solver/
│   │   │   │   │       │   ├── __init__.py
│   │   │   │   │       │   ├── dpm_solver.py
│   │   │   │   │       │   └── sampler.py
│   │   │   │   │       ├── plms.py
│   │   │   │   │       └── sampling_util.py
│   │   │   │   ├── modules/
│   │   │   │   │   ├── attention.py
│   │   │   │   │   ├── diffusionmodules/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── model.py
│   │   │   │   │   │   ├── openaimodel.py
│   │   │   │   │   │   ├── upscaling.py
│   │   │   │   │   │   └── util.py
│   │   │   │   │   ├── distributions/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── distributions.py
│   │   │   │   │   ├── ema.py
│   │   │   │   │   ├── encoders/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── modules.py
│   │   │   │   │   ├── image_degradation/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── bsrgan.py
│   │   │   │   │   │   ├── bsrgan_light.py
│   │   │   │   │   │   └── utils_image.py
│   │   │   │   │   └── midas/
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── api.py
│   │   │   │   │       ├── midas/
│   │   │   │   │       │   ├── __init__.py
│   │   │   │   │       │   ├── base_model.py
│   │   │   │   │       │   ├── blocks.py
│   │   │   │   │       │   ├── dpt_depth.py
│   │   │   │   │       │   ├── midas_net.py
│   │   │   │   │       │   ├── midas_net_custom.py
│   │   │   │   │       │   ├── transforms.py
│   │   │   │   │       │   └── vit.py
│   │   │   │   │       └── utils.py
│   │   │   │   └── util.py
│   │   │   ├── main.py
│   │   │   ├── requirements.txt
│   │   │   ├── scripts/
│   │   │   │   ├── download_first_stages.sh
│   │   │   │   ├── download_models.sh
│   │   │   │   ├── img2img.py
│   │   │   │   ├── inpaint.py
│   │   │   │   ├── knn2img.py
│   │   │   │   ├── sample_diffusion.py
│   │   │   │   ├── tests/
│   │   │   │   │   ├── test_checkpoint.py
│   │   │   │   │   └── test_watermark.py
│   │   │   │   ├── train_searcher.py
│   │   │   │   ├── txt2img.py
│   │   │   │   ├── txt2img.sh
│   │   │   │   └── utils.py
│   │   │   ├── setup.py
│   │   │   ├── test_ci.sh
│   │   │   ├── train_colossalai.sh
│   │   │   └── train_ddp.sh
│   │   ├── dreambooth/
│   │   │   ├── README.md
│   │   │   ├── colossalai.sh
│   │   │   ├── debug.py
│   │   │   ├── dreambooth.sh
│   │   │   ├── inference.py
│   │   │   ├── requirements.txt
│   │   │   ├── test_ci.sh
│   │   │   ├── train_dreambooth.py
│   │   │   ├── train_dreambooth_colossalai.py
│   │   │   ├── train_dreambooth_colossalai_lora.py
│   │   │   └── train_dreambooth_inpaint.py
│   │   ├── resnet/
│   │   │   ├── .gitignore
│   │   │   ├── README.md
│   │   │   ├── eval.py
│   │   │   ├── requirements.txt
│   │   │   ├── test_ci.sh
│   │   │   └── train.py
│   │   └── vit/
│   │       ├── README.md
│   │       ├── args.py
│   │       ├── data.py
│   │       ├── requirements.txt
│   │       ├── run_benchmark.sh
│   │       ├── run_demo.sh
│   │       ├── test_ci.sh
│   │       ├── vit_benchmark.py
│   │       └── vit_train_demo.py
│   ├── inference/
│   │   ├── benchmark_ops/
│   │   │   ├── benchmark_context_attn_unpad.py
│   │   │   ├── benchmark_decoding_attn.py
│   │   │   ├── benchmark_flash_decoding_attention.py
│   │   │   ├── benchmark_fused_rotary_embdding_unpad.py
│   │   │   ├── benchmark_kv_cache_memcopy.py
│   │   │   ├── benchmark_rmsnorm.py
│   │   │   ├── benchmark_rotary_embedding.py
│   │   │   ├── benchmark_xine_copy.py
│   │   │   └── test_ci.sh
│   │   ├── client/
│   │   │   ├── locustfile.py
│   │   │   ├── run_locust.sh
│   │   │   └── test_ci.sh
│   │   ├── llama/
│   │   │   ├── README.md
│   │   │   ├── benchmark_llama.py
│   │   │   ├── benchmark_llama3.py
│   │   │   ├── llama_generation.py
│   │   │   ├── run_benchmark.sh
│   │   │   └── test_ci.sh
│   │   └── stable_diffusion/
│   │       ├── README.md
│   │       ├── benchmark_sd3.py
│   │       ├── compute_metric.py
│   │       ├── requirements.txt
│   │       ├── run_benchmark.sh
│   │       ├── sd3_generation.py
│   │       └── test_ci.sh
│   ├── language/
│   │   ├── __init__.py
│   │   ├── bert/
│   │   │   ├── README.md
│   │   │   ├── benchmark.py
│   │   │   ├── benchmark.sh
│   │   │   ├── benchmark_utils.py
│   │   │   ├── data.py
│   │   │   ├── finetune.py
│   │   │   ├── requirements.txt
│   │   │   └── test_ci.sh
│   │   ├── commons/
│   │   │   └── utils.py
│   │   ├── data_utils.py
│   │   ├── deepseek/
│   │   │   ├── benchmark.py
│   │   │   └── test_ci.sh
│   │   ├── gpt/
│   │   │   ├── README.md
│   │   │   ├── experiments/
│   │   │   │   ├── auto_offload/
│   │   │   │   │   ├── README.md
│   │   │   │   │   ├── model_zoo.py
│   │   │   │   │   ├── requirements.txt
│   │   │   │   │   ├── run.sh
│   │   │   │   │   └── train_gpt_offload.py
│   │   │   │   ├── auto_parallel/
│   │   │   │   │   ├── README.md
│   │   │   │   │   ├── auto_parallel_with_gpt.py
│   │   │   │   │   ├── gpt_modules.py
│   │   │   │   │   └── requirements.txt
│   │   │   │   └── pipeline_parallel/
│   │   │   │       ├── README.md
│   │   │   │       ├── model_zoo.py
│   │   │   │       ├── requirements.txt
│   │   │   │       ├── run.sh
│   │   │   │       └── train_gpt_pp.py
│   │   │   ├── gemini/
│   │   │   │   ├── benchmark_gemini.sh
│   │   │   │   ├── commons/
│   │   │   │   │   ├── model_zoo.py
│   │   │   │   │   └── utils.py
│   │   │   │   ├── requirements.txt
│   │   │   │   ├── run_gemini.sh
│   │   │   │   ├── test_ci.sh
│   │   │   │   └── train_gpt_demo.py
│   │   │   ├── hybridparallelism/
│   │   │   │   ├── benchmark.py
│   │   │   │   ├── data.py
│   │   │   │   ├── finetune.py
│   │   │   │   └── run.sh
│   │   │   ├── requirements.txt
│   │   │   ├── test_ci.sh
│   │   │   └── titans/
│   │   │       ├── LICENSE
│   │   │       ├── README.md
│   │   │       ├── configs/
│   │   │       │   ├── gpt2_small_zero3_pp1d.py
│   │   │       │   └── gpt3_zero3_pp1d.py
│   │   │       ├── dataset/
│   │   │       │   └── webtext.py
│   │   │       ├── model/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── embed.py
│   │   │       │   ├── gpt1d.py
│   │   │       │   └── pipeline_gpt1d.py
│   │   │       ├── requirements.txt
│   │   │       ├── run.sh
│   │   │       ├── test_ci.sh
│   │   │       └── train_gpt.py
│   │   ├── grok-1/
│   │   │   ├── README.md
│   │   │   ├── grok1_policy.py
│   │   │   ├── inference.py
│   │   │   ├── inference_tp.py
│   │   │   ├── requirements.txt
│   │   │   ├── run_inference_fast.sh
│   │   │   ├── run_inference_slow.sh
│   │   │   ├── test_ci.sh
│   │   │   └── utils.py
│   │   ├── llama/
│   │   │   ├── README.md
│   │   │   ├── benchmark.py
│   │   │   ├── requirements.txt
│   │   │   ├── scripts/
│   │   │   │   ├── benchmark_70B/
│   │   │   │   │   ├── 3d.sh
│   │   │   │   │   ├── gemini.sh
│   │   │   │   │   └── gemini_auto.sh
│   │   │   │   └── benchmark_7B/
│   │   │   │       ├── gemini.sh
│   │   │   │       └── gemini_auto.sh
│   │   │   └── test_ci.sh
│   │   ├── mixtral/
│   │   │   ├── benchmark.py
│   │   │   └── test_ci.sh
│   │   ├── model_utils.py
│   │   ├── opt/
│   │   │   ├── README.md
│   │   │   ├── args.py
│   │   │   ├── data.py
│   │   │   ├── opt_benchmark.py
│   │   │   ├── opt_train_demo.py
│   │   │   ├── requirements.txt
│   │   │   ├── run_benchmark.sh
│   │   │   ├── run_demo.sh
│   │   │   └── test_ci.sh
│   │   ├── palm/
│   │   │   ├── README.md
│   │   │   ├── data/
│   │   │   │   └── README.md
│   │   │   ├── palm_pytorch/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── autoregressive_wrapper.py
│   │   │   │   └── palm_pytorch.py
│   │   │   ├── requirements.txt
│   │   │   ├── run.sh
│   │   │   ├── test_ci.sh
│   │   │   └── train.py
│   │   └── performance_evaluator.py
│   └── tutorial/
│       ├── .gitignore
│       ├── README.md
│       ├── auto_parallel/
│       │   ├── README.md
│       │   ├── auto_ckpt_batchsize_test.py
│       │   ├── auto_ckpt_solver_test.py
│       │   ├── auto_parallel_with_resnet.py
│       │   ├── bench_utils.py
│       │   ├── config.py
│       │   ├── requirements.txt
│       │   ├── setup.py
│       │   └── test_ci.sh
│       ├── download_cifar10.py
│       ├── fastfold/
│       │   └── README.md
│       ├── hybrid_parallel/
│       │   ├── README.md
│       │   ├── config.py
│       │   ├── requirements.txt
│       │   ├── test_ci.sh
│       │   └── train.py
│       ├── large_batch_optimizer/
│       │   ├── README.md
│       │   ├── config.py
│       │   ├── requirements.txt
│       │   ├── test_ci.sh
│       │   └── train.py
│       ├── new_api/
│       │   ├── README.md
│       │   ├── cifar_resnet/
│       │   │   ├── .gitignore
│       │   │   ├── README.md
│       │   │   ├── eval.py
│       │   │   ├── requirements.txt
│       │   │   ├── test_ci.sh
│       │   │   └── train.py
│       │   ├── cifar_vit/
│       │   │   ├── README.md
│       │   │   ├── requirements.txt
│       │   │   ├── test_ci.sh
│       │   │   └── train.py
│       │   ├── glue_bert/
│       │   │   ├── README.md
│       │   │   ├── data.py
│       │   │   ├── finetune.py
│       │   │   ├── requirements.txt
│       │   │   └── test_ci.sh
│       │   └── test_ci.sh
│       ├── opt/
│       │   ├── inference/
│       │   │   ├── README.md
│       │   │   ├── batch.py
│       │   │   ├── benchmark/
│       │   │   │   └── locustfile.py
│       │   │   ├── cache.py
│       │   │   ├── opt_fastapi.py
│       │   │   ├── opt_server.py
│       │   │   ├── requirements.txt
│       │   │   └── script/
│       │   │       ├── process-opt-175b/
│       │   │       │   ├── README.md
│       │   │       │   ├── convert_ckpt.py
│       │   │       │   ├── flat-meta.json
│       │   │       │   └── unflat.sh
│       │   │       └── processing_ckpt_66b.py
│       │   ├── opt/
│       │   │   ├── README.md
│       │   │   ├── benchmark.sh
│       │   │   ├── colossalai_zero.py
│       │   │   ├── context.py
│       │   │   ├── requirements.txt
│       │   │   ├── run_clm.py
│       │   │   ├── run_clm.sh
│       │   │   ├── run_clm_synthetic.sh
│       │   │   └── test_ci.sh
│       │   └── test_ci.sh
│       ├── requirements.txt
│       └── sequence_parallel/
│           ├── README.md
│           ├── config.py
│           ├── data/
│           │   ├── __init__.py
│           │   ├── bert_helper.py
│           │   ├── datasets/
│           │   │   ├── Makefile
│           │   │   ├── __init__.py
│           │   │   ├── bert_dataset.py
│           │   │   ├── blendable_dataset.py
│           │   │   ├── builder.py
│           │   │   ├── data_samplers.py
│           │   │   ├── dataset_utils.py
│           │   │   ├── helpers.cpp
│           │   │   ├── ict_dataset.py
│           │   │   ├── indexed_dataset.py
│           │   │   └── test/
│           │   │       ├── test_indexed_dataset.py
│           │   │       └── test_preprocess_data.sh
│           │   ├── dummy_dataloader.py
│           │   └── tokenizer/
│           │       ├── __init__.py
│           │       ├── bert_tokenization.py
│           │       └── tokenizer.py
│           ├── loss_func/
│           │   ├── __init__.py
│           │   ├── bert_loss.py
│           │   ├── cross_entropy.py
│           │   └── utils.py
│           ├── lr_scheduler/
│           │   ├── __init__.py
│           │   └── annealing_lr.py
│           ├── model/
│           │   ├── __init__.py
│           │   ├── bert.py
│           │   └── layers/
│           │       ├── __init__.py
│           │       ├── bert_layer.py
│           │       ├── dropout.py
│           │       ├── embedding.py
│           │       ├── head.py
│           │       ├── init_method.py
│           │       ├── linear.py
│           │       ├── mlp.py
│           │       ├── pooler.py
│           │       └── preprocess.py
│           ├── requirements.txt
│           ├── test_ci.sh
│           └── train.py
├── extensions/
│   ├── README.md
│   ├── __init__.py
│   ├── base_extension.py
│   ├── cpp_extension.py
│   ├── csrc/
│   │   ├── __init__.py
│   │   ├── common/
│   │   │   ├── data_type.h
│   │   │   ├── micros.h
│   │   │   ├── mp_type_traits.h
│   │   │   ├── target.h
│   │   │   └── vec_type_traits.h
│   │   ├── funcs/
│   │   │   ├── binary_functor.h
│   │   │   ├── cast_functor.h
│   │   │   ├── reduce_function.h
│   │   │   ├── ternary_functor.h
│   │   │   └── unary_functor.h
│   │   └── kernel/
│   │       ├── arm/
│   │       │   ├── cpu_adam_arm.cpp
│   │       │   └── cpu_adam_arm.h
│   │       ├── cuda/
│   │       │   ├── activation_kernel.cu
│   │       │   ├── attention/
│   │       │   │   └── attention_utils.h
│   │       │   ├── context_kv_cache_memcpy_kernel.cu
│   │       │   ├── convert_fp8_kernel.cu
│   │       │   ├── decode_kv_cache_memcpy_kernel.cu
│   │       │   ├── flash_decoding_attention_kernel.cu
│   │       │   ├── fused_rotary_emb_and_cache_kernel.cu
│   │       │   ├── get_cos_and_sin_kernel.cu
│   │       │   ├── layer_norm_kernel.cu
│   │       │   ├── moe_kernel.cu
│   │       │   ├── multi_tensor_adam_kernel.cu
│   │       │   ├── multi_tensor_apply.cuh
│   │       │   ├── multi_tensor_l2norm_kernel.cu
│   │       │   ├── multi_tensor_lamb_kernel.cu
│   │       │   ├── multi_tensor_scale_kernel.cu
│   │       │   ├── multi_tensor_sgd_kernel.cu
│   │       │   ├── rms_layernorm_kernel.cu
│   │       │   ├── scaled_masked_softmax_kernel.cu
│   │       │   ├── scaled_upper_triang_masked_softmax_kernel.cu
│   │       │   └── utils/
│   │       │       ├── gpu_launch_config.h
│   │       │       ├── micros.h
│   │       │       ├── nvgpu_dev_info.h
│   │       │       └── vec_copy.h
│   │       └── x86/
│   │           ├── cpu_adam.cpp
│   │           └── cpu_adam.h
│   ├── cuda_extension.py
│   ├── pybind/
│   │   ├── __init__.py
│   │   ├── cpu_adam/
│   │   │   ├── __init__.py
│   │   │   ├── cpu_adam_arm.py
│   │   │   └── cpu_adam_x86.py
│   │   ├── flash_attention/
│   │   │   ├── __init__.py
│   │   │   ├── flash_attention_dao_cuda.py
│   │   │   ├── flash_attention_npu.py
│   │   │   └── flash_attention_sdpa_cuda.py
│   │   ├── inference/
│   │   │   ├── __init__.py
│   │   │   ├── inference.cpp
│   │   │   └── inference_ops_cuda.py
│   │   ├── layernorm/
│   │   │   ├── __init__.py
│   │   │   ├── layer_norm.cpp
│   │   │   └── layernorm_cuda.py
│   │   ├── moe/
│   │   │   ├── __init__.py
│   │   │   ├── moe.cpp
│   │   │   └── moe_cuda.py
│   │   ├── optimizer/
│   │   │   ├── __init__.py
│   │   │   ├── fused_optimizer_cuda.py
│   │   │   └── optimizer.cpp
│   │   └── softmax/
│   │       ├── __init__.py
│   │       ├── scaled_masked_softmax.cpp
│   │       ├── scaled_masked_softmax_cuda.py
│   │       ├── scaled_upper_triang_masked_softmax.cpp
│   │       └── scaled_upper_triangle_masked_softmax_cuda.py
│   ├── triton_extension.py
│   └── utils.py
├── pytest.ini
├── requirements/
│   ├── requirements-test.txt
│   └── requirements.txt
├── setup.py
├── tests/
│   ├── __init__.py
│   ├── conftest.py
│   ├── kit/
│   │   ├── __init__.py
│   │   └── model_zoo/
│   │       ├── __init__.py
│   │       ├── custom/
│   │       │   ├── __init__.py
│   │       │   ├── base.py
│   │       │   ├── hanging_param_model.py
│   │       │   ├── nested_model.py
│   │       │   ├── repeated_computed_layers.py
│   │       │   ├── simple_mlp.py
│   │       │   └── simple_net.py
│   │       ├── diffusers/
│   │       │   ├── __init__.py
│   │       │   └── diffusers.py
│   │       ├── executor.py
│   │       ├── registry.py
│   │       ├── timm/
│   │       │   ├── __init__.py
│   │       │   └── timm.py
│   │       ├── torchaudio/
│   │       │   ├── __init__.py
│   │       │   └── torchaudio.py
│   │       ├── torchrec/
│   │       │   ├── __init__.py
│   │       │   └── torchrec.py
│   │       ├── torchvision/
│   │       │   ├── __init__.py
│   │       │   └── torchvision.py
│   │       └── transformers/
│   │           ├── __init__.py
│   │           ├── albert.py
│   │           ├── bert.py
│   │           ├── blip2.py
│   │           ├── bloom.py
│   │           ├── chatglm2.py
│   │           ├── command.py
│   │           ├── deepseek.py
│   │           ├── deepseek_v3.py
│   │           ├── falcon.py
│   │           ├── gpt.py
│   │           ├── gptj.py
│   │           ├── llama.py
│   │           ├── mistral.py
│   │           ├── mixtral.py
│   │           ├── opt.py
│   │           ├── qwen2.py
│   │           ├── qwen3.py
│   │           ├── sam.py
│   │           ├── t5.py
│   │           ├── vit.py
│   │           └── whisper.py
│   ├── test_analyzer/
│   │   ├── __init__.py
│   │   ├── test_fx/
│   │   │   ├── __init__.py
│   │   │   ├── test_bias_addition.py
│   │   │   ├── test_mod_dir.py
│   │   │   ├── test_nested_ckpt.py
│   │   │   ├── test_shape_prop.py
│   │   │   ├── test_symbolic_profile.py
│   │   │   └── zoo.py
│   │   └── test_subclasses/
│   │       ├── __init__.py
│   │       ├── test_aten.py
│   │       ├── test_flop_tensor.py
│   │       └── test_meta_mode.py
│   ├── test_auto_parallel/
│   │   ├── __init__.py
│   │   ├── test_ckpt_solvers/
│   │   │   ├── test_C_solver_consistency.py
│   │   │   ├── test_ckpt_torchvision.py
│   │   │   └── test_linearize.py
│   │   ├── test_offload/
│   │   │   ├── model_utils.py
│   │   │   ├── test_perf.py
│   │   │   └── test_solver.py
│   │   ├── test_pass/
│   │   │   ├── __init__.py
│   │   │   ├── test_node_converting_pass.py
│   │   │   └── test_size_value_converting_pass.py
│   │   └── test_tensor_shard/
│   │       ├── __init__.py
│   │       ├── test_bias_addition_forward.py
│   │       ├── test_broadcast.py
│   │       ├── test_checkpoint.py
│   │       ├── test_compatibility_with_ddp.py
│   │       ├── test_compatibility_with_gemini.py
│   │       ├── test_find_repeat_block.py
│   │       ├── test_gpt/
│   │       │   ├── __init__.py
│   │       │   ├── gpt_modules.py
│   │       │   ├── test_runtime_with_gpt_modules.py
│   │       │   └── test_solver_with_gpt_module.py
│   │       ├── test_liveness_analysis.py
│   │       ├── test_metainfo/
│   │       │   ├── test_activation_metainfo.py
│   │       │   ├── test_binary_elementwise_metainfo.py
│   │       │   ├── test_conv_metainfo.py
│   │       │   ├── test_embedding_metainfo.py
│   │       │   ├── test_linear_metainfo.py
│   │       │   ├── test_matmul_metainfo.py
│   │       │   ├── test_norm_metainfo.py
│   │       │   ├── test_pooling_metainfo.py
│   │       │   ├── test_tensor_metainfo.py
│   │       │   ├── test_where_metainfo.py
│   │       │   └── utils.py
│   │       ├── test_node_handler/
│   │       │   ├── __init__.py
│   │       │   ├── test_addbmm_handler.py
│   │       │   ├── test_addmm_handler.py
│   │       │   ├── test_batch_norm_handler.py
│   │       │   ├── test_bias_linear_function_node.py
│   │       │   ├── test_bias_linear_module_node.py
│   │       │   ├── test_binary_elementwise_handler.py
│   │       │   ├── test_bmm_handler.py
│   │       │   ├── test_conv_handler.py
│   │       │   ├── test_default_reshape_handler.py
│   │       │   ├── test_embedding_handler.py
│   │       │   ├── test_getattr_handler.py
│   │       │   ├── test_getitem_handler.py
│   │       │   ├── test_layer_norm_handler.py
│   │       │   ├── test_linear_handler.py
│   │       │   ├── test_matmul_handler.py
│   │       │   ├── test_norm_pooling_handler.py
│   │       │   ├── test_output_handler.py
│   │       │   ├── test_permute_and_transpose_handler.py
│   │       │   ├── test_placeholder_handler.py
│   │       │   ├── test_shard_option.py
│   │       │   ├── test_softmax_handler.py
│   │       │   ├── test_split_handler.py
│   │       │   ├── test_sum_handler.py
│   │       │   ├── test_tensor_constructor.py
│   │       │   ├── test_unary_element_wise_handler.py
│   │       │   ├── test_view_handler.py
│   │       │   ├── test_where_handler.py
│   │       │   └── utils.py
│   │       └── test_solver_with_resnet_v2.py
│   ├── test_autochunk/
│   │   ├── test_autochunk_alphafold/
│   │   │   ├── benchmark_autochunk_alphafold.py
│   │   │   ├── test_autochunk_alphafold_utils.py
│   │   │   ├── test_autochunk_evoformer_block.py
│   │   │   ├── test_autochunk_evoformer_stack.py
│   │   │   └── test_autochunk_extramsa_block.py
│   │   ├── test_autochunk_diffuser/
│   │   │   ├── benchmark_autochunk_diffuser.py
│   │   │   ├── test_autochunk_diffuser_utils.py
│   │   │   └── test_autochunk_unet.py
│   │   ├── test_autochunk_transformer/
│   │   │   ├── benchmark_autochunk_transformer.py
│   │   │   ├── test_autochunk_gpt.py
│   │   │   └── test_autochunk_transformer_utils.py
│   │   └── test_autochunk_vit/
│   │       ├── test_autochunk_vit.py
│   │       └── test_autochunk_vit_utils.py
│   ├── test_booster/
│   │   ├── test_accelerator.py
│   │   ├── test_mixed_precision/
│   │   │   └── test_fp16_torch.py
│   │   └── test_plugin/
│   │       ├── test_3d_plugin.py
│   │       ├── test_dp_plugin_base.py
│   │       ├── test_gemini_plugin.py
│   │       ├── test_low_level_zero_plugin.py
│   │       ├── test_torch_ddp_plugin.py
│   │       └── test_torch_fsdp_plugin.py
│   ├── test_checkpoint_io/
│   │   ├── test_gemini_checkpoint_io.py
│   │   ├── test_gemini_torch_compability.py
│   │   ├── test_general_checkpoint_io.py
│   │   ├── test_hybrid_parallel_plugin_checkpoint_io.py
│   │   ├── test_low_level_zero_checkpoint_io.py
│   │   ├── test_plugins_huggingface_compatibility.py
│   │   ├── test_safetensors_async_io.py
│   │   ├── test_torch_ddp_checkpoint_io.py
│   │   ├── test_torch_fsdp_checkpoint_io.py
│   │   └── utils.py
│   ├── test_cluster/
│   │   ├── test_device_mesh_manager.py
│   │   └── test_process_group_mesh.py
│   ├── test_config/
│   │   ├── sample_config.py
│   │   └── test_load_config.py
│   ├── test_device/
│   │   ├── test_alpha_beta.py
│   │   ├── test_device_mesh.py
│   │   ├── test_extract_alpha_beta.py
│   │   ├── test_init_logical_pg.py
│   │   └── test_search_logical_device_mesh.py
│   ├── test_fp8/
│   │   ├── test_all_to_all_single.py
│   │   ├── test_fp8_all_to_all.py
│   │   ├── test_fp8_all_to_all_single.py
│   │   ├── test_fp8_allgather.py
│   │   ├── test_fp8_allreduce.py
│   │   ├── test_fp8_cast.py
│   │   ├── test_fp8_ddp_comm_hook.py
│   │   ├── test_fp8_fsdp_comm_hook.py
│   │   ├── test_fp8_hook.py
│   │   ├── test_fp8_linear.py
│   │   └── test_fp8_reduce_scatter.py
│   ├── test_fx/
│   │   ├── test_codegen/
│   │   │   ├── test_activation_checkpoint_codegen.py
│   │   │   ├── test_nested_activation_checkpoint_codegen.py
│   │   │   └── test_offload_codegen.py
│   │   ├── test_coloproxy.py
│   │   ├── test_comm_size_compute.py
│   │   ├── test_graph_manipulation.py
│   │   ├── test_meta/
│   │   │   ├── test_aten.py
│   │   │   ├── test_backward.py
│   │   │   └── test_meta_trace.py
│   │   ├── test_meta_info_prop.py
│   │   ├── test_parallel_1d.py
│   │   ├── test_pipeline/
│   │   │   ├── test_hf_model/
│   │   │   │   ├── hf_utils.py
│   │   │   │   ├── test_albert.py
│   │   │   │   ├── test_bert.py
│   │   │   │   ├── test_gpt.py
│   │   │   │   ├── test_opt.py
│   │   │   │   └── test_t5.py
│   │   │   ├── test_timm_model/
│   │   │   │   ├── test_timm.py
│   │   │   │   └── timm_utils.py
│   │   │   ├── test_topo/
│   │   │   │   ├── test_topo.py
│   │   │   │   └── topo_utils.py
│   │   │   └── test_torchvision/
│   │   │       └── test_torchvision.py
│   │   ├── test_pipeline_passes.py
│   │   ├── test_profiler/
│   │   │   ├── gpt_utils.py
│   │   │   └── test_profiler_meta_info_prop.py
│   │   └── test_tracer/
│   │       ├── test_activation_checkpoint_annotation.py
│   │       ├── test_bias_addition_module.py
│   │       ├── test_control_flow.py
│   │       ├── test_functional_conv.py
│   │       ├── test_hf_model/
│   │       │   ├── hf_tracer_utils.py
│   │       │   ├── test_hf_albert.py
│   │       │   ├── test_hf_bert.py
│   │       │   ├── test_hf_diffuser.py
│   │       │   ├── test_hf_gpt.py
│   │       │   ├── test_hf_opt.py
│   │       │   └── test_hf_t5.py
│   │       ├── test_patched_module.py
│   │       ├── test_patched_op.py
│   │       ├── test_timm_model/
│   │       │   └── test_timm_model.py
│   │       ├── test_torchaudio_model/
│   │       │   ├── test_torchaudio_model.py
│   │       │   └── torchaudio_utils.py
│   │       ├── test_torchrec_model/
│   │       │   ├── test_deepfm_model.py
│   │       │   └── test_dlrm_model.py
│   │       └── test_torchvision_model/
│   │           └── test_torchvision_model.py
│   ├── test_infer/
│   │   ├── __init__.py
│   │   ├── _utils.py
│   │   ├── test_async_engine/
│   │   │   ├── test_async_engine.py
│   │   │   └── test_request_tracer.py
│   │   ├── test_batch_bucket.py
│   │   ├── test_config_and_struct.py
│   │   ├── test_continuous_batching.py
│   │   ├── test_cuda_graph.py
│   │   ├── test_drafter.py
│   │   ├── test_inference_engine.py
│   │   ├── test_kernels/
│   │   │   ├── __init__.py
│   │   │   ├── cuda/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── test_convert_fp8.py
│   │   │   │   ├── test_flash_decoding_attention.py
│   │   │   │   ├── test_get_cos_and_sin.py
│   │   │   │   ├── test_kv_cache_memcpy.py
│   │   │   │   ├── test_rms_layernorm.py
│   │   │   │   ├── test_rotary_embdding_unpad.py
│   │   │   │   └── test_silu_and_mul.py
│   │   │   └── triton/
│   │   │       ├── __init__.py
│   │   │       ├── kernel_utils.py
│   │   │       ├── test_context_attn_unpad.py
│   │   │       ├── test_decoding_attn.py
│   │   │       ├── test_fused_rotary_embedding.py
│   │   │       ├── test_kvcache_copy.py
│   │   │       ├── test_rmsnorm_triton.py
│   │   │       ├── test_rotary_embdding_unpad.py
│   │   │       └── test_xine_copy.py
│   │   ├── test_kvcache_manager.py
│   │   ├── test_models/
│   │   │   ├── test_attention.py
│   │   │   ├── test_baichuan.py
│   │   │   └── test_custom_model.py
│   │   ├── test_request_handler.py
│   │   ├── test_rpc_engine.py
│   │   └── test_streamingllm.py
│   ├── test_lazy/
│   │   ├── lazy_init_utils.py
│   │   ├── test_from_pretrained.py
│   │   ├── test_models.py
│   │   └── test_ops.py
│   ├── test_legacy/
│   │   ├── test_amp/
│   │   │   ├── test_naive_fp16.py
│   │   │   └── test_torch_fp16.py
│   │   ├── test_comm/
│   │   │   ├── test_boardcast_send_recv_v2.py
│   │   │   ├── test_comm.py
│   │   │   ├── test_object_list_p2p.py
│   │   │   └── test_object_list_p2p_v2.py
│   │   ├── test_context/
│   │   │   ├── configs/
│   │   │   │   ├── parallel_2d_init.py
│   │   │   │   ├── parallel_2p5d_init.py
│   │   │   │   └── parallel_3d_init.py
│   │   │   └── test_hybrid_parallel.py
│   │   ├── test_data/
│   │   │   ├── test_cifar10_dataset.py
│   │   │   ├── test_data_parallel_sampler.py
│   │   │   └── test_deterministic_dataloader.py
│   │   ├── test_engine/
│   │   │   ├── test_engine.py
│   │   │   └── test_gradient_accumluation.py
│   │   ├── test_layers/
│   │   │   ├── test_1d/
│   │   │   │   ├── checks_1d/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── check_layer_1d.py
│   │   │   │   │   └── common.py
│   │   │   │   └── test_1d.py
│   │   │   ├── test_2d/
│   │   │   │   ├── checks_2d/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── check_layer_2d.py
│   │   │   │   │   ├── check_operation_2d.py
│   │   │   │   │   └── common.py
│   │   │   │   └── test_2d.py
│   │   │   ├── test_2p5d/
│   │   │   │   ├── checks_2p5d/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── check_layer_2p5d.py
│   │   │   │   │   ├── check_operation_2p5d.py
│   │   │   │   │   └── common.py
│   │   │   │   └── test_2p5d.py
│   │   │   ├── test_3d/
│   │   │   │   ├── checks_3d/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── check_layer_3d.py
│   │   │   │   │   └── common.py
│   │   │   │   └── test_3d.py
│   │   │   ├── test_cache_embedding.py
│   │   │   └── test_sequence/
│   │   │       ├── checks_seq/
│   │   │       │   ├── __init__.py
│   │   │       │   └── check_layer_seq.py
│   │   │       └── test_sequence.py
│   │   ├── test_moe/
│   │   │   ├── moe_utils.py
│   │   │   ├── test_grad_handler.py
│   │   │   ├── test_moe_group.py
│   │   │   ├── test_moe_hybrid_zero.py
│   │   │   └── test_moe_load_balance.py
│   │   ├── test_pipeline/
│   │   │   ├── rpc_test_utils.py
│   │   │   ├── test_cuda_rpc_chimera.py
│   │   │   ├── test_cuda_rpc_optimizer.py
│   │   │   ├── test_cuda_rpc_pipeline.py
│   │   │   ├── test_cuda_rpc_value_correctness.py
│   │   │   ├── test_middleware_1f1b.py
│   │   │   ├── test_pipelinable.py
│   │   │   └── test_pipeline_process_group.py
│   │   ├── test_tensor/
│   │   │   ├── common_utils/
│   │   │   │   ├── __init__.py
│   │   │   │   └── _utils.py
│   │   │   ├── core/
│   │   │   │   └── test_dist_spec_mgr.py
│   │   │   └── test_parameter.py
│   │   ├── test_trainer/
│   │   │   ├── test_pipeline/
│   │   │   │   ├── test_p2p.py
│   │   │   │   └── test_pipeline_schedule.py
│   │   │   ├── test_trainer_with_non_pipe_schedule.py
│   │   │   └── test_trainer_with_pipe_schedule.py
│   │   ├── test_utils/
│   │   │   ├── test_activation_checkpointing.py
│   │   │   ├── test_checkpoint/
│   │   │   │   ├── test_checkpoint_1d.py
│   │   │   │   ├── test_checkpoint_2d.py
│   │   │   │   ├── test_checkpoint_2p5d.py
│   │   │   │   └── test_checkpoint_3d.py
│   │   │   ├── test_memory.py
│   │   │   └── test_norm_gradient_clipping.py
│   │   └── test_zero/
│   │       └── test_commons.py
│   ├── test_lora/
│   │   └── test_lora.py
│   ├── test_moe/
│   │   ├── moe_utils.py
│   │   ├── test_deepseek_layer.py
│   │   ├── test_kernel.py
│   │   ├── test_mixtral_layer.py
│   │   ├── test_moe_checkpoint.py
│   │   ├── test_moe_ep_tp.py
│   │   └── test_moe_ep_zero.py
│   ├── test_optimizer/
│   │   ├── _utils.py
│   │   ├── test_adam_kernel.py
│   │   ├── test_adam_optim.py
│   │   ├── test_dist_adafactor.py
│   │   ├── test_dist_came.py
│   │   ├── test_dist_galore.py
│   │   ├── test_dist_lamb.py
│   │   ├── test_lr_scheduler.py
│   │   └── test_nvme.py
│   ├── test_pipeline/
│   │   ├── test_p2p_communication.py
│   │   ├── test_pipeline_utils/
│   │   │   ├── test_t5_pipeline_utils.py
│   │   │   └── test_whisper_pipeline_utils.py
│   │   ├── test_schedule/
│   │   │   ├── test_interleaved.py
│   │   │   ├── test_oneF_oneB.py
│   │   │   ├── test_pipeline_schedule_utils.py
│   │   │   └── test_zerobubble_pp.py
│   │   └── test_stage_manager.py
│   ├── test_shardformer/
│   │   ├── __init__.py
│   │   ├── test_flash_attention.py
│   │   ├── test_hybrid_parallel_grad_clip_norm/
│   │   │   ├── test_amp_optimizer.py
│   │   │   ├── test_naive_optimizer.py
│   │   │   └── test_zero_optimizer.py
│   │   ├── test_layer/
│   │   │   ├── test_dist_crossentropy.py
│   │   │   ├── test_dist_log_prob.py
│   │   │   ├── test_dropout.py
│   │   │   ├── test_embedding.py
│   │   │   ├── test_gpt2_qkv_fused_linear_1d.py
│   │   │   ├── test_layernorm.py
│   │   │   ├── test_linear_1d.py
│   │   │   ├── test_qkv_fused_linear_1d.py
│   │   │   ├── test_ring_attn.py
│   │   │   ├── test_sequence_parallel.py
│   │   │   └── test_vocab_parallel_embedding_1d.py
│   │   ├── test_model/
│   │   │   ├── __init__.py
│   │   │   ├── _utils.py
│   │   │   ├── test_shard_bert.py
│   │   │   ├── test_shard_blip2.py
│   │   │   ├── test_shard_bloom.py
│   │   │   ├── test_shard_chatglm2.py
│   │   │   ├── test_shard_command.py
│   │   │   ├── test_shard_deepseek.py
│   │   │   ├── test_shard_deepseek_v3.py
│   │   │   ├── test_shard_falcon.py
│   │   │   ├── test_shard_gpt2.py
│   │   │   ├── test_shard_gptj.py
│   │   │   ├── test_shard_llama.py
│   │   │   ├── test_shard_mistral.py
│   │   │   ├── test_shard_mixtral.py
│   │   │   ├── test_shard_opt.py
│   │   │   ├── test_shard_qwen2.py
│   │   │   ├── test_shard_qwen3.py
│   │   │   ├── test_shard_sam.py
│   │   │   ├── test_shard_t5.py
│   │   │   ├── test_shard_vit.py
│   │   │   └── test_shard_whisper.py
│   │   ├── test_shard_utils.py
│   │   └── test_with_torch_ddp.py
│   ├── test_smoothquant/
│   │   ├── test_llama_attention.py
│   │   ├── test_llama_mlp.py
│   │   ├── test_smoothquant_linear.py
│   │   └── test_sq_rotary_embedding.py
│   ├── test_tensor/
│   │   ├── test_comm_spec_apply.py
│   │   ├── test_dtensor/
│   │   │   ├── test_comm_spec.py
│   │   │   ├── test_dtensor.py
│   │   │   ├── test_dtensor_sharding_spec.py
│   │   │   └── test_layout_converter.py
│   │   ├── test_mix_gather.py
│   │   ├── test_padded_tensor.py
│   │   ├── test_shape_consistency.py
│   │   ├── test_shape_consistency_apply.py
│   │   └── test_sharding_spec.py
│   └── test_zero/
│       ├── test_gemini/
│       │   ├── test_chunk_mgrv2.py
│       │   ├── test_chunkv2.py
│       │   ├── test_gemini_use_rmt.py
│       │   ├── test_grad_accum.py
│       │   ├── test_grad_clip.py
│       │   ├── test_inference.py
│       │   ├── test_optim.py
│       │   ├── test_runtime_mem_tracer.py
│       │   ├── test_search.py
│       │   ├── test_zeroddp_state_dict.py
│       │   └── test_zerooptim_state_dict.py
│       └── test_low_level/
│           ├── test_coll_nd.py
│           ├── test_grad_acc.py
│           ├── test_mem_leak.py
│           ├── test_zero1_2.py
│           └── test_zero_ckpt.py
└── version.txt