Repository: microsoft/unilm
Branch: master
Commit: 833df7e7832e
Files: 5915
Total size: 112.5 MB
Directory structure:
gitextract_cuit3xwy/
├── .github/
│ └── ISSUE_TEMPLATE/
│ ├── bug_report.md
│ └── custom.md
├── .gitignore
├── .gitmodules
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── Diff-Transformer/
│ ├── Diff-Transformer-V2/
│ │ ├── README.md
│ │ └── multihead_flashdiffv2.py
│ ├── README.md
│ ├── example.py
│ ├── kernel/
│ │ └── rotary.py
│ ├── multihead_attention.py
│ ├── multihead_diffattn.py
│ ├── multihead_flashdiff_1.py
│ ├── multihead_flashdiff_2.py
│ └── rms_norm.py
├── LICENSE
├── LatentLM/
│ ├── README.md
│ ├── evaluate_fid.py
│ ├── evaluate_fid_fidelity.py
│ ├── inference_speed.py
│ ├── metrics/
│ │ ├── IS.py
│ │ ├── __init__.py
│ │ ├── fid.py
│ │ └── inception.py
│ ├── models/
│ │ ├── DiT.py
│ │ ├── EMA.py
│ │ ├── RMSNorm.py
│ │ ├── Transformer.py
│ │ ├── __init__.py
│ │ └── kernel/
│ │ ├── rotary.py
│ │ └── swiglu.py
│ ├── sample_hf.py
│ ├── sample_many.py
│ ├── schedule/
│ │ ├── __init__.py
│ │ ├── ddpm.py
│ │ └── dpm_solver.py
│ ├── tokenizer_models/
│ │ ├── __init__.py
│ │ ├── modeling_beit3_vision.py
│ │ ├── modeling_common.py
│ │ ├── modeling_sigma_vae.py
│ │ ├── modeling_utils.py
│ │ └── vae.py
│ ├── train_hf.py
│ └── utils.py
├── NOTICE.md
├── PFPO/
│ ├── README.md
│ ├── apps_train_sub_val_ids.json
│ ├── conf/
│ │ ├── api/
│ │ │ └── vllm/
│ │ │ ├── apps/
│ │ │ │ ├── deepseek_coder/
│ │ │ │ │ ├── dev_v1_0.yaml
│ │ │ │ │ ├── dev_v1_0_fix_bos.yaml
│ │ │ │ │ ├── dev_v1_1.yaml
│ │ │ │ │ ├── dev_v1_1_sample.yaml
│ │ │ │ │ ├── dev_v2_0.yaml
│ │ │ │ │ ├── r2c/
│ │ │ │ │ │ ├── dev_v1_0.yaml
│ │ │ │ │ │ ├── dev_v1_1.yaml
│ │ │ │ │ │ ├── dev_v1_1_sample.yaml
│ │ │ │ │ │ ├── dev_v2_0.yaml
│ │ │ │ │ │ ├── dev_v2_0_sample.yaml
│ │ │ │ │ │ ├── general_combine_train_v2_0.yaml
│ │ │ │ │ │ ├── general_combine_train_v2_0_prefix_completion.yaml
│ │ │ │ │ │ ├── general_combine_train_v2_1_4o_non_sc.yaml
│ │ │ │ │ │ ├── sub_dev_v1_1.yaml
│ │ │ │ │ │ ├── sub_dev_v2_0.yaml
│ │ │ │ │ │ ├── train_v1_0.yaml
│ │ │ │ │ │ ├── train_v1_0_s43.yaml
│ │ │ │ │ │ ├── train_v2_0.yaml
│ │ │ │ │ │ ├── train_v2_0_prefix_completion.yaml
│ │ │ │ │ │ └── xcode_train_v2_0.yaml
│ │ │ │ │ ├── sub_dev_v1_1.yaml
│ │ │ │ │ ├── sub_dev_v2_0.yaml
│ │ │ │ │ ├── test_inputs_gen/
│ │ │ │ │ │ ├── sub_dev_v1_0.yaml
│ │ │ │ │ │ └── test_v1_0.yaml
│ │ │ │ │ ├── train_v1_0.yaml
│ │ │ │ │ └── train_v2_0.yaml
│ │ │ │ └── general_eval/
│ │ │ │ ├── dev_v2_0.yaml
│ │ │ │ ├── dev_v2_1.yaml
│ │ │ │ └── dev_v2_2.yaml
│ │ │ ├── human_eval/
│ │ │ │ ├── ds_coder/
│ │ │ │ │ ├── r2c/
│ │ │ │ │ │ ├── test_v1_0.yaml
│ │ │ │ │ │ ├── test_v1_0_local.yaml
│ │ │ │ │ │ ├── test_v2_0_local.yaml
│ │ │ │ │ │ ├── test_v2_1_local.yaml
│ │ │ │ │ │ └── test_v2_2_local.yaml
│ │ │ │ │ ├── test_v1_0_local.yaml
│ │ │ │ │ └── test_v2_0.yaml
│ │ │ │ ├── test_v2_1.yaml
│ │ │ │ └── test_v2_2.yaml
│ │ │ ├── magicoder/
│ │ │ │ ├── llama3/
│ │ │ │ │ └── test_case_input_gen_v1_0.yaml
│ │ │ │ └── mistral/
│ │ │ │ ├── func_head_extract_v1_0.yaml
│ │ │ │ └── test_case_input_gen_v1_0.yaml
│ │ │ ├── mathscale/
│ │ │ │ ├── 4o_mathstral_train_0shot_v1_0.yaml
│ │ │ │ ├── 4o_mathstral_train_0shot_v1_0_completion.yaml
│ │ │ │ ├── 4o_mathstral_train_0shot_v1_1.yaml
│ │ │ │ ├── 4o_mathstral_train_0shot_v1_1_completion.yaml
│ │ │ │ ├── 4o_mathstral_train_half_0shot_v1_0.yaml
│ │ │ │ ├── 4o_mathstral_train_half_0shot_v1_0_completion.yaml
│ │ │ │ ├── mathstral/
│ │ │ │ │ ├── deepseek_test_0shot_tem_v1_1.yaml
│ │ │ │ │ ├── mistral_mathscale4o_labeling.yaml
│ │ │ │ │ ├── mistral_train_0shot_iter0_v1_0.yaml
│ │ │ │ │ ├── test_0shot_tem_v1_1.yaml
│ │ │ │ │ ├── test_0shot_tem_v1_1_step.yaml
│ │ │ │ │ ├── test_0shot_tem_v1_1_step_seed.yaml
│ │ │ │ │ ├── test_0shot_tem_v2_0_step.yaml
│ │ │ │ │ └── test_0shot_tem_v3_0_step.yaml
│ │ │ │ ├── mistral_train_0shot_v1_0.yaml
│ │ │ │ ├── mistral_train_0shot_v1_1.yaml
│ │ │ │ ├── mistral_train_0shot_v1_2.yaml
│ │ │ │ ├── numina_hard_train_0shot_v1_0_completion.yaml
│ │ │ │ ├── numina_hard_train_0shot_v1_0_seed.yaml
│ │ │ │ ├── numina_rewrite_qwen25_0shot_v1_0.yaml
│ │ │ │ ├── numina_train_0shot_v1_0.yaml
│ │ │ │ ├── numina_train_0shot_v1_0_completion.yaml
│ │ │ │ ├── test_0shot_tem_v1_1.yaml
│ │ │ │ └── test_0shot_tem_v1_1_step.yaml
│ │ │ ├── mbpp_sanitized/
│ │ │ │ ├── r2c/
│ │ │ │ │ ├── test_3shot_v2_0.yaml
│ │ │ │ │ ├── test_v1_0.yaml
│ │ │ │ │ └── test_v1_0_local.yaml
│ │ │ │ ├── test_3shot_v1_0.yaml
│ │ │ │ ├── test_3shot_v1_0_local.yaml
│ │ │ │ ├── test_v1_0_local.yaml
│ │ │ │ ├── test_v1_1_local.yaml
│ │ │ │ ├── test_v2_0_local.yaml
│ │ │ │ ├── test_v2_1_local.yaml
│ │ │ │ └── test_v2_2_local.yaml
│ │ │ ├── mwp-bench/
│ │ │ │ ├── deepseek_test_0shot_v1_1.yaml
│ │ │ │ ├── llama_base/
│ │ │ │ │ └── college_math_test_4shot_v1_0.yaml
│ │ │ │ ├── llama_chat/
│ │ │ │ │ ├── dev_0shot_v1_0.yaml
│ │ │ │ │ ├── math_test_0shot_v1_0.yaml
│ │ │ │ │ ├── math_test_0shot_v3_0.yaml
│ │ │ │ │ └── test_0shot_v1_0.yaml
│ │ │ │ ├── mathstral_dev_0shot_self_correct_v1_0.yaml
│ │ │ │ ├── mathstral_dev_0shot_v1_0.yaml
│ │ │ │ ├── mathstral_test_0shot_self_correct_v1_0.yaml
│ │ │ │ ├── mathstral_test_0shot_v1_0.yaml
│ │ │ │ ├── mathstral_test_gaokao_2023_0shot_v1_0.yaml
│ │ │ │ ├── mathstral_test_gsm8k_0shot_v1_0.yaml
│ │ │ │ ├── mistral/
│ │ │ │ │ ├── dev_0shot_v1_0.yaml
│ │ │ │ │ └── test_0shot_v1_0.yaml
│ │ │ │ └── mistral_dev_0shot_v1_0.yaml
│ │ │ └── vllm_params/
│ │ │ ├── sampling_param_greedy.yaml
│ │ │ └── sampling_param_sample.yaml
│ │ ├── deepspeed/
│ │ │ ├── fp16.yaml
│ │ │ ├── train_hybrid_engine_zero0.yaml
│ │ │ ├── train_hybrid_engine_zero1.yaml
│ │ │ ├── train_hybrid_engine_zero1_cosine.yaml
│ │ │ ├── train_hybrid_engine_zero1_lr.yaml
│ │ │ ├── train_hybrid_engine_zero1_optim_offload.yaml
│ │ │ ├── train_hybrid_engine_zero1_optim_offload_cosine.yaml
│ │ │ ├── train_hybrid_engine_zero1_optim_offload_lr.yaml
│ │ │ ├── train_hybrid_engine_zero1_wo_optim.yaml
│ │ │ ├── train_hybrid_engine_zero2.yaml
│ │ │ ├── train_hybrid_engine_zero2_cosine.yaml
│ │ │ ├── train_hybrid_engine_zero2_lr.yaml
│ │ │ ├── train_hybrid_engine_zero2_optim_offload.yaml
│ │ │ ├── train_hybrid_engine_zero2_optim_offload_cosine.yaml
│ │ │ ├── train_hybrid_engine_zero3.yaml
│ │ │ ├── train_hybrid_engine_zero3_cosine.yaml
│ │ │ ├── train_hybrid_engine_zero3_optim_offload.yaml
│ │ │ └── train_hybrid_engine_zero3_optim_offload_cosine.yaml
│ │ ├── exp/
│ │ │ ├── apps/
│ │ │ │ ├── code_gen/
│ │ │ │ │ └── deepseek_coder/
│ │ │ │ │ ├── dpo/
│ │ │ │ │ │ ├── orig-pseudo-v1.0-a100.yaml
│ │ │ │ │ │ ├── orig-v1.0-v100.yaml
│ │ │ │ │ │ ├── orig-v1.1-v100-tp2.yaml
│ │ │ │ │ │ ├── orig-v1.1-v100-tp4.yaml
│ │ │ │ │ │ ├── orig-v1.1-v100.yaml
│ │ │ │ │ │ ├── orig-v1.2-v100-tp4.yaml
│ │ │ │ │ │ ├── orig-v1.3-a100.yaml
│ │ │ │ │ │ ├── orig-v1.3-v100-tp4.yaml
│ │ │ │ │ │ ├── orig-v1.4-a100.yaml
│ │ │ │ │ │ ├── orig-v1.4-v100-tp4.yaml
│ │ │ │ │ │ ├── pseudo-sc-dpo-v1.0-v100-tp8.yaml
│ │ │ │ │ │ ├── pseudo-sc-dpo-v1.1-h100.yaml
│ │ │ │ │ │ ├── pseudo-sc-dpo-v1.1-v100-tp8.yaml
│ │ │ │ │ │ ├── pseudo-sc-dpo-v1.2-a100.yaml
│ │ │ │ │ │ └── pseudo-sc-dpo-v1.2-v100-tp8.yaml
│ │ │ │ │ └── sft/
│ │ │ │ │ ├── v1.0-a100.yaml
│ │ │ │ │ └── v1.0-v100.yaml
│ │ │ │ ├── r2c_generation/
│ │ │ │ │ └── deepseek_coder/
│ │ │ │ │ ├── dpo/
│ │ │ │ │ │ ├── deprecated/
│ │ │ │ │ │ │ └── sft-v1.0-v100-tp4.yaml
│ │ │ │ │ │ ├── gpt4o-distil-4o-ps-test-pdpo-h100-v1.0.yaml
│ │ │ │ │ │ ├── gpt4o-distil-4o-ps-test-pdpo-h100-v1.1.yaml
│ │ │ │ │ │ ├── gpt4o-distil-4o-self-mix-ps-test-v1.0-mi300x-dp16.yaml
│ │ │ │ │ │ ├── gpt4o-distil-4o-self-mix-ps-test-v1.0-mi300x.yaml
│ │ │ │ │ │ ├── gpt4o-distil-4o-self-mix-ps-test-v1.1-mi300x.yaml
│ │ │ │ │ │ ├── gpt4o-distil-ps-pdpo-ctr-ts-num-v1.0-mi300x-dp32.yaml
│ │ │ │ │ │ ├── gpt4o-distil-pseudo-v1.0-a100.yaml
│ │ │ │ │ │ ├── gpt4o-distil-self-pseudo-v1.0-a100.yaml
│ │ │ │ │ │ ├── gpt4o-distil-self-pseudo-v1.0-v100.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v1.0-H100-4o-ps-test.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v2.0-v100.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v3.0-a100.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v3.1-rm-a100.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v3.2-v100.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.0-v100-ps-test.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.1-H100-ps-pdpo.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.10-V100-ps-pdpo-rerun.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.2-H100-gd-pdpo.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.2-v100-gd-pdpo.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.3-H100-ps-pdpo.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.3-V100-ps-pdpo-rerun.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.3-v100-gd-pdpo.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.4-H100-ps-pdpo.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.4-V100-ps-pdpo-rerun.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.5-A100-ps-pdpo.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.5-v100-ps-pdpo.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.6-v100-ps-pdpo.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.7-A100-ps-pdpo.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.8-A100-ps-pdpo.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.9-V100-ps-pdpo.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.9.1-V100-ps-pdpo.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.9.2-V100-ps-pdpo.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v4.9.3-A100-ps-pdpo.yaml
│ │ │ │ │ │ ├── iter1/
│ │ │ │ │ │ │ ├── gpt4o-distil-apps-mc-v1.0-mi300x-hybrid.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-apps-mc-v1.1-mi300x-hybrid.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-pdpo-v1.0-a100-40-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-pdpo-v1.1-v100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-pdpo-v1.2-h100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-v1.0-H100-4o-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-v1.0-v100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-v1.1-H100-4o-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-v1.1-a100-40-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-v1.1-v100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-v1.2-H100-4o-ps-test.yaml
│ │ │ │ │ │ │ └── gpt4o-distil-combine-v1.2-a100-40-ps-test.yaml
│ │ │ │ │ │ ├── iter2/
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-dpo-n64sc-v1.0-A100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-dpo-n64sc-v1.1-A100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-dpo-n64sc-v1.2-A100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-dpo-n64sc-v1.2-V100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-dpo-n64sc-v1.3-A100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-dpo-n64sc-v1.4-A100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-pdpo-v1.0-h100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-pdpo-v1.1-h100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-pdpo-v1.1-v100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-pdpo-v1.2-v100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-pdpo-v1.3-h100-fix-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-pdpo-v1.3-v100-fix-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-pdpo-v1.3-v100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-pdpo-v2.0-h100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-pdpo-v2.1-h100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-v1.0-H100-ps-test.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-v1.0-mi300x-hybrid.yaml
│ │ │ │ │ │ │ ├── gpt4o-distil-combine-v1.1-mi300x-hybrid.yaml
│ │ │ │ │ │ │ └── gpt4o-distil-combine-v1.2-mi300x-hybrid.yaml
│ │ │ │ │ │ └── iter3/
│ │ │ │ │ │ ├── gpt4o-distil-combine-pdpo-v1.0-h100-ps-test.yaml
│ │ │ │ │ │ ├── gpt4o-distil-combine-pdpo-v1.1-h100-ps-test.yaml
│ │ │ │ │ │ └── gpt4o-distil-combine-pdpo-v1.2-h100-ps-test.yaml
│ │ │ │ │ └── sft/
│ │ │ │ │ ├── deprecated/
│ │ │ │ │ │ ├── gpt4o-distil-v1.0-v100.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v1.1-v100.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v2.0-v100.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v2.1-v100-tp.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v2.1-v100.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v2.2-v100.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v2.3-v100.yaml
│ │ │ │ │ │ ├── gpt4o-distil-v2.5-v100.yaml
│ │ │ │ │ │ └── gpt4o-distil-v2.6-v100.yaml
│ │ │ │ │ ├── gpt4o-distil-v2.4-a100.yaml
│ │ │ │ │ ├── gpt4o-distil-v2.4-v100-fix-2node-test.yaml
│ │ │ │ │ ├── gpt4o-distil-v2.4-v100-fix.yaml
│ │ │ │ │ ├── gpt4o-distil-v2.4-v100.yaml
│ │ │ │ │ ├── gpt4o-distil-v3.0-a100.yaml
│ │ │ │ │ ├── gpt4o-distil-v3.0-v100.yaml
│ │ │ │ │ ├── gpt4o-distil-v3.1-v100-test.yaml
│ │ │ │ │ └── gpt4o-distil-v3.1-v100.yaml
│ │ │ │ └── test_input_gen/
│ │ │ │ └── deepseek_coder/
│ │ │ │ └── sft/
│ │ │ │ └── v1.0-a100.yaml
│ │ │ └── mathscale/
│ │ │ ├── llama/
│ │ │ │ ├── dpo/
│ │ │ │ │ ├── iter1/
│ │ │ │ │ │ ├── llama3.1-dpo-4o-iter0-v1.0-H100.yaml
│ │ │ │ │ │ ├── llama3.1-dpo-4o-iter0-v1.1-A100-40.yaml
│ │ │ │ │ │ ├── llama3.1-pdpo-4o-iter1-1.0-A100.yaml
│ │ │ │ │ │ └── llama3.1-pdpo-4o-iter1-1.1-v100.yaml
│ │ │ │ │ ├── llama3.1-dpo-4o-iter0-v1.0-v100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-4o-iter0-v1.0-v100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-4o-iter0-v1.1-H100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-4o-iter0-v1.2-V100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-4o-iter0-v1.2-a100-40.yaml
│ │ │ │ │ ├── llama3.1-pdpo-4o-iter0-v2.0-v100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-4o-iter0-v2.1-a100-40.yaml
│ │ │ │ │ ├── llama3.1-pdpo-4o-iter0-v2.1-v100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-4o-iter0-v2.2-A100.yaml
│ │ │ │ │ └── numina-co/
│ │ │ │ │ ├── llama3.1-pdpo-iter1-1.0-split01-p0.0-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter1-1.0-split01-p0.5-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter1-split0123-cross2-p0.5-v1.0-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter1-split0123-cross2-p0.5-v1.1-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter1-split0123-p0.5-v1.0-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter1-split0123-p0.5-v1.1-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter1-split23-p0.0-v1.0-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter1-split23-p0.0-v1.1-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter1-split23-p0.5-v1.0-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter1-split23-p0.5-v1.1-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter2-split01-23-p0.5-v1.0-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter2-split01-23-p0.5-v1.1-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter2-split01-23-p0.5-v1.2-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter2-split01-23-p0.5-v1.3-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter2-split01-23-p0.5-v1.4-a100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter2-split01-23-p0.5-v1.4-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter2-split01-23-p0.5-v1.4-v100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter3-split01-23-45-p0.5-v1.0-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter3-split01-23-45-p0.5-v1.1-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter3-split01-23-45-p0.5-v1.2-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter3-split01-23-45-p0.5-v1.3-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter3-split01-23-45-p0.5-v1.3-v100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter3-split01-23-45-p0.5-v1.4-a100-dp16.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter3-split01-23-45-p0.5-v1.4-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter4-split01-23-45-67-p0.0-v1.1-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter4-split01-23-45-67-p0.0-v1.2-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter4-split01-23-45-67-p0.0-v1.3-v100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter4-split01-23-45-67-p0.0-v1.4-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter4-split01-23-45-67-p0.0-v1.5-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter4-split01-23-45-67-p0.0-v1.6-a100-40.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter4-split01-23-45-67-p0.0-v1.6-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter4-split01-23-45-67-p0.0-v1.7-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter4-split01-23-45-6789-p0.0-v1.1-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter4-split01-23-45-6789-p0.0-v1.2-a100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter4-split01-23-45-6789-p0.0-v1.3-v100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter4-split01-23-45-6789-p0.0-v1.5-h100-dp16.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter4-split01-23-45-6789-p0.0-v1.5-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter4-split01-23-45-6789-p0.0-v1.5-v100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter4-split01-23-45-p0.0-v1.0-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter5-split01-23-45-67-89-p0.2-v1.0-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter5-split01-23-45-67-89-p0.2-v1.1-h100.yaml
│ │ │ │ │ ├── llama3.1-pdpo-iter5-split01-23-45-67-89-p0.4-v1.2-h100.yaml
│ │ │ │ │ └── llama3.1-pdpo-iter5-split01-23-45-67-89-p0.5-v1.3-a100-40.yaml
│ │ │ │ └── sft/
│ │ │ │ ├── 70b-sft-v1.0-mi300x.yaml
│ │ │ │ ├── 70b-sft-v1.1-mi300x.yaml
│ │ │ │ ├── 70b-sft-v1.2-mi300x.yaml
│ │ │ │ └── 70b-sft-v2.0-mi300x.yaml
│ │ │ └── mistral/
│ │ │ ├── dpo/
│ │ │ │ ├── co-half-0/
│ │ │ │ │ ├── mathstral-co-pdpo-half0-iter0-v1.0-a100.yaml
│ │ │ │ │ ├── mathstral-co-pdpo-half0-iter0-v1.1-h100.yaml
│ │ │ │ │ ├── mathstral-co-pdpo-half0-iter0-v1.2-h100.yaml
│ │ │ │ │ ├── mathstral-co-pdpo-half0-iter0-v1.3-a100.yaml
│ │ │ │ │ └── mathstral-co-pdpo-sc-half0-iter0-p0.0-v1.0-a100.yaml
│ │ │ │ ├── co-half-1/
│ │ │ │ │ ├── mathstral-co-pdpo-half1-iter0-v1.0-a100.yaml
│ │ │ │ │ └── mathstral-co-pdpo-sc-half1-iter1-p0.0-v1.0-a100.yaml
│ │ │ │ ├── iter-2-mscale-v0.1/
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter2-v1.0-A100-40.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter2-v1.0-V100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter2-v1.1-A100-40.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter2-v1.1-H100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter2-v1.1-V100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter2-v1.2-V100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter2-v1.3-A100-40.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter2-v1.3-A100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter2-v1.3-H100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter2-v1.3-V100.yaml
│ │ │ │ │ └── mathstral-pdpo-mscale300k-iter2-v1.3.1-A100-40.yaml
│ │ │ │ ├── iter-3-mscale-v0.1/
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter3-v1.0-V100.yaml
│ │ │ │ │ └── mathstral-pdpo-mscale300k-iter3-v1.1-A100.yaml
│ │ │ │ ├── iter1/
│ │ │ │ │ ├── mathstral-dpo-4o-iter1-v1.0-v100.yaml
│ │ │ │ │ ├── mathstral-dpo-4o-iter1-v1.1-a100.yaml
│ │ │ │ │ ├── mathstral-dpo-4o-iter1-v1.2-h100.yaml
│ │ │ │ │ ├── mathstral-dpo-4o-iter1-v1.3-h100.yaml
│ │ │ │ │ ├── mathstral-dpo-4o-iter1-v1.4-v100.yaml
│ │ │ │ │ ├── mathstral-dpo-4o-iter1-v1.5-h100.yaml
│ │ │ │ │ ├── mathstral-pdpo-4o-iter1-v1.0-H100.yaml
│ │ │ │ │ ├── mathstral-raft-dpo-4o-iter1-v2.0-h100.yaml
│ │ │ │ │ ├── mathstral-raft-dpo-4o-iter1-v2.1-h100.yaml
│ │ │ │ │ ├── mathstral-sc-dpo-4o-iter1-v1.0-a100-40.yaml
│ │ │ │ │ ├── mathstral-sc-dpo-4o-iter1-v1.1-a100.yaml
│ │ │ │ │ ├── mathstral-sc-dpo-4o-iter1-v1.2-a100-40.yaml
│ │ │ │ │ ├── mathstral-sc-dpo-numina-iter1-v1.0-h100.yaml
│ │ │ │ │ ├── mathstral-sc-pdpo-4o-iter1-v1.0-H100.yaml
│ │ │ │ │ ├── mathstral-sc-pdpo-4o-iter1-v1.1-H100.yaml
│ │ │ │ │ ├── mathstral-sc-pdpo-4o-iter1-v1.2-A100.yaml
│ │ │ │ │ ├── mathstral-sc-pdpo-4o-iter1-v1.3-A100-40.yaml
│ │ │ │ │ ├── mathstral-sc-pdpo-4o-iter1-v1.4-A100-40.yaml
│ │ │ │ │ ├── mathstral-sc-pdpo-4o-iter1-v1.4-H100.yaml
│ │ │ │ │ ├── mathstral-sc-pdpo-4o-iter1-v2.0-H100.yaml
│ │ │ │ │ ├── mathstral-sc-pdpo-4o-iter1-v2.1-H100.yaml
│ │ │ │ │ ├── mathstral-sc-pdpo-4o-iter1-v2.2-H100.yaml
│ │ │ │ │ ├── mathstral-sc-pdpo-numina-iter1-v2.0-h100.yaml
│ │ │ │ │ ├── mathstral-sc-pdpo-numina-iter1-v2.1-A100-40.yaml
│ │ │ │ │ ├── mathstral-sc-pdpo-numina-iter1-v2.1-a100.yaml
│ │ │ │ │ ├── mathstral-sc-pdpo-numina-iter1-v2.2-h100.yaml
│ │ │ │ │ ├── mathstral-sc-pdpo-numina-iter1-v2.3-A100-40.yaml
│ │ │ │ │ ├── mathstral-sc-pdpo-numina-iter1-v2.4-A100-40.yaml
│ │ │ │ │ ├── mathstral-sc-prm-4o-iter1-v1.0-H100.yaml
│ │ │ │ │ └── mathstral-sc-prm-4o-iter1-v1.1-A100-40.yaml
│ │ │ │ ├── iter1-mscale-v0.1/
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-4o-iter1-v1.0-MI300x.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v1.0-H100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v1.1-H100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v1.2-H100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v1.3-A100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v1.4-A100-40.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v1.4-H100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v1.5-A100-40.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v1.5-H100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v1.6-H100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v1.7-H100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v2.0-H100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v2.1-H100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v3.0-H100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v3.1-A100-40.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v3.1-H100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v3.1-V100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v3.2-A100.yaml
│ │ │ │ │ ├── mathstral-pdpo-mscale300k-iter1-v3.2-v100.yaml
│ │ │ │ │ └── mathstral-sc-dpo-mscale300k-iter1-v1.0-H100.yaml
│ │ │ │ ├── mathstral-dpo-4o-iter0-v1.0-a100.yaml
│ │ │ │ ├── mathstral-dpo-4o-iter0-v1.1-a100.yaml
│ │ │ │ ├── mathstral-dpo-4o-iter0-v1.2-a100.yaml
│ │ │ │ ├── mathstral-dpo-full-v1.0-a100.yaml
│ │ │ │ ├── mathstral-dpo-split1-v1.0-a100.yaml
│ │ │ │ ├── mathstral-dpo-split1-v1.0-v100.yaml
│ │ │ │ ├── mathstral-pdpo-4o-iter0-v1.1-a100.yaml
│ │ │ │ ├── mathstral-pdpo-4o-iter0-v1.2-a100.yaml
│ │ │ │ ├── mathstral-pdpo-4o-iter0-v1.3-a100-40.yaml
│ │ │ │ ├── mathstral-pdpo-4o-iter0-v2.0-A100.yaml
│ │ │ │ ├── mathstral-pdpo-4o-iter0-v2.1-A100-40-tp2.yaml
│ │ │ │ ├── mathstral-pdpo-4o-iter0-v2.1-A100-40.yaml
│ │ │ │ ├── mathstral-pdpo-4o-iter0-v2.1-H100.yaml
│ │ │ │ ├── mathstral-pdpo-4o-iter0-v2.1-V100.yaml
│ │ │ │ ├── mathstral-pdpo-4o-iter0-v2.2-V100.yaml
│ │ │ │ ├── mathstral-pdpo-4o-iter0-v2.2.1-H100.yaml
│ │ │ │ ├── mathstral-pdpo-4o-iter0-v2.2.2-A100.yaml
│ │ │ │ ├── mathstral-pdpo-4o-iter0-v2.3-H100.yaml
│ │ │ │ ├── mathstral-pdpo-4o-iter0-v2.4-H100.yaml
│ │ │ │ ├── mathstral-pdpo-4o-iter0-v2.4-V100.yaml
│ │ │ │ ├── mathstral-pdpo-sc-iter0-v1.0-H100.yaml
│ │ │ │ ├── mathstral-pdpo-sc-iter0-v1.1-H100.yaml
│ │ │ │ ├── mathstral-pdpo-sc-iter0-v2.0-H100.yaml
│ │ │ │ ├── mathstral-pdpo-sc-iter0-v2.1-A100.yaml
│ │ │ │ ├── ms-mistral-dpo-split1-v1.0-v100.yaml
│ │ │ │ ├── ms-mistral-dpo-split1-v1.1-v100.yaml
│ │ │ │ ├── ms-mistral-dpo-split1-v1.2-v100.yaml
│ │ │ │ ├── ms-mistral-dpo-split2-v1.0-v100.yaml
│ │ │ │ ├── ms-mistral-dpo-split2-v1.1-v100.yaml
│ │ │ │ └── reverse_order/
│ │ │ │ ├── mathstral-pre-sc-pdpo-4o-iter1-v1.0-H100.yaml
│ │ │ │ ├── mathstral-pre-sc-pdpo-4o-iter1-v1.0-V100.yaml
│ │ │ │ ├── mathstral-pre-sc-pdpo-mscale-iter1-v1.0-H100.yaml
│ │ │ │ ├── mathstral-pre-sc-pdpo-mscale-iter1-v1.1-H100.yaml
│ │ │ │ ├── mathstral-pre-sc-pdpo-mscale-iter1-v1.2-H100.yaml
│ │ │ │ ├── mathstral-pre-sc-pdpo-mscale-iter1-v1.3-H100-dp8.yaml
│ │ │ │ ├── mathstral-pre-sc-pdpo-mscale-iter1-v1.3-H100.yaml
│ │ │ │ ├── mathstral-pre-sc-pdpo-mscale-iter2-4o-gd-v1.0-H100-dp16.yaml
│ │ │ │ └── mathstral-pre-sc-pdpo-mscale-iter2-4o-gd-v1.1-H100-dp16.yaml
│ │ │ ├── reward/
│ │ │ │ └── iter1/
│ │ │ │ ├── mathstral-sc-prm-4o-iter1-v1.0-h100.yaml
│ │ │ │ ├── mathstral-sc-prm-4o-iter1-v1.0-v100.yaml
│ │ │ │ ├── mathstral-sc-prm-mscale-iter2-v1.0-v100.yaml
│ │ │ │ ├── mathstral-sc-prm-mscale-iter3-v1.0-v100.yaml
│ │ │ │ ├── process-rm-predict-flat.yaml
│ │ │ │ └── process-rm-predict-single.yaml
│ │ │ └── sft/
│ │ │ ├── co-half-0/
│ │ │ │ └── mathstral-mathscale4o-sft-v1.0-v100.yaml
│ │ │ ├── co-half-1/
│ │ │ │ └── mathstral-mathscale4o-sft-v1.0-v100.yaml
│ │ │ ├── iter1/
│ │ │ │ ├── mathstral-mathscale4o-raft-v1.0-h100.yaml
│ │ │ │ ├── mathstral-mathscale4o-raft-v1.1-a100-40.yaml
│ │ │ │ └── mathstral-mathscale4o-raft-v1.1-h100.yaml
│ │ │ ├── mathstral-mathscale4o-sft-v1.0-a100.yaml
│ │ │ ├── mathstral-mathscale4o-sft-v1.1-v100.yaml
│ │ │ ├── mathstral-mathscale4o-sft-v1.2-v100.yaml
│ │ │ ├── mathstral-mathscale4o-sft-v2.0-v100.yaml
│ │ │ └── mistral-mathscale4o-sft-v1.0-v100.yaml
│ │ ├── hydra/
│ │ │ └── default.yaml
│ │ └── post_process/
│ │ ├── deepseek.yaml
│ │ ├── gsm8k.yaml
│ │ ├── math.yaml
│ │ ├── openai_cot.yaml
│ │ └── openai_react.yaml
│ ├── data/
│ │ ├── apps.py
│ │ ├── code_contest.py
│ │ ├── combine_dataset.py
│ │ ├── deepseek_math_utils/
│ │ │ ├── answer_extraction.py
│ │ │ ├── eval_script.py
│ │ │ ├── eval_utils.py
│ │ │ └── ocwcourses_eval_utils.py
│ │ ├── general_collator.py
│ │ ├── human_eval.py
│ │ ├── input_aligner.py
│ │ ├── input_utils.py
│ │ ├── math.py
│ │ ├── math_reader.py
│ │ ├── math_util.py
│ │ ├── mathscale/
│ │ │ └── util.py
│ │ ├── numina_math.py
│ │ ├── openai_api_caller.py
│ │ ├── qwen25math/
│ │ │ ├── LICENSE
│ │ │ ├── README.md
│ │ │ ├── data_loader.py
│ │ │ ├── evaluate.py
│ │ │ ├── examples.py
│ │ │ ├── grader.py
│ │ │ ├── math_eval.py
│ │ │ ├── math_utils.py
│ │ │ ├── model_utils.py
│ │ │ ├── parser.py
│ │ │ ├── python_executor.py
│ │ │ ├── requirements.txt
│ │ │ ├── trajectory.py
│ │ │ └── utils.py
│ │ └── vllm.py
│ ├── eval/
│ │ ├── codex_humaneval/
│ │ │ ├── data.py
│ │ │ ├── evaluation.py
│ │ │ ├── execution.py
│ │ │ └── run_eval.py
│ │ ├── dispatch_openai_requests.py
│ │ ├── mbpp_eval/
│ │ │ ├── execute.py
│ │ │ ├── run_eval.py
│ │ │ └── utils.py
│ │ └── utils.py
│ ├── general_util/
│ │ ├── __init__.py
│ │ ├── average_meter.py
│ │ ├── dist_utils.py
│ │ ├── evaluator.py
│ │ ├── fs_tp_utils.py
│ │ ├── fsdp_utils.py
│ │ ├── lightseq_utils.py
│ │ ├── logger.py
│ │ ├── mixin.py
│ │ ├── mpu_proxy.py
│ │ ├── tensorboard_helper.py
│ │ ├── tokenization_utils.py
│ │ ├── torch_fsdp_utils.py
│ │ ├── training_utils.py
│ │ └── transformer_engine.py
│ ├── models/
│ │ ├── dpo_utils.py
│ │ ├── ds_utils.py
│ │ ├── fs_tp_mixin.py
│ │ ├── llama.py
│ │ ├── llama_megatron_tp.py
│ │ ├── llama_tp.py
│ │ ├── megatron_tp_mixin.py
│ │ ├── mistral.py
│ │ ├── mistral_tp.py
│ │ ├── mixin.py
│ │ ├── qwen2.py
│ │ ├── qwen2_megatron_tp.py
│ │ ├── qwen2_tp.py
│ │ └── utils.py
│ ├── openai_api_caller_v1.py
│ ├── post_inference.py
│ ├── post_processors/
│ │ ├── code/
│ │ │ ├── clean.py
│ │ │ ├── code.py
│ │ │ └── evaluator.py
│ │ ├── dist_mixin.py
│ │ ├── dpo.py
│ │ ├── openai_api_callback.py
│ │ ├── pattern/
│ │ │ └── tags.py
│ │ └── qwen25_math_callback.py
│ ├── prompts/
│ │ ├── apps/
│ │ │ ├── critique_0shot_v1.0.txt
│ │ │ ├── magicoder_cls_2shot.txt
│ │ │ ├── r2c_prompt_0shot_v1.0.txt
│ │ │ ├── r2c_prompt_1shot_v1.0.txt
│ │ │ ├── test_case_simulate.v1.0.txt
│ │ │ ├── test_input_gen_0shot_v1.0.txt
│ │ │ ├── test_input_gen_2shot_v2.0.txt
│ │ │ ├── test_input_gen_2shot_v2.1.txt
│ │ │ ├── worsen_0shot_v1.0.txt
│ │ │ └── worsen_from_feedback_0shot_v1.0.txt
│ │ ├── human_eval/
│ │ │ ├── ds_coder_prompt_v1_0.txt
│ │ │ ├── r2c_prompt_0shot_v1.0.txt
│ │ │ ├── r2c_prompt_0shot_v1.1.txt
│ │ │ ├── r2c_prompt_0shot_v1.2.txt
│ │ │ └── r2c_prompt_0shot_v1.3.txt
│ │ ├── magicoder/
│ │ │ ├── oss_has_function_head_v1_0.txt
│ │ │ └── test_input_gen_2shot_v1.0.txt
│ │ ├── math/
│ │ │ └── college_math_4shot.txt
│ │ └── mbpp/
│ │ ├── r2c_prompt_0shot_v1.0.txt
│ │ ├── r2c_prompt_3shot_v1.0.txt
│ │ └── r2c_prompt_3shot_v2.0.txt
│ ├── requirements.txt
│ ├── scripts/
│ │ ├── __init__.py
│ │ ├── apps/
│ │ │ ├── __init__.py
│ │ │ ├── analyze/
│ │ │ │ ├── freq2image.py
│ │ │ │ ├── get_output_frequency.py
│ │ │ │ └── pipeline.sh
│ │ │ ├── code_flaw/
│ │ │ │ └── pipeline_v1.0.sh
│ │ │ ├── construct_prefer_pair.py
│ │ │ ├── construct_prefer_pair_rm.py
│ │ │ ├── construct_prefer_pair_soft.py
│ │ │ ├── eval_gpt4_outputs.py
│ │ │ ├── execute_gold_sol_on_test_case.py
│ │ │ ├── execute_gold_sol_on_test_case.sh
│ │ │ ├── extract_pseudo_outputs_as_label.py
│ │ │ ├── get_output_frequency.py
│ │ │ ├── gpt4o_to_normal_pred_format.py
│ │ │ ├── merge_dp_predictions.py
│ │ │ ├── merge_dp_solutions.sh
│ │ │ ├── pp_critique_difficulty.py
│ │ │ ├── pp_eval_gpt4.py
│ │ │ ├── pp_eval_gpt4_general_combine.py
│ │ │ ├── pp_solution_gen_inputs.py
│ │ │ ├── pp_test_case.py
│ │ │ ├── pp_test_case_gen_inputs.py
│ │ │ ├── pp_test_case_gen_inputs_v2.0.py
│ │ │ ├── pp_test_case_gen_outputs.py
│ │ │ ├── pp_test_case_gen_public_outputs.py
│ │ │ ├── pp_test_case_gen_public_outputs_few_shot.py
│ │ │ ├── pp_test_case_gen_public_outputs_few_shot_verify.py
│ │ │ ├── pp_worsen_inputs.py
│ │ │ ├── prm/
│ │ │ │ ├── construct_process_rm_sample.py
│ │ │ │ ├── construct_process_rm_sample_fix.py
│ │ │ │ └── sample_steps.py
│ │ │ ├── pseudo_test_cases/
│ │ │ │ ├── 4o_pseudo_baseline.sh
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── clean_oss_mistral_data.py
│ │ │ │ ├── clean_xcode_4o_test_inputs_data.py
│ │ │ │ ├── collect_pseudo_outputs.py
│ │ │ │ ├── combine_gpt_raw_requests.py
│ │ │ │ ├── combine_pseudo_test_inputs.py
│ │ │ │ ├── construct_dpo_pairs.sh
│ │ │ │ ├── control_test_case_num_baseline.sh
│ │ │ │ ├── control_test_case_num_baseline_pipeline.sh
│ │ │ │ ├── extract_4o_combine_outputs_as_label.sh
│ │ │ │ ├── oss_combine_collect_pseudo_outputs.py
│ │ │ │ ├── oss_combine_collect_pseudo_outputs_mp.py
│ │ │ │ ├── oss_combine_collect_pseudo_outputs_mp_compress.py
│ │ │ │ ├── oss_combine_collect_pseudo_outputs_takes_extra.py
│ │ │ │ ├── oss_combine_prefix_fail_extract_pseudo_label.py
│ │ │ │ ├── oss_combine_run_extract_pseudo_label.py
│ │ │ │ ├── pipeline.sh
│ │ │ │ ├── pp_inputs_pick_problem_evol.py
│ │ │ │ ├── pp_inputs_pick_problem_oss.py
│ │ │ │ ├── prefix_fail_extract_pseudo_label.py
│ │ │ │ ├── prefix_fail_extract_pseudo_label_align_ts_num.py
│ │ │ │ ├── run_outputs_local.sh
│ │ │ │ ├── xcode_pipeline.sh
│ │ │ │ └── xcode_pp_test_case_gen.py
│ │ │ ├── re_verify_solutions.py
│ │ │ ├── rerank_code_rm.py
│ │ │ ├── solution_fail_extract.py
│ │ │ ├── solution_fail_extract_critique.py
│ │ │ ├── solution_fail_extract_pseudo_label.py
│ │ │ ├── solution_run_outputs.py
│ │ │ ├── solution_run_outputs_local.py
│ │ │ ├── solution_run_pseudo_outputs_local.py
│ │ │ ├── utils_execute.py
│ │ │ └── worsen_gpt4_combine.py
│ │ ├── collect_mbpp_test_cases_outputs_sc_v1.0.py
│ │ ├── eval_mbpp_judgement.py
│ │ ├── eval_mbpp_judgement_v2.py
│ │ ├── execute_mbpp_intermediate_res.py
│ │ ├── execute_mbpp_intermediate_res_mp.py
│ │ ├── inference/
│ │ │ └── vllm_dp_mul_node.sh
│ │ ├── math/
│ │ │ ├── analyze_sc.py
│ │ │ ├── deepseek_math_sample_steps.py
│ │ │ ├── estimate_state_value.py
│ │ │ ├── merge_dp_multi_solution.py
│ │ │ ├── merge_dp_predictions.py
│ │ │ ├── merge_dp_predictions.sh
│ │ │ ├── merge_incomplete_predictions.py
│ │ │ ├── merge_rm_dp_multi_solution.py
│ │ │ ├── rerank_w_orm.py
│ │ │ ├── rerank_w_prm.py
│ │ │ └── rerank_w_prm_combine.py
│ │ ├── math_scale/
│ │ │ ├── __init__.py
│ │ │ ├── analyze/
│ │ │ │ ├── compute_acc_by_id.py
│ │ │ │ ├── draw_sc.py
│ │ │ │ ├── extract_hard_questions.py
│ │ │ │ ├── freq2image.py
│ │ │ │ ├── get_output_frequency.py
│ │ │ │ ├── hard_change.sh
│ │ │ │ └── pipeline.sh
│ │ │ ├── concat_data.py
│ │ │ ├── construct_prefer_pair.py
│ │ │ ├── construct_prefer_pair_sc.py
│ │ │ ├── construct_prm_pair.sh
│ │ │ ├── construct_process_rm_sample_gd.py
│ │ │ ├── construct_process_rm_sample_sc.py
│ │ │ ├── exclude_unused_data.py
│ │ │ ├── extract_content_from_orig_format.py
│ │ │ ├── extract_mathscale_v2_box_answer.py
│ │ │ ├── extract_numina_math_box_answer.py
│ │ │ ├── fix_answer_extract_and_verify.py
│ │ │ ├── fix_answer_extract_and_verify_v2.py
│ │ │ ├── llama_numina_co_train/
│ │ │ │ ├── construct_prm_sc_pair.sh
│ │ │ │ └── pipeline.sh
│ │ │ ├── math_scale_offline_gpt_eval.py
│ │ │ ├── mathstral_mathscale_co_train/
│ │ │ │ ├── construct_prm_gd_pair.sh
│ │ │ │ ├── construct_prm_sc_pair.sh
│ │ │ │ └── pipeline.sh
│ │ │ ├── merge_dp_predictions.py
│ │ │ ├── merge_dp_predictions.sh
│ │ │ ├── merge_dp_seed_predictions.py
│ │ │ ├── merge_dp_seed_predictions_by_split.sh
│ │ │ ├── merge_math500_predictions.sh
│ │ │ ├── merge_mwpbench_predictions.sh
│ │ │ ├── merge_mwpbench_sympy_predictions.sh
│ │ │ ├── merge_qwen2_dp_math_dev_predictions_v0.0.sh
│ │ │ ├── merge_qwen2_dp_math_dev_predictions_v1.3.sh
│ │ │ ├── merge_qwen2_dp_predictions_v1.1.sh
│ │ │ ├── merge_qwen2_dp_predictions_v1.2.sh
│ │ │ ├── merge_qwen2_dp_predictions_v1.3.sh
│ │ │ ├── mscale/
│ │ │ │ ├── 4o_pipeline.sh
│ │ │ │ ├── construct_prm_sc_pair.sh
│ │ │ │ ├── pipeline.sh
│ │ │ │ └── rerank.sh
│ │ │ ├── pipeline.sh
│ │ │ ├── pp_gpt_inputs.py
│ │ │ ├── process_4o.py
│ │ │ ├── process_raw_4o.py
│ │ │ ├── process_raw_4o_labeling.py
│ │ │ ├── qwen25math_style_eval.py
│ │ │ ├── qwen25math_style_eval.sh
│ │ │ ├── qwen25math_style_eval_math.py
│ │ │ ├── qwen25math_style_eval_v2.0.py
│ │ │ ├── qwen25math_style_preprocess_pred_label.py
│ │ │ ├── reject_sampling_pipeline.sh
│ │ │ ├── rerank_w_prm_math.py
│ │ │ ├── rerank_w_prm_math_scale_save.py
│ │ │ ├── rerank_w_prm_math_scale_save_pair.py
│ │ │ ├── rerank_w_prm_math_scale_save_pair_margin.py
│ │ │ └── split_data.py
│ │ ├── mbpp/
│ │ │ ├── eval_human_eval_gpt_outputs.py
│ │ │ ├── eval_mbpp_gpt_outputs.py
│ │ │ ├── pp_eval_gpt4_human_eval.py
│ │ │ ├── pp_eval_gpt4_mbpp.py
│ │ │ ├── prepare_mbpp_test_cases_inputs_v1.0.py
│ │ │ ├── print_human_eval_mbpp_res.sh
│ │ │ ├── process_mbpp_test_cases_inputs.py
│ │ │ └── run_test_case_v1.0.py
│ │ ├── model_converts/
│ │ │ ├── llama_hf_mp_split.py
│ │ │ └── pad_model_embedding.py
│ │ ├── prepare_code_contests_decompose.py
│ │ ├── prepare_code_contests_decompose_verification.py
│ │ ├── prepare_code_contests_decompose_verification_v2.0.py
│ │ ├── prepare_code_contests_judgement.py
│ │ ├── prepare_mbpp_desc2code_inputs_v1.0.py
│ │ ├── prepare_mbpp_inputs_v1.0.py
│ │ ├── prepare_mbpp_intermediate_print_v1.0.py
│ │ ├── prepare_mbpp_predict_judgement.py
│ │ ├── prepare_mbpp_test_cases_inputs_v1.0.py
│ │ ├── prepare_mbpp_test_cases_outputs_v1.0.py
│ │ ├── prepare_mbpp_test_cases_outputs_v1.1.py
│ │ ├── split_data_according_to_id.py
│ │ └── verify_mbpp_test_cases.py
│ ├── service_api_caller_v1.py
│ ├── trainer_base_ds_mul_fs_tp.py
│ ├── trainer_ds_megatron_mul.py
│ ├── visualize/
│ │ ├── length_distribution.py
│ │ ├── reward_histogram.py
│ │ └── test_response_length.py
│ ├── vllm_inference.py
│ └── vllm_inference_dp.py
├── README.md
├── ReSA/
│ ├── README.md
│ ├── figures/
│ │ └── figure.py
│ ├── llm/
│ │ ├── __init__.py
│ │ ├── arch/
│ │ │ ├── __init__.py
│ │ │ ├── context_manager.py
│ │ │ └── model.py
│ │ ├── config.py
│ │ ├── data/
│ │ │ └── tokenizer.py
│ │ ├── eval.py
│ │ ├── eval_math.py
│ │ ├── kernel/
│ │ │ ├── __init__.py
│ │ │ ├── flash_attention_with_kv_cache.py
│ │ │ ├── flash_sparse_decoding.py
│ │ │ ├── rotary.py
│ │ │ ├── tilelang_attention_with_kv_cache.py
│ │ │ └── tilelang_sparse_decoding.py
│ │ └── utils/
│ │ └── math_utils.py
│ ├── math_data/
│ │ ├── aime24/
│ │ │ └── test.jsonl
│ │ ├── amc23/
│ │ │ └── test.jsonl
│ │ ├── aqua/
│ │ │ └── test.jsonl
│ │ ├── asdiv/
│ │ │ └── test.jsonl
│ │ ├── carp_en/
│ │ │ ├── demo.json
│ │ │ └── test.jsonl
│ │ ├── cmath/
│ │ │ └── test.jsonl
│ │ ├── cn_middle_school/
│ │ │ └── test.jsonl
│ │ ├── college_math/
│ │ │ └── test.jsonl
│ │ ├── eval_rm_maj_example/
│ │ │ └── math_cot_100.jsonl
│ │ ├── gaokao2023en/
│ │ │ └── test.jsonl
│ │ ├── gaokao2024_I/
│ │ │ └── test.jsonl
│ │ ├── gaokao2024_II/
│ │ │ └── test.jsonl
│ │ ├── gaokao2024_mix/
│ │ │ └── test.jsonl
│ │ ├── gaokao_math_cloze/
│ │ │ └── test.jsonl
│ │ ├── gaokao_math_qa/
│ │ │ └── test.jsonl
│ │ ├── gsm8k/
│ │ │ ├── test.jsonl
│ │ │ └── train.jsonl
│ │ ├── math/
│ │ │ ├── test.jsonl
│ │ │ └── train.jsonl
│ │ ├── mawps/
│ │ │ ├── addsub.jsonl
│ │ │ ├── multiarith.jsonl
│ │ │ ├── singleeq.jsonl
│ │ │ ├── singleop.jsonl
│ │ │ └── test.jsonl
│ │ ├── minerva_math/
│ │ │ ├── README.md
│ │ │ └── test.jsonl
│ │ ├── mmlu_stem/
│ │ │ └── test.jsonl
│ │ ├── olympiadbench/
│ │ │ ├── test.json
│ │ │ └── test.jsonl
│ │ ├── sat_math/
│ │ │ └── test.jsonl
│ │ ├── svamp/
│ │ │ └── test.jsonl
│ │ └── tabmwp/
│ │ └── test.jsonl
│ └── scripts/
│ ├── local_eval_math.sh
│ ├── math_eval_result.sh
│ ├── math_eval_result_length.py
│ ├── math_utils.py
│ └── setup_math_eval.sh
├── SECURITY.md
├── YOCO/
│ ├── README.md
│ ├── requirements.txt
│ ├── scripts/
│ │ ├── eval_needle.sh
│ │ ├── eval_task.sh
│ │ └── train.sh
│ └── yoco/
│ ├── __init__.py
│ ├── criterions/
│ │ ├── __init__.py
│ │ ├── harness_eval.py
│ │ ├── multi_needle.py
│ │ └── needle_haystack.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── decoder/
│ │ │ ├── __init__.py
│ │ │ ├── cross_attention.py
│ │ │ ├── feedforward_network.py
│ │ │ ├── gate_retention.py
│ │ │ ├── kernel/
│ │ │ │ ├── gate_recurrent.py
│ │ │ │ ├── rotary.py
│ │ │ │ └── swiglu.py
│ │ │ ├── model_parallel_init.py
│ │ │ ├── rms_norm.py
│ │ │ ├── sliding_window_attention.py
│ │ │ ├── transformer.py
│ │ │ └── yoco.py
│ │ ├── transformer.py
│ │ └── yoco.py
│ ├── tasks/
│ │ ├── __init__.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── basic_loader.py
│ │ │ ├── llama_tokenizer.py
│ │ │ ├── lm_loader.py
│ │ │ ├── tiktoken_tokenizer.py
│ │ │ └── utils.py
│ │ ├── gpt.py
│ │ ├── harness_eval.py
│ │ ├── harness_task.py
│ │ ├── mmlu_task.py
│ │ └── pseudo.py
│ ├── train.py
│ └── validate.py
├── adalm/
│ ├── README.md
│ ├── finetune/
│ │ ├── __init__.py
│ │ ├── run_classifier.py
│ │ ├── run_ner.py
│ │ ├── run_pico.py
│ │ ├── utils_for_glue.py
│ │ └── utils_ner.py
│ ├── incr_bpe/
│ │ ├── README.md
│ │ ├── subword_builder.py
│ │ ├── test_data/
│ │ │ ├── chem.txt
│ │ │ └── vocab.txt
│ │ ├── text_encoder.py
│ │ ├── tokenizer.py
│ │ └── vocab_extend.py
│ ├── requirements.txt
│ └── setup.py
├── beats/
│ ├── BEATs.py
│ ├── README.md
│ ├── Tokenizers.py
│ ├── backbone.py
│ ├── beats_README.md
│ ├── modules.py
│ └── quantizer.py
├── beit/
│ ├── .gitignore
│ ├── README.md
│ ├── dall_e/
│ │ ├── __init__.py
│ │ ├── decoder.py
│ │ ├── encoder.py
│ │ └── utils.py
│ ├── dataset_folder.py
│ ├── datasets.py
│ ├── engine_for_finetuning.py
│ ├── engine_for_pretraining.py
│ ├── get_started_for_image_classification.md
│ ├── masking_generator.py
│ ├── modeling_discrete_vae.py
│ ├── modeling_finetune.py
│ ├── modeling_pretrain.py
│ ├── optim_factory.py
│ ├── requirements.txt
│ ├── run_beit_pretraining.py
│ ├── run_class_finetuning.py
│ ├── run_linear_eval.py
│ ├── semantic_segmentation/
│ │ ├── README.md
│ │ ├── backbone/
│ │ │ └── beit.py
│ │ ├── configs/
│ │ │ ├── _base_/
│ │ │ │ ├── datasets/
│ │ │ │ │ ├── ade20k.py
│ │ │ │ │ └── ade20k_640x640.py
│ │ │ │ ├── default_runtime.py
│ │ │ │ ├── models/
│ │ │ │ │ └── upernet_beit.py
│ │ │ │ └── schedules/
│ │ │ │ ├── schedule_160k.py
│ │ │ │ └── schedule_320k.py
│ │ │ └── beit/
│ │ │ └── upernet/
│ │ │ ├── upernet_beit_base_12_512_slide_160k_ade20k_ms.py
│ │ │ ├── upernet_beit_base_12_512_slide_160k_ade20k_pt.py
│ │ │ ├── upernet_beit_base_12_512_slide_160k_ade20k_pt2ft.py
│ │ │ ├── upernet_beit_base_12_640_slide_160k_ade20k_ms.py
│ │ │ ├── upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py
│ │ │ ├── upernet_beit_large_24_512_slide_160k_ade20k_ms.py
│ │ │ ├── upernet_beit_large_24_512_slide_160k_ade20k_pt2ft.py
│ │ │ ├── upernet_beit_large_24_640_slide_160k_ade20k_ms.py
│ │ │ └── upernet_beit_large_24_640_slide_160k_ade20k_pt2ft.py
│ │ ├── mmcv_custom/
│ │ │ ├── __init__.py
│ │ │ ├── apex_runner/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── apex_iter_based_runner.py
│ │ │ │ ├── checkpoint.py
│ │ │ │ └── optimizer.py
│ │ │ ├── checkpoint.py
│ │ │ ├── layer_decay_optimizer_constructor.py
│ │ │ ├── resize_transform.py
│ │ │ └── train_api.py
│ │ └── tools/
│ │ ├── dist_test.sh
│ │ ├── dist_train.sh
│ │ ├── test.py
│ │ └── train.py
│ ├── transforms.py
│ └── utils.py
├── beit2/
│ ├── .gitignore
│ ├── PRETRAINING.md
│ ├── README.md
│ ├── TOKENIZER.md
│ ├── dataset_folder.py
│ ├── datasets.py
│ ├── engine_for_finetuning.py
│ ├── engine_for_pretraining.py
│ ├── engine_for_vqkd.py
│ ├── get_started_for_image_classification.md
│ ├── imagenet_a_r_indices.py
│ ├── masking_generator.py
│ ├── modeling_finetune.py
│ ├── modeling_pretrain.py
│ ├── modeling_vqkd.py
│ ├── norm_ema_quantizer.py
│ ├── optim_factory.py
│ ├── requirements.txt
│ ├── run_beitv2_pretraining.py
│ ├── run_class_finetuning.py
│ ├── run_vqkd_training.py
│ ├── semantic_segmentation/
│ │ ├── README.md
│ │ ├── backbone/
│ │ │ └── beit.py
│ │ ├── configs/
│ │ │ ├── _base_/
│ │ │ │ ├── datasets/
│ │ │ │ │ ├── ade20k.py
│ │ │ │ │ └── ade20k_640x640.py
│ │ │ │ ├── default_runtime.py
│ │ │ │ ├── models/
│ │ │ │ │ └── upernet_beit.py
│ │ │ │ └── schedules/
│ │ │ │ ├── schedule_160k.py
│ │ │ │ └── schedule_320k.py
│ │ │ └── beit/
│ │ │ └── upernet/
│ │ │ ├── upernet_beit_base_12_512_slide_160k_21ktoade20k.py
│ │ │ ├── upernet_beit_base_12_512_slide_160k_ade20k.py
│ │ │ ├── upernet_beit_large_24_512_slide_160k_21ktoade20k.py
│ │ │ └── upernet_beit_large_24_512_slide_160k_ade20k.py
│ │ ├── mmcv_custom/
│ │ │ ├── __init__.py
│ │ │ ├── apex_runner/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── apex_iter_based_runner.py
│ │ │ │ ├── checkpoint.py
│ │ │ │ └── optimizer.py
│ │ │ ├── checkpoint.py
│ │ │ ├── layer_decay_optimizer_constructor.py
│ │ │ ├── resize_transform.py
│ │ │ └── train_api.py
│ │ └── tools/
│ │ ├── dist_test.sh
│ │ ├── dist_train.sh
│ │ ├── test.py
│ │ └── train.py
│ ├── test_get_code.py
│ ├── transforms.py
│ ├── utils.py
│ ├── visualize_attention.py
│ └── vqkd_teacher/
│ ├── __init__.py
│ ├── clip/
│ │ ├── __init__.py
│ │ ├── clip.py
│ │ ├── model.py
│ │ └── simple_tokenizer.py
│ └── dino.py
├── beit3/
│ ├── README.md
│ ├── datasets.py
│ ├── engine_for_finetuning.py
│ ├── get_started/
│ │ ├── get_started_for_captioning.md
│ │ ├── get_started_for_image_classification.md
│ │ ├── get_started_for_nlvr2.md
│ │ ├── get_started_for_retrieval.md
│ │ └── get_started_for_vqav2.md
│ ├── glossary.py
│ ├── modeling_finetune.py
│ ├── modeling_utils.py
│ ├── optim_factory.py
│ ├── randaug.py
│ ├── requirements.txt
│ ├── run_beit3_finetuning.py
│ └── utils.py
├── bitnet/
│ └── README.md
├── decoding/
│ ├── GAD/
│ │ ├── block_plugins/
│ │ │ ├── __init__.py
│ │ │ ├── criterions/
│ │ │ │ ├── __init__.py
│ │ │ │ └── glat_loss.py
│ │ │ ├── models/
│ │ │ │ ├── BlockNAT.py
│ │ │ │ └── __init__.py
│ │ │ └── tasks/
│ │ │ ├── __init__.py
│ │ │ └── translation_lev_modified.py
│ │ ├── data/
│ │ │ ├── test.de.compound.ref
│ │ │ ├── wmt14.en-de/
│ │ │ │ ├── bpe.32000
│ │ │ │ ├── dict.de.txt
│ │ │ │ └── dict.en.txt
│ │ │ └── wmt16.en-ro/
│ │ │ ├── dict.en.txt
│ │ │ ├── dict.ro.txt
│ │ │ └── get_data.sh
│ │ ├── fairseq/
│ │ │ ├── __init__.py
│ │ │ ├── benchmark/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dummy_lm.py
│ │ │ │ ├── dummy_masked_lm.py
│ │ │ │ ├── dummy_model.py
│ │ │ │ └── dummy_mt.py
│ │ │ ├── binarizer.py
│ │ │ ├── checkpoint_utils.py
│ │ │ ├── clib/
│ │ │ │ ├── cuda/
│ │ │ │ │ ├── ngram_repeat_block_cuda.cpp
│ │ │ │ │ └── ngram_repeat_block_cuda_kernel.cu
│ │ │ │ ├── libbleu/
│ │ │ │ │ ├── libbleu.cpp
│ │ │ │ │ └── module.cpp
│ │ │ │ ├── libnat/
│ │ │ │ │ └── edit_dist.cpp
│ │ │ │ └── libnat_cuda/
│ │ │ │ ├── binding.cpp
│ │ │ │ ├── edit_dist.cu
│ │ │ │ └── edit_dist.h
│ │ │ ├── config/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── config.yaml
│ │ │ │ └── model/
│ │ │ │ ├── transformer_lm/
│ │ │ │ │ ├── transformer_lm_baevski_gbw.yaml
│ │ │ │ │ ├── transformer_lm_baevski_wiki103.yaml
│ │ │ │ │ ├── transformer_lm_big.yaml
│ │ │ │ │ ├── transformer_lm_gbw.yaml
│ │ │ │ │ ├── transformer_lm_gpt.yaml
│ │ │ │ │ ├── transformer_lm_gpt2_big.yaml
│ │ │ │ │ ├── transformer_lm_gpt2_medium.yaml
│ │ │ │ │ ├── transformer_lm_gpt2_small.yaml
│ │ │ │ │ └── transformer_lm_wiki103.yaml
│ │ │ │ ├── wav2vec/
│ │ │ │ │ └── vq_wav2vec_gumbel.yaml
│ │ │ │ └── wav2vec2/
│ │ │ │ ├── wav2vec2_base.yaml
│ │ │ │ └── wav2vec2_large.yaml
│ │ │ ├── criterions/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adaptive_loss.py
│ │ │ │ ├── composite_loss.py
│ │ │ │ ├── cross_entropy.py
│ │ │ │ ├── ctc.py
│ │ │ │ ├── fairseq_criterion.py
│ │ │ │ ├── label_smoothed_cross_entropy.py
│ │ │ │ ├── label_smoothed_cross_entropy_with_alignment.py
│ │ │ │ ├── legacy_masked_lm.py
│ │ │ │ ├── masked_lm.py
│ │ │ │ ├── model_criterion.py
│ │ │ │ ├── nat_loss.py
│ │ │ │ ├── sentence_prediction.py
│ │ │ │ ├── sentence_ranking.py
│ │ │ │ └── wav2vec_criterion.py
│ │ │ ├── data/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── add_target_dataset.py
│ │ │ │ ├── append_token_dataset.py
│ │ │ │ ├── audio/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── audio_utils.py
│ │ │ │ │ ├── feature_transforms/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── global_cmvn.py
│ │ │ │ │ │ ├── specaugment.py
│ │ │ │ │ │ └── utterance_cmvn.py
│ │ │ │ │ ├── raw_audio_dataset.py
│ │ │ │ │ └── speech_to_text_dataset.py
│ │ │ │ ├── backtranslation_dataset.py
│ │ │ │ ├── base_wrapper_dataset.py
│ │ │ │ ├── bucket_pad_length_dataset.py
│ │ │ │ ├── colorize_dataset.py
│ │ │ │ ├── concat_dataset.py
│ │ │ │ ├── concat_sentences_dataset.py
│ │ │ │ ├── data_utils.py
│ │ │ │ ├── data_utils_fast.cpp
│ │ │ │ ├── data_utils_fast.pyx
│ │ │ │ ├── denoising_dataset.py
│ │ │ │ ├── dictionary.py
│ │ │ │ ├── encoders/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── byte_bpe.py
│ │ │ │ │ ├── byte_utils.py
│ │ │ │ │ ├── bytes.py
│ │ │ │ │ ├── characters.py
│ │ │ │ │ ├── fastbpe.py
│ │ │ │ │ ├── gpt2_bpe.py
│ │ │ │ │ ├── gpt2_bpe_utils.py
│ │ │ │ │ ├── hf_bert_bpe.py
│ │ │ │ │ ├── hf_byte_bpe.py
│ │ │ │ │ ├── moses_tokenizer.py
│ │ │ │ │ ├── nltk_tokenizer.py
│ │ │ │ │ ├── sentencepiece_bpe.py
│ │ │ │ │ ├── space_tokenizer.py
│ │ │ │ │ ├── subword_nmt_bpe.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── fairseq_dataset.py
│ │ │ │ ├── fasta_dataset.py
│ │ │ │ ├── id_dataset.py
│ │ │ │ ├── indexed_dataset.py
│ │ │ │ ├── iterators.py
│ │ │ │ ├── language_pair_dataset.py
│ │ │ │ ├── legacy/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── block_pair_dataset.py
│ │ │ │ │ ├── masked_lm_dataset.py
│ │ │ │ │ └── masked_lm_dictionary.py
│ │ │ │ ├── list_dataset.py
│ │ │ │ ├── lm_context_window_dataset.py
│ │ │ │ ├── lru_cache_dataset.py
│ │ │ │ ├── mask_tokens_dataset.py
│ │ │ │ ├── monolingual_dataset.py
│ │ │ │ ├── multi_corpus_dataset.py
│ │ │ │ ├── multi_corpus_sampled_dataset.py
│ │ │ │ ├── multilingual/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── multilingual_data_manager.py
│ │ │ │ │ ├── multilingual_utils.py
│ │ │ │ │ ├── sampled_multi_dataset.py
│ │ │ │ │ ├── sampled_multi_epoch_dataset.py
│ │ │ │ │ └── sampling_method.py
│ │ │ │ ├── nested_dictionary_dataset.py
│ │ │ │ ├── noising.py
│ │ │ │ ├── num_samples_dataset.py
│ │ │ │ ├── numel_dataset.py
│ │ │ │ ├── offset_tokens_dataset.py
│ │ │ │ ├── pad_dataset.py
│ │ │ │ ├── plasma_utils.py
│ │ │ │ ├── prepend_dataset.py
│ │ │ │ ├── prepend_token_dataset.py
│ │ │ │ ├── raw_label_dataset.py
│ │ │ │ ├── replace_dataset.py
│ │ │ │ ├── resampling_dataset.py
│ │ │ │ ├── roll_dataset.py
│ │ │ │ ├── round_robin_zip_datasets.py
│ │ │ │ ├── shorten_dataset.py
│ │ │ │ ├── sort_dataset.py
│ │ │ │ ├── strip_token_dataset.py
│ │ │ │ ├── subsample_dataset.py
│ │ │ │ ├── token_block_dataset.py
│ │ │ │ ├── token_block_utils_fast.cpp
│ │ │ │ ├── token_block_utils_fast.pyx
│ │ │ │ ├── transform_eos_dataset.py
│ │ │ │ └── transform_eos_lang_pair_dataset.py
│ │ │ ├── dataclass/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── configs.py
│ │ │ │ ├── constants.py
│ │ │ │ ├── initialize.py
│ │ │ │ └── utils.py
│ │ │ ├── distributed/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── distributed_timeout_wrapper.py
│ │ │ │ ├── legacy_distributed_data_parallel.py
│ │ │ │ ├── module_proxy_wrapper.py
│ │ │ │ ├── tpu_distributed_data_parallel.py
│ │ │ │ └── utils.py
│ │ │ ├── file_io.py
│ │ │ ├── file_utils.py
│ │ │ ├── hub_utils.py
│ │ │ ├── incremental_decoding_utils.py
│ │ │ ├── iterative_refinement_generator.py
│ │ │ ├── logging/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── meters.py
│ │ │ │ ├── metrics.py
│ │ │ │ └── progress_bar.py
│ │ │ ├── model_parallel/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── criterions/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── vocab_parallel_cross_entropy.py
│ │ │ │ ├── megatron_trainer.py
│ │ │ │ ├── models/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── pipeline_parallel_transformer/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── layers.py
│ │ │ │ │ │ └── model.py
│ │ │ │ │ ├── roberta/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── model.py
│ │ │ │ │ ├── transformer.py
│ │ │ │ │ └── transformer_lm.py
│ │ │ │ └── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── multihead_attention.py
│ │ │ │ └── transformer_layer.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bart/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── hub_interface.py
│ │ │ │ │ └── model.py
│ │ │ │ ├── composite_encoder.py
│ │ │ │ ├── distributed_fairseq_model.py
│ │ │ │ ├── fairseq_decoder.py
│ │ │ │ ├── fairseq_encoder.py
│ │ │ │ ├── fairseq_incremental_decoder.py
│ │ │ │ ├── fairseq_model.py
│ │ │ │ ├── fconv.py
│ │ │ │ ├── fconv_lm.py
│ │ │ │ ├── fconv_self_att.py
│ │ │ │ ├── huggingface/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── hf_gpt2.py
│ │ │ │ ├── lightconv.py
│ │ │ │ ├── lightconv_lm.py
│ │ │ │ ├── lstm.py
│ │ │ │ ├── lstm_lm.py
│ │ │ │ ├── masked_lm.py
│ │ │ │ ├── model_utils.py
│ │ │ │ ├── multilingual_transformer.py
│ │ │ │ ├── nat/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── fairseq_nat_model.py
│ │ │ │ │ ├── nonautoregressive_ensembles.py
│ │ │ │ │ └── nonautoregressive_transformer.py
│ │ │ │ ├── roberta/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── alignment_utils.py
│ │ │ │ │ ├── hub_interface.py
│ │ │ │ │ ├── model.py
│ │ │ │ │ ├── model_camembert.py
│ │ │ │ │ ├── model_gottbert.py
│ │ │ │ │ └── model_xlmr.py
│ │ │ │ ├── speech_to_text/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── berard.py
│ │ │ │ │ ├── convtransformer.py
│ │ │ │ │ └── s2t_transformer.py
│ │ │ │ ├── transformer.py
│ │ │ │ ├── transformer_align.py
│ │ │ │ ├── transformer_from_pretrained_xlm.py
│ │ │ │ ├── transformer_lm.py
│ │ │ │ └── wav2vec/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── wav2vec.py
│ │ │ │ ├── wav2vec2.py
│ │ │ │ └── wav2vec2_asr.py
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adaptive_input.py
│ │ │ │ ├── adaptive_softmax.py
│ │ │ │ ├── beamable_mm.py
│ │ │ │ ├── character_token_embedder.py
│ │ │ │ ├── checkpoint_activations.py
│ │ │ │ ├── conv_tbc.py
│ │ │ │ ├── cross_entropy.py
│ │ │ │ ├── cuda_utils.cu
│ │ │ │ ├── downsampled_multihead_attention.py
│ │ │ │ ├── dynamic_convolution.py
│ │ │ │ ├── dynamic_crf_layer.py
│ │ │ │ ├── dynamicconv_layer/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cuda_function_gen.py
│ │ │ │ │ ├── dynamicconv_cuda.cpp
│ │ │ │ │ ├── dynamicconv_cuda.cuh
│ │ │ │ │ ├── dynamicconv_cuda_kernel.cu
│ │ │ │ │ ├── dynamicconv_layer.py
│ │ │ │ │ ├── dynamiconv_cpu.cpp
│ │ │ │ │ └── setup.py
│ │ │ │ ├── fairseq_dropout.py
│ │ │ │ ├── fp32_group_norm.py
│ │ │ │ ├── gelu.py
│ │ │ │ ├── grad_multiply.py
│ │ │ │ ├── gumbel_vector_quantizer.py
│ │ │ │ ├── kmeans_vector_quantizer.py
│ │ │ │ ├── layer_drop.py
│ │ │ │ ├── layer_norm.py
│ │ │ │ ├── learned_positional_embedding.py
│ │ │ │ ├── lightconv_layer/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cuda_function_gen.py
│ │ │ │ │ ├── lightconv_cuda.cpp
│ │ │ │ │ ├── lightconv_cuda.cuh
│ │ │ │ │ ├── lightconv_cuda_kernel.cu
│ │ │ │ │ ├── lightconv_layer.py
│ │ │ │ │ └── setup.py
│ │ │ │ ├── lightweight_convolution.py
│ │ │ │ ├── linearized_convolution.py
│ │ │ │ ├── multihead_attention.py
│ │ │ │ ├── positional_embedding.py
│ │ │ │ ├── quant_noise.py
│ │ │ │ ├── quantization/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── pq/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── em.py
│ │ │ │ │ │ ├── modules/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── qconv.py
│ │ │ │ │ │ │ ├── qemb.py
│ │ │ │ │ │ │ └── qlinear.py
│ │ │ │ │ │ ├── pq.py
│ │ │ │ │ │ └── utils.py
│ │ │ │ │ ├── quantization_options.py
│ │ │ │ │ └── scalar/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── modules/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── qact.py
│ │ │ │ │ │ ├── qconv.py
│ │ │ │ │ │ ├── qemb.py
│ │ │ │ │ │ └── qlinear.py
│ │ │ │ │ ├── ops.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── same_pad.py
│ │ │ │ ├── scalar_bias.py
│ │ │ │ ├── sinusoidal_positional_embedding.py
│ │ │ │ ├── sparse_multihead_attention.py
│ │ │ │ ├── sparse_transformer_sentence_encoder.py
│ │ │ │ ├── sparse_transformer_sentence_encoder_layer.py
│ │ │ │ ├── transformer_layer.py
│ │ │ │ ├── transformer_sentence_encoder.py
│ │ │ │ ├── transformer_sentence_encoder_layer.py
│ │ │ │ ├── transpose_last.py
│ │ │ │ ├── unfold.py
│ │ │ │ └── vggblock.py
│ │ │ ├── nan_detector.py
│ │ │ ├── ngram_repeat_block.py
│ │ │ ├── optim/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adadelta.py
│ │ │ │ ├── adafactor.py
│ │ │ │ ├── adagrad.py
│ │ │ │ ├── adam.py
│ │ │ │ ├── adamax.py
│ │ │ │ ├── bmuf.py
│ │ │ │ ├── composite.py
│ │ │ │ ├── cpu_adam.py
│ │ │ │ ├── dynamic_loss_scaler.py
│ │ │ │ ├── fairseq_optimizer.py
│ │ │ │ ├── fp16_optimizer.py
│ │ │ │ ├── fused_adam.py
│ │ │ │ ├── fused_lamb.py
│ │ │ │ ├── lr_scheduler/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cosine_lr_scheduler.py
│ │ │ │ │ ├── fairseq_lr_scheduler.py
│ │ │ │ │ ├── fixed_schedule.py
│ │ │ │ │ ├── inverse_square_root_schedule.py
│ │ │ │ │ ├── manual_lr_scheduler.py
│ │ │ │ │ ├── pass_through.py
│ │ │ │ │ ├── polynomial_decay_schedule.py
│ │ │ │ │ ├── reduce_lr_on_plateau.py
│ │ │ │ │ ├── tri_stage_lr_scheduler.py
│ │ │ │ │ └── triangular_lr_scheduler.py
│ │ │ │ ├── nag.py
│ │ │ │ ├── sgd.py
│ │ │ │ └── shard.py
│ │ │ ├── options.py
│ │ │ ├── pdb.py
│ │ │ ├── quantization_utils.py
│ │ │ ├── registry.py
│ │ │ ├── scoring/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bleu.py
│ │ │ │ ├── chrf.py
│ │ │ │ ├── tokenizer.py
│ │ │ │ └── wer.py
│ │ │ ├── search.py
│ │ │ ├── sequence_generator.py
│ │ │ ├── sequence_scorer.py
│ │ │ ├── tasks/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── audio_pretraining.py
│ │ │ │ ├── cross_lingual_lm.py
│ │ │ │ ├── denoising.py
│ │ │ │ ├── fairseq_task.py
│ │ │ │ ├── language_modeling.py
│ │ │ │ ├── legacy_masked_lm.py
│ │ │ │ ├── masked_lm.py
│ │ │ │ ├── multilingual_denoising.py
│ │ │ │ ├── multilingual_masked_lm.py
│ │ │ │ ├── multilingual_translation.py
│ │ │ │ ├── semisupervised_translation.py
│ │ │ │ ├── sentence_prediction.py
│ │ │ │ ├── sentence_ranking.py
│ │ │ │ ├── speech_to_text.py
│ │ │ │ ├── translation.py
│ │ │ │ ├── translation_from_pretrained_bart.py
│ │ │ │ ├── translation_from_pretrained_xlm.py
│ │ │ │ ├── translation_lev.py
│ │ │ │ └── translation_multi_simple_epoch.py
│ │ │ ├── token_generation_constraints.py
│ │ │ ├── tokenizer.py
│ │ │ ├── trainer.py
│ │ │ ├── utils.py
│ │ │ ├── version.py
│ │ │ └── version.txt
│ │ ├── fairseq_cli/
│ │ │ ├── __init__.py
│ │ │ ├── eval_lm.py
│ │ │ ├── generate.py
│ │ │ ├── hydra_train.py
│ │ │ ├── interactive.py
│ │ │ ├── preprocess.py
│ │ │ ├── score.py
│ │ │ ├── train.py
│ │ │ └── validate.py
│ │ ├── hubconf.py
│ │ ├── inference.py
│ │ ├── inference.sh
│ │ ├── inference_paper.py
│ │ ├── pyproject.toml
│ │ ├── readme.md
│ │ ├── ref.sh
│ │ ├── scripts/
│ │ │ ├── __init__.py
│ │ │ ├── average_checkpoints.py
│ │ │ ├── build_sym_alignment.py
│ │ │ ├── compare_namespaces.py
│ │ │ ├── compound_split_bleu.sh
│ │ │ ├── constraints/
│ │ │ │ ├── extract.py
│ │ │ │ └── validate.py
│ │ │ ├── convert_dictionary.lua
│ │ │ ├── convert_model.lua
│ │ │ ├── count_docs.py
│ │ │ ├── read_binarized.py
│ │ │ ├── rm_pt.py
│ │ │ ├── sacrebleu.sh
│ │ │ ├── shard_docs.py
│ │ │ ├── split_train_valid_docs.py
│ │ │ ├── spm_decode.py
│ │ │ ├── spm_encode.py
│ │ │ └── spm_train.py
│ │ ├── setup.py
│ │ ├── train.py
│ │ └── train.sh
│ ├── IAD/
│ │ ├── README.md
│ │ ├── fairseq/
│ │ │ ├── .github/
│ │ │ │ ├── ISSUE_TEMPLATE/
│ │ │ │ │ ├── bug_report.md
│ │ │ │ │ ├── documentation.md
│ │ │ │ │ ├── feature_request.md
│ │ │ │ │ └── how-to-question.md
│ │ │ │ ├── ISSUE_TEMPLATE.md
│ │ │ │ ├── PULL_REQUEST_TEMPLATE.md
│ │ │ │ ├── stale.yml
│ │ │ │ └── workflows/
│ │ │ │ ├── build.yml
│ │ │ │ └── build_wheels.yml
│ │ │ ├── .gitignore
│ │ │ ├── .gitmodules
│ │ │ ├── CODE_OF_CONDUCT.md
│ │ │ ├── CONTRIBUTING.md
│ │ │ ├── LICENSE
│ │ │ ├── README.md
│ │ │ ├── README_FAIRSEQ.md
│ │ │ ├── docs/
│ │ │ │ ├── Makefile
│ │ │ │ ├── _static/
│ │ │ │ │ └── theme_overrides.css
│ │ │ │ ├── command_line_tools.rst
│ │ │ │ ├── conf.py
│ │ │ │ ├── criterions.rst
│ │ │ │ ├── data.rst
│ │ │ │ ├── docutils.conf
│ │ │ │ ├── getting_started.rst
│ │ │ │ ├── hydra_integration.md
│ │ │ │ ├── index.rst
│ │ │ │ ├── lr_scheduler.rst
│ │ │ │ ├── make.bat
│ │ │ │ ├── models.rst
│ │ │ │ ├── modules.rst
│ │ │ │ ├── optim.rst
│ │ │ │ ├── overview.rst
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── tasks.rst
│ │ │ │ ├── tutorial_classifying_names.rst
│ │ │ │ └── tutorial_simple_lstm.rst
│ │ │ ├── examples/
│ │ │ │ ├── .gitignore
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adaptive_span/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── adagrad_with_grad_clip.py
│ │ │ │ │ ├── adaptive_span_attention.py
│ │ │ │ │ ├── adaptive_span_loss.py
│ │ │ │ │ ├── adaptive_span_model.py
│ │ │ │ │ ├── adaptive_span_model_wrapper.py
│ │ │ │ │ └── truncated_bptt_lm_task.py
│ │ │ │ ├── backtranslation/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── deduplicate_lines.py
│ │ │ │ │ ├── extract_bt_data.py
│ │ │ │ │ ├── prepare-de-monolingual.sh
│ │ │ │ │ ├── prepare-wmt18en2de.sh
│ │ │ │ │ ├── sacrebleu.sh
│ │ │ │ │ └── tokenized_bleu.sh
│ │ │ │ ├── bart/
│ │ │ │ │ ├── README.glue.md
│ │ │ │ │ ├── README.md
│ │ │ │ │ └── README.summarization.md
│ │ │ │ ├── byte_level_bpe/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── get_bitext.py
│ │ │ │ │ ├── get_data.sh
│ │ │ │ │ └── gru_transformer.py
│ │ │ │ ├── camembert/
│ │ │ │ │ └── README.md
│ │ │ │ ├── constrained_decoding/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── normalize.py
│ │ │ │ │ └── tok.py
│ │ │ │ ├── conv_seq2seq/
│ │ │ │ │ └── README.md
│ │ │ │ ├── criss/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── download_and_preprocess_flores_test.sh
│ │ │ │ │ ├── download_and_preprocess_tatoeba.sh
│ │ │ │ │ ├── mining/
│ │ │ │ │ │ ├── mine.py
│ │ │ │ │ │ └── mine_example.sh
│ │ │ │ │ ├── save_encoder.py
│ │ │ │ │ ├── sentence_retrieval/
│ │ │ │ │ │ ├── encoder_analysis.py
│ │ │ │ │ │ └── sentence_retrieval_tatoeba.sh
│ │ │ │ │ └── unsupervised_mt/
│ │ │ │ │ └── eval.sh
│ │ │ │ ├── cross_lingual_language_model/
│ │ │ │ │ └── README.md
│ │ │ │ ├── fast_noisy_channel/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── noisy_channel_beam_search.py
│ │ │ │ │ ├── noisy_channel_sequence_generator.py
│ │ │ │ │ └── noisy_channel_translation.py
│ │ │ │ ├── gottbert/
│ │ │ │ │ └── README.md
│ │ │ │ ├── joint_alignment_translation/
│ │ │ │ │ ├── README.md
│ │ │ │ │ └── prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh
│ │ │ │ ├── language_model/
│ │ │ │ │ ├── README.adaptive_inputs.md
│ │ │ │ │ ├── README.conv.md
│ │ │ │ │ ├── README.md
│ │ │ │ │ └── prepare-wikitext-103.sh
│ │ │ │ ├── latent_depth/
│ │ │ │ │ ├── README.md
│ │ │ │ │ └── latent_depth_src/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── loss/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── latent_depth.py
│ │ │ │ │ ├── models/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── latent_multilingual_transformer.py
│ │ │ │ │ │ └── latent_transformer.py
│ │ │ │ │ ├── modules/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── latent_layers.py
│ │ │ │ │ └── multilingual_translation_latent_depth.py
│ │ │ │ ├── layerdrop/
│ │ │ │ │ └── README.md
│ │ │ │ ├── linformer/
│ │ │ │ │ ├── README.md
│ │ │ │ │ └── linformer_src/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── models/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── linformer_roberta.py
│ │ │ │ │ └── modules/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── linformer_sentence_encoder.py
│ │ │ │ │ ├── linformer_sentence_encoder_layer.py
│ │ │ │ │ └── multihead_linear_attention.py
│ │ │ │ ├── m2m_100/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── install_dependecies.sh
│ │ │ │ │ ├── process_data/
│ │ │ │ │ │ ├── clean_histogram.py
│ │ │ │ │ │ ├── dedup_data.py
│ │ │ │ │ │ └── remove_too_much_punc.py
│ │ │ │ │ ├── tok.sh
│ │ │ │ │ └── tokenizers/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── seg_ja.sh
│ │ │ │ │ ├── seg_ko.sh
│ │ │ │ │ ├── thirdparty/
│ │ │ │ │ │ └── .gitignore
│ │ │ │ │ ├── tokenize_indic.py
│ │ │ │ │ ├── tokenize_thai.py
│ │ │ │ │ ├── tokenize_zh.py
│ │ │ │ │ └── tokenizer_ar.sh
│ │ │ │ ├── mbart/
│ │ │ │ │ └── README.md
│ │ │ │ ├── megatron_11b/
│ │ │ │ │ ├── README.md
│ │ │ │ │ └── detok.py
│ │ │ │ ├── multilingual/
│ │ │ │ │ ├── ML50_langs.txt
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── data_scripts/
│ │ │ │ │ │ ├── README.md
│ │ │ │ │ │ ├── binarize.py
│ │ │ │ │ │ ├── check_iswlt_test_data.py
│ │ │ │ │ │ ├── check_self_overlaps.py
│ │ │ │ │ │ ├── check_valid_test_overlaps.py
│ │ │ │ │ │ ├── dedup_all.py
│ │ │ │ │ │ ├── download_ML50_v1.sh
│ │ │ │ │ │ ├── download_af_xh.sh
│ │ │ │ │ │ ├── download_flores_data.sh
│ │ │ │ │ │ ├── download_iitb.sh
│ │ │ │ │ │ ├── download_iwslt_and_extract.sh
│ │ │ │ │ │ ├── download_lotus.sh
│ │ │ │ │ │ ├── download_ted_and_extract.py
│ │ │ │ │ │ ├── download_wat19_my.sh
│ │ │ │ │ │ ├── download_wmt19_and_before.py
│ │ │ │ │ │ ├── download_wmt20.sh
│ │ │ │ │ │ ├── preprocess_ML50_v1.sh
│ │ │ │ │ │ ├── remove_valid_test_in_train.py
│ │ │ │ │ │ ├── requirement.txt
│ │ │ │ │ │ └── utils/
│ │ │ │ │ │ ├── dedup.py
│ │ │ │ │ │ ├── fasttext_multi_filter.py
│ │ │ │ │ │ └── strip_sgm.sh
│ │ │ │ │ ├── finetune_multilingual_model.sh
│ │ │ │ │ ├── multilingual_fairseq_gen.sh
│ │ │ │ │ └── train_multilingual_model.sh
│ │ │ │ ├── noisychannel/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── rerank.py
│ │ │ │ │ ├── rerank_generate.py
│ │ │ │ │ ├── rerank_options.py
│ │ │ │ │ ├── rerank_score_bw.py
│ │ │ │ │ ├── rerank_score_lm.py
│ │ │ │ │ ├── rerank_tune.py
│ │ │ │ │ └── rerank_utils.py
│ │ │ │ ├── nonautoregressive_translation/
│ │ │ │ │ ├── README.md
│ │ │ │ │ └── scripts.md
│ │ │ │ ├── paraphraser/
│ │ │ │ │ ├── README.md
│ │ │ │ │ └── paraphrase.py
│ │ │ │ ├── pay_less_attention_paper/
│ │ │ │ │ └── README.md
│ │ │ │ ├── pointer_generator/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── README.xsum.md
│ │ │ │ │ ├── pointer_generator_src/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── transformer_pg.py
│ │ │ │ │ ├── postprocess.py
│ │ │ │ │ └── preprocess.py
│ │ │ │ ├── quant_noise/
│ │ │ │ │ ├── README.md
│ │ │ │ │ └── transformer_quantization_config.yaml
│ │ │ │ ├── roberta/
│ │ │ │ │ ├── README.custom_classification.md
│ │ │ │ │ ├── README.glue.md
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── README.pretraining.md
│ │ │ │ │ ├── README.race.md
│ │ │ │ │ ├── commonsense_qa/
│ │ │ │ │ │ ├── README.md
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── commonsense_qa_task.py
│ │ │ │ │ │ └── download_cqa_data.sh
│ │ │ │ │ ├── multiprocessing_bpe_encoder.py
│ │ │ │ │ ├── preprocess_GLUE_tasks.sh
│ │ │ │ │ ├── preprocess_RACE.py
│ │ │ │ │ ├── preprocess_RACE.sh
│ │ │ │ │ └── wsc/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── wsc_criterion.py
│ │ │ │ │ ├── wsc_task.py
│ │ │ │ │ └── wsc_utils.py
│ │ │ │ ├── rxf/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── rxf_src/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── label_smoothed_cross_entropy_r3f.py
│ │ │ │ │ └── sentence_prediction_r3f.py
│ │ │ │ ├── scaling_nmt/
│ │ │ │ │ └── README.md
│ │ │ │ ├── simultaneous_translation/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── criterions/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── label_smoothed_cross_entropy_latency_augmented.py
│ │ │ │ │ ├── docs/
│ │ │ │ │ │ ├── baseline.md
│ │ │ │ │ │ └── evaluation.md
│ │ │ │ │ ├── eval/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── agents/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── agent.py
│ │ │ │ │ │ │ ├── simul_trans_agent.py
│ │ │ │ │ │ │ ├── simul_trans_text_agent.py
│ │ │ │ │ │ │ └── word_splitter.py
│ │ │ │ │ │ ├── client.py
│ │ │ │ │ │ ├── eval_latency.py
│ │ │ │ │ │ ├── evaluate.py
│ │ │ │ │ │ ├── scorers/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── scorer.py
│ │ │ │ │ │ │ └── text_scorer.py
│ │ │ │ │ │ └── server.py
│ │ │ │ │ ├── models/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── transformer_monotonic_attention.py
│ │ │ │ │ ├── modules/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── monotonic_multihead_attention.py
│ │ │ │ │ │ └── monotonic_transformer_layer.py
│ │ │ │ │ └── utils/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── functions.py
│ │ │ │ │ └── latency.py
│ │ │ │ ├── speech_recognition/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── criterions/
│ │ │ │ │ │ ├── ASG_loss.py
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── cross_entropy_acc.py
│ │ │ │ │ ├── data/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── asr_dataset.py
│ │ │ │ │ │ ├── collaters.py
│ │ │ │ │ │ ├── data_utils.py
│ │ │ │ │ │ └── replabels.py
│ │ │ │ │ ├── datasets/
│ │ │ │ │ │ ├── asr_prep_json.py
│ │ │ │ │ │ └── prepare-librispeech.sh
│ │ │ │ │ ├── infer.py
│ │ │ │ │ ├── models/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── vggtransformer.py
│ │ │ │ │ │ └── w2l_conv_glu_enc.py
│ │ │ │ │ ├── tasks/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── speech_recognition.py
│ │ │ │ │ ├── utils/
│ │ │ │ │ │ └── wer_utils.py
│ │ │ │ │ └── w2l_decoder.py
│ │ │ │ ├── speech_to_text/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── data_utils.py
│ │ │ │ │ ├── docs/
│ │ │ │ │ │ ├── covost_example.md
│ │ │ │ │ │ ├── librispeech_example.md
│ │ │ │ │ │ └── mustc_example.md
│ │ │ │ │ ├── prep_covost_data.py
│ │ │ │ │ ├── prep_librispeech_data.py
│ │ │ │ │ └── prep_mustc_data.py
│ │ │ │ ├── stories/
│ │ │ │ │ └── README.md
│ │ │ │ ├── translation/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── prepare-iwslt14.sh
│ │ │ │ │ ├── prepare-iwslt17-multilingual.sh
│ │ │ │ │ ├── prepare-wmt14en2de.sh
│ │ │ │ │ └── prepare-wmt14en2fr.sh
│ │ │ │ ├── translation_moe/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── score.py
│ │ │ │ │ └── translation_moe_src/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── logsumexp_moe.py
│ │ │ │ │ ├── mean_pool_gating_network.py
│ │ │ │ │ └── translation_moe.py
│ │ │ │ ├── truncated_bptt/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── transformer_xl_model.py
│ │ │ │ │ └── truncated_bptt_lm_task.py
│ │ │ │ ├── unsupervised_quality_estimation/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── aggregate_scores.py
│ │ │ │ │ ├── meteor.py
│ │ │ │ │ └── repeat_lines.py
│ │ │ │ ├── wav2vec/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── config/
│ │ │ │ │ │ ├── finetuning/
│ │ │ │ │ │ │ ├── base_100h.yaml
│ │ │ │ │ │ │ ├── base_10h.yaml
│ │ │ │ │ │ │ ├── base_10m.yaml
│ │ │ │ │ │ │ ├── base_1h.yaml
│ │ │ │ │ │ │ ├── base_960h.yaml
│ │ │ │ │ │ │ ├── vox_100h.yaml
│ │ │ │ │ │ │ ├── vox_10h.yaml
│ │ │ │ │ │ │ ├── vox_10m.yaml
│ │ │ │ │ │ │ ├── vox_1h.yaml
│ │ │ │ │ │ │ └── vox_960h.yaml
│ │ │ │ │ │ └── pretraining/
│ │ │ │ │ │ ├── wav2vec2_base_librispeech.yaml
│ │ │ │ │ │ └── wav2vec2_large_librivox.yaml
│ │ │ │ │ ├── libri_labels.py
│ │ │ │ │ ├── vq-wav2vec_featurize.py
│ │ │ │ │ ├── wav2vec_featurize.py
│ │ │ │ │ └── wav2vec_manifest.py
│ │ │ │ ├── wmt19/
│ │ │ │ │ └── README.md
│ │ │ │ ├── wmt20/
│ │ │ │ │ └── README.md
│ │ │ │ └── xlmr/
│ │ │ │ └── README.md
│ │ │ ├── fairseq/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── benchmark/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── dummy_lm.py
│ │ │ │ │ ├── dummy_masked_lm.py
│ │ │ │ │ ├── dummy_model.py
│ │ │ │ │ └── dummy_mt.py
│ │ │ │ ├── binarizer.py
│ │ │ │ ├── checkpoint_utils.py
│ │ │ │ ├── clib/
│ │ │ │ │ ├── cuda/
│ │ │ │ │ │ ├── ngram_repeat_block_cuda.cpp
│ │ │ │ │ │ └── ngram_repeat_block_cuda_kernel.cu
│ │ │ │ │ ├── libbleu/
│ │ │ │ │ │ ├── libbleu.cpp
│ │ │ │ │ │ └── module.cpp
│ │ │ │ │ ├── libnat/
│ │ │ │ │ │ └── edit_dist.cpp
│ │ │ │ │ └── libnat_cuda/
│ │ │ │ │ ├── binding.cpp
│ │ │ │ │ ├── edit_dist.cu
│ │ │ │ │ └── edit_dist.h
│ │ │ │ ├── config/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── config.yaml
│ │ │ │ │ └── model/
│ │ │ │ │ ├── transformer_lm/
│ │ │ │ │ │ ├── transformer_lm_baevski_gbw.yaml
│ │ │ │ │ │ ├── transformer_lm_baevski_wiki103.yaml
│ │ │ │ │ │ ├── transformer_lm_big.yaml
│ │ │ │ │ │ ├── transformer_lm_gbw.yaml
│ │ │ │ │ │ ├── transformer_lm_gpt.yaml
│ │ │ │ │ │ ├── transformer_lm_gpt2_big.yaml
│ │ │ │ │ │ ├── transformer_lm_gpt2_medium.yaml
│ │ │ │ │ │ ├── transformer_lm_gpt2_small.yaml
│ │ │ │ │ │ └── transformer_lm_wiki103.yaml
│ │ │ │ │ ├── wav2vec/
│ │ │ │ │ │ └── vq_wav2vec_gumbel.yaml
│ │ │ │ │ └── wav2vec2/
│ │ │ │ │ ├── wav2vec2_base.yaml
│ │ │ │ │ └── wav2vec2_large.yaml
│ │ │ │ ├── criterions/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── adaptive_loss.py
│ │ │ │ │ ├── composite_loss.py
│ │ │ │ │ ├── cross_entropy.py
│ │ │ │ │ ├── ctc.py
│ │ │ │ │ ├── fairseq_criterion.py
│ │ │ │ │ ├── label_smoothed_cross_entropy.py
│ │ │ │ │ ├── label_smoothed_cross_entropy_with_alignment.py
│ │ │ │ │ ├── legacy_masked_lm.py
│ │ │ │ │ ├── masked_lm.py
│ │ │ │ │ ├── model_criterion.py
│ │ │ │ │ ├── nat_loss.py
│ │ │ │ │ ├── sentence_prediction.py
│ │ │ │ │ ├── sentence_ranking.py
│ │ │ │ │ └── wav2vec_criterion.py
│ │ │ │ ├── data/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── add_target_dataset.py
│ │ │ │ │ ├── append_token_dataset.py
│ │ │ │ │ ├── audio/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── audio_utils.py
│ │ │ │ │ │ ├── feature_transforms/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── global_cmvn.py
│ │ │ │ │ │ │ ├── specaugment.py
│ │ │ │ │ │ │ └── utterance_cmvn.py
│ │ │ │ │ │ ├── raw_audio_dataset.py
│ │ │ │ │ │ └── speech_to_text_dataset.py
│ │ │ │ │ ├── backtranslation_dataset.py
│ │ │ │ │ ├── base_wrapper_dataset.py
│ │ │ │ │ ├── bucket_pad_length_dataset.py
│ │ │ │ │ ├── colorize_dataset.py
│ │ │ │ │ ├── concat_dataset.py
│ │ │ │ │ ├── concat_sentences_dataset.py
│ │ │ │ │ ├── data_utils.py
│ │ │ │ │ ├── data_utils_fast.pyx
│ │ │ │ │ ├── denoising_dataset.py
│ │ │ │ │ ├── dictionary.py
│ │ │ │ │ ├── encoders/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── byte_bpe.py
│ │ │ │ │ │ ├── byte_utils.py
│ │ │ │ │ │ ├── bytes.py
│ │ │ │ │ │ ├── characters.py
│ │ │ │ │ │ ├── fastbpe.py
│ │ │ │ │ │ ├── gpt2_bpe.py
│ │ │ │ │ │ ├── gpt2_bpe_utils.py
│ │ │ │ │ │ ├── hf_bert_bpe.py
│ │ │ │ │ │ ├── hf_byte_bpe.py
│ │ │ │ │ │ ├── moses_tokenizer.py
│ │ │ │ │ │ ├── nltk_tokenizer.py
│ │ │ │ │ │ ├── sentencepiece_bpe.py
│ │ │ │ │ │ ├── space_tokenizer.py
│ │ │ │ │ │ ├── subword_nmt_bpe.py
│ │ │ │ │ │ └── utils.py
│ │ │ │ │ ├── fairseq_dataset.py
│ │ │ │ │ ├── fasta_dataset.py
│ │ │ │ │ ├── id_dataset.py
│ │ │ │ │ ├── indexed_dataset.py
│ │ │ │ │ ├── iterators.py
│ │ │ │ │ ├── language_pair_dataset.py
│ │ │ │ │ ├── legacy/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── block_pair_dataset.py
│ │ │ │ │ │ ├── masked_lm_dataset.py
│ │ │ │ │ │ └── masked_lm_dictionary.py
│ │ │ │ │ ├── list_dataset.py
│ │ │ │ │ ├── lm_context_window_dataset.py
│ │ │ │ │ ├── lru_cache_dataset.py
│ │ │ │ │ ├── mask_tokens_dataset.py
│ │ │ │ │ ├── monolingual_dataset.py
│ │ │ │ │ ├── multi_corpus_dataset.py
│ │ │ │ │ ├── multi_corpus_sampled_dataset.py
│ │ │ │ │ ├── multilingual/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── multilingual_data_manager.py
│ │ │ │ │ │ ├── multilingual_utils.py
│ │ │ │ │ │ ├── sampled_multi_dataset.py
│ │ │ │ │ │ ├── sampled_multi_epoch_dataset.py
│ │ │ │ │ │ └── sampling_method.py
│ │ │ │ │ ├── nested_dictionary_dataset.py
│ │ │ │ │ ├── noising.py
│ │ │ │ │ ├── num_samples_dataset.py
│ │ │ │ │ ├── numel_dataset.py
│ │ │ │ │ ├── offset_tokens_dataset.py
│ │ │ │ │ ├── pad_dataset.py
│ │ │ │ │ ├── plasma_utils.py
│ │ │ │ │ ├── prepend_dataset.py
│ │ │ │ │ ├── prepend_token_dataset.py
│ │ │ │ │ ├── raw_label_dataset.py
│ │ │ │ │ ├── replace_dataset.py
│ │ │ │ │ ├── resampling_dataset.py
│ │ │ │ │ ├── roll_dataset.py
│ │ │ │ │ ├── round_robin_zip_datasets.py
│ │ │ │ │ ├── shorten_dataset.py
│ │ │ │ │ ├── sort_dataset.py
│ │ │ │ │ ├── strip_token_dataset.py
│ │ │ │ │ ├── subsample_dataset.py
│ │ │ │ │ ├── token_block_dataset.py
│ │ │ │ │ ├── token_block_utils_fast.pyx
│ │ │ │ │ ├── transform_eos_dataset.py
│ │ │ │ │ └── transform_eos_lang_pair_dataset.py
│ │ │ │ ├── dataclass/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── configs.py
│ │ │ │ │ ├── constants.py
│ │ │ │ │ ├── initialize.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── distributed_utils.py
│ │ │ │ ├── file_io.py
│ │ │ │ ├── file_utils.py
│ │ │ │ ├── hub_utils.py
│ │ │ │ ├── incremental_decoding_utils.py
│ │ │ │ ├── iterative_refinement_generator.py
│ │ │ │ ├── legacy_distributed_data_parallel.py
│ │ │ │ ├── logging/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── meters.py
│ │ │ │ │ ├── metrics.py
│ │ │ │ │ └── progress_bar.py
│ │ │ │ ├── model_parallel/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── criterions/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── vocab_parallel_cross_entropy.py
│ │ │ │ │ ├── megatron_trainer.py
│ │ │ │ │ ├── models/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── pipeline_parallel_transformer/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── layers.py
│ │ │ │ │ │ │ └── model.py
│ │ │ │ │ │ ├── roberta/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ └── model.py
│ │ │ │ │ │ ├── transformer.py
│ │ │ │ │ │ └── transformer_lm.py
│ │ │ │ │ └── modules/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── multihead_attention.py
│ │ │ │ │ ├── transformer_layer.py
│ │ │ │ │ ├── transformer_sentence_encoder.py
│ │ │ │ │ └── transformer_sentence_encoder_layer.py
│ │ │ │ ├── models/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── bart/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── hub_interface.py
│ │ │ │ │ │ └── model.py
│ │ │ │ │ ├── composite_encoder.py
│ │ │ │ │ ├── distributed_fairseq_model.py
│ │ │ │ │ ├── fairseq_decoder.py
│ │ │ │ │ ├── fairseq_encoder.py
│ │ │ │ │ ├── fairseq_incremental_decoder.py
│ │ │ │ │ ├── fairseq_model.py
│ │ │ │ │ ├── fconv.py
│ │ │ │ │ ├── fconv_lm.py
│ │ │ │ │ ├── fconv_self_att.py
│ │ │ │ │ ├── huggingface/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── hf_gpt2.py
│ │ │ │ │ ├── lightconv.py
│ │ │ │ │ ├── lightconv_lm.py
│ │ │ │ │ ├── lstm.py
│ │ │ │ │ ├── lstm_lm.py
│ │ │ │ │ ├── masked_lm.py
│ │ │ │ │ ├── model_utils.py
│ │ │ │ │ ├── multilingual_transformer.py
│ │ │ │ │ ├── nat/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── cmlm_transformer.py
│ │ │ │ │ │ ├── fairseq_nat_model.py
│ │ │ │ │ │ ├── insertion_transformer.py
│ │ │ │ │ │ ├── iterative_nonautoregressive_transformer.py
│ │ │ │ │ │ ├── levenshtein_transformer.py
│ │ │ │ │ │ ├── levenshtein_utils.py
│ │ │ │ │ │ ├── nat_crf_transformer.py
│ │ │ │ │ │ ├── nonautoregressive_ensembles.py
│ │ │ │ │ │ └── nonautoregressive_transformer.py
│ │ │ │ │ ├── roberta/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── alignment_utils.py
│ │ │ │ │ │ ├── hub_interface.py
│ │ │ │ │ │ ├── model.py
│ │ │ │ │ │ ├── model_camembert.py
│ │ │ │ │ │ ├── model_gottbert.py
│ │ │ │ │ │ └── model_xlmr.py
│ │ │ │ │ ├── speech_to_text/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── berard.py
│ │ │ │ │ │ └── s2t_transformer.py
│ │ │ │ │ ├── transformer.py
│ │ │ │ │ ├── transformer_align.py
│ │ │ │ │ ├── transformer_from_pretrained_xlm.py
│ │ │ │ │ ├── transformer_lm.py
│ │ │ │ │ └── wav2vec/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── wav2vec.py
│ │ │ │ │ ├── wav2vec2.py
│ │ │ │ │ └── wav2vec2_asr.py
│ │ │ │ ├── modules/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── adaptive_input.py
│ │ │ │ │ ├── adaptive_softmax.py
│ │ │ │ │ ├── beamable_mm.py
│ │ │ │ │ ├── character_token_embedder.py
│ │ │ │ │ ├── checkpoint_activations.py
│ │ │ │ │ ├── conv_tbc.py
│ │ │ │ │ ├── cross_entropy.py
│ │ │ │ │ ├── cuda_utils.cu
│ │ │ │ │ ├── downsampled_multihead_attention.py
│ │ │ │ │ ├── dynamic_convolution.py
│ │ │ │ │ ├── dynamic_crf_layer.py
│ │ │ │ │ ├── dynamicconv_layer/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── cuda_function_gen.py
│ │ │ │ │ │ ├── dynamicconv_cuda.cpp
│ │ │ │ │ │ ├── dynamicconv_cuda.cuh
│ │ │ │ │ │ ├── dynamicconv_cuda_kernel.cu
│ │ │ │ │ │ ├── dynamicconv_layer.py
│ │ │ │ │ │ ├── dynamiconv_cpu.cpp
│ │ │ │ │ │ └── setup.py
│ │ │ │ │ ├── fairseq_dropout.py
│ │ │ │ │ ├── fp32_group_norm.py
│ │ │ │ │ ├── gelu.py
│ │ │ │ │ ├── grad_multiply.py
│ │ │ │ │ ├── gumbel_vector_quantizer.py
│ │ │ │ │ ├── kmeans_vector_quantizer.py
│ │ │ │ │ ├── layer_drop.py
│ │ │ │ │ ├── layer_norm.py
│ │ │ │ │ ├── learned_positional_embedding.py
│ │ │ │ │ ├── lightconv_layer/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── cuda_function_gen.py
│ │ │ │ │ │ ├── lightconv_cuda.cpp
│ │ │ │ │ │ ├── lightconv_cuda.cuh
│ │ │ │ │ │ ├── lightconv_cuda_kernel.cu
│ │ │ │ │ │ ├── lightconv_layer.py
│ │ │ │ │ │ └── setup.py
│ │ │ │ │ ├── lightweight_convolution.py
│ │ │ │ │ ├── linearized_convolution.py
│ │ │ │ │ ├── multihead_attention.py
│ │ │ │ │ ├── positional_embedding.py
│ │ │ │ │ ├── quant_noise.py
│ │ │ │ │ ├── quantization/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── pq/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── em.py
│ │ │ │ │ │ │ ├── modules/
│ │ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ │ ├── qconv.py
│ │ │ │ │ │ │ │ ├── qemb.py
│ │ │ │ │ │ │ │ └── qlinear.py
│ │ │ │ │ │ │ ├── pq.py
│ │ │ │ │ │ │ └── utils.py
│ │ │ │ │ │ ├── quantization_options.py
│ │ │ │ │ │ └── scalar/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── modules/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── qact.py
│ │ │ │ │ │ │ ├── qconv.py
│ │ │ │ │ │ │ ├── qemb.py
│ │ │ │ │ │ │ └── qlinear.py
│ │ │ │ │ │ ├── ops.py
│ │ │ │ │ │ └── utils.py
│ │ │ │ │ ├── same_pad.py
│ │ │ │ │ ├── scalar_bias.py
│ │ │ │ │ ├── sinusoidal_positional_embedding.py
│ │ │ │ │ ├── sparse_multihead_attention.py
│ │ │ │ │ ├── sparse_transformer_sentence_encoder.py
│ │ │ │ │ ├── sparse_transformer_sentence_encoder_layer.py
│ │ │ │ │ ├── transformer_layer.py
│ │ │ │ │ ├── transformer_sentence_encoder.py
│ │ │ │ │ ├── transformer_sentence_encoder_layer.py
│ │ │ │ │ ├── transpose_last.py
│ │ │ │ │ ├── unfold.py
│ │ │ │ │ └── vggblock.py
│ │ │ │ ├── nan_detector.py
│ │ │ │ ├── ngram_repeat_block.py
│ │ │ │ ├── optim/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── adadelta.py
│ │ │ │ │ ├── adafactor.py
│ │ │ │ │ ├── adagrad.py
│ │ │ │ │ ├── adam.py
│ │ │ │ │ ├── adamax.py
│ │ │ │ │ ├── bmuf.py
│ │ │ │ │ ├── composite.py
│ │ │ │ │ ├── dynamic_loss_scaler.py
│ │ │ │ │ ├── fairseq_optimizer.py
│ │ │ │ │ ├── fp16_optimizer.py
│ │ │ │ │ ├── fused_adam.py
│ │ │ │ │ ├── fused_lamb.py
│ │ │ │ │ ├── lr_scheduler/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── cosine_lr_scheduler.py
│ │ │ │ │ │ ├── fairseq_lr_scheduler.py
│ │ │ │ │ │ ├── fixed_schedule.py
│ │ │ │ │ │ ├── inverse_square_root_schedule.py
│ │ │ │ │ │ ├── manual_lr_scheduler.py
│ │ │ │ │ │ ├── pass_through.py
│ │ │ │ │ │ ├── polynomial_decay_schedule.py
│ │ │ │ │ │ ├── reduce_lr_on_plateau.py
│ │ │ │ │ │ ├── tri_stage_lr_scheduler.py
│ │ │ │ │ │ └── triangular_lr_scheduler.py
│ │ │ │ │ ├── nag.py
│ │ │ │ │ ├── sgd.py
│ │ │ │ │ └── shard.py
│ │ │ │ ├── options.py
│ │ │ │ ├── pdb.py
│ │ │ │ ├── quantization_utils.py
│ │ │ │ ├── registry.py
│ │ │ │ ├── scoring/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── bleu.py
│ │ │ │ │ ├── chrf.py
│ │ │ │ │ ├── tokenizer.py
│ │ │ │ │ └── wer.py
│ │ │ │ ├── search.py
│ │ │ │ ├── sequence_generator.py
│ │ │ │ ├── sequence_scorer.py
│ │ │ │ ├── tasks/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── audio_pretraining.py
│ │ │ │ │ ├── cross_lingual_lm.py
│ │ │ │ │ ├── denoising.py
│ │ │ │ │ ├── fairseq_task.py
│ │ │ │ │ ├── language_modeling.py
│ │ │ │ │ ├── legacy_masked_lm.py
│ │ │ │ │ ├── masked_lm.py
│ │ │ │ │ ├── multilingual_denoising.py
│ │ │ │ │ ├── multilingual_masked_lm.py
│ │ │ │ │ ├── multilingual_translation.py
│ │ │ │ │ ├── semisupervised_translation.py
│ │ │ │ │ ├── sentence_prediction.py
│ │ │ │ │ ├── sentence_ranking.py
│ │ │ │ │ ├── speech_to_text.py
│ │ │ │ │ ├── translation.py
│ │ │ │ │ ├── translation_from_pretrained_bart.py
│ │ │ │ │ ├── translation_from_pretrained_xlm.py
│ │ │ │ │ ├── translation_lev.py
│ │ │ │ │ └── translation_multi_simple_epoch.py
│ │ │ │ ├── token_generation_constraints.py
│ │ │ │ ├── tokenizer.py
│ │ │ │ ├── trainer.py
│ │ │ │ ├── utils.py
│ │ │ │ └── version.txt
│ │ │ ├── fairseq_cli/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── eval_lm.py
│ │ │ │ ├── generate.py
│ │ │ │ ├── hydra_train.py
│ │ │ │ ├── interactive.py
│ │ │ │ ├── preprocess.py
│ │ │ │ ├── score.py
│ │ │ │ ├── train.py
│ │ │ │ └── validate.py
│ │ │ ├── hubconf.py
│ │ │ ├── pyproject.toml
│ │ │ ├── scripts/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── average_checkpoints.py
│ │ │ │ ├── build_sym_alignment.py
│ │ │ │ ├── compare_namespaces.py
│ │ │ │ ├── compound_split_bleu.sh
│ │ │ │ ├── constraints/
│ │ │ │ │ ├── extract.py
│ │ │ │ │ └── validate.py
│ │ │ │ ├── convert_dictionary.lua
│ │ │ │ ├── convert_model.lua
│ │ │ │ ├── count_docs.py
│ │ │ │ ├── read_binarized.py
│ │ │ │ ├── rm_pt.py
│ │ │ │ ├── sacrebleu.sh
│ │ │ │ ├── shard_docs.py
│ │ │ │ ├── split_train_valid_docs.py
│ │ │ │ ├── spm_decode.py
│ │ │ │ ├── spm_encode.py
│ │ │ │ └── spm_train.py
│ │ │ ├── setup.py
│ │ │ ├── tests/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── distributed/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── test_distributed_utils.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── gpu/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── test_binaries_gpu.py
│ │ │ │ │ └── transformer_quantization_config.yaml
│ │ │ │ ├── speech_recognition/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── asr_test_base.py
│ │ │ │ │ ├── test_collaters.py
│ │ │ │ │ ├── test_cross_entropy.py
│ │ │ │ │ ├── test_data_utils.py
│ │ │ │ │ └── test_vggtransformer.py
│ │ │ │ ├── test_average_checkpoints.py
│ │ │ │ ├── test_backtranslation_dataset.py
│ │ │ │ ├── test_binaries.py
│ │ │ │ ├── test_bmuf.py
│ │ │ │ ├── test_character_token_embedder.py
│ │ │ │ ├── test_checkpoint_utils.py
│ │ │ │ ├── test_concat_dataset.py
│ │ │ │ ├── test_constraints.py
│ │ │ │ ├── test_convtbc.py
│ │ │ │ ├── test_data_utils.py
│ │ │ │ ├── test_dictionary.py
│ │ │ │ ├── test_export.py
│ │ │ │ ├── test_file_io.py
│ │ │ │ ├── test_fp16_optimizer.py
│ │ │ │ ├── test_inference_dropout.py
│ │ │ │ ├── test_iopath.py
│ │ │ │ ├── test_iterators.py
│ │ │ │ ├── test_label_smoothing.py
│ │ │ │ ├── test_lm_context_window.py
│ │ │ │ ├── test_lstm_jitable.py
│ │ │ │ ├── test_memory_efficient_fp16.py
│ │ │ │ ├── test_metrics.py
│ │ │ │ ├── test_multi_corpus_sampled_dataset.py
│ │ │ │ ├── test_multihead_attention.py
│ │ │ │ ├── test_noising.py
│ │ │ │ ├── test_reproducibility.py
│ │ │ │ ├── test_resampling_dataset.py
│ │ │ │ ├── test_sequence_generator.py
│ │ │ │ ├── test_sequence_scorer.py
│ │ │ │ ├── test_sparse_multihead_attention.py
│ │ │ │ ├── test_token_block_dataset.py
│ │ │ │ ├── test_train.py
│ │ │ │ ├── test_utils.py
│ │ │ │ └── utils.py
│ │ │ ├── tmp.txt
│ │ │ └── train.py
│ │ ├── inference.py
│ │ ├── inference_batch.py
│ │ └── interactive.sh
│ └── readme.md
├── deepnet/
│ └── README.md
├── deltalm/
│ ├── README.md
│ ├── deltalm/
│ │ ├── __init__.py
│ │ └── models/
│ │ ├── __init__.py
│ │ └── deltalm.py
│ ├── examples/
│ │ ├── binary_iwslt14.sh
│ │ ├── evaluate_iwslt14.sh
│ │ ├── prepare_iwslt14.sh
│ │ ├── spm_iwslt14.sh
│ │ └── train_iwslt14.sh
│ ├── generate.py
│ ├── interactive.py
│ ├── preprocess.py
│ └── train.py
├── dit/
│ ├── README.md
│ ├── classification/
│ │ ├── README.md
│ │ ├── dataset_folder.py
│ │ ├── datasets.py
│ │ ├── deepspeed_configs/
│ │ │ └── config.json
│ │ ├── engine_for_finetuning.py
│ │ ├── modeling_finetune.py
│ │ ├── optim_factory.py
│ │ ├── requirements.txt
│ │ ├── run_class_finetuning.py
│ │ ├── transforms.py
│ │ └── utils.py
│ ├── object_detection/
│ │ ├── README.md
│ │ ├── adaptive_binarize.py
│ │ ├── convert_to_coco_format.py
│ │ ├── ditod/
│ │ │ ├── __init__.py
│ │ │ ├── backbone.py
│ │ │ ├── beit.py
│ │ │ ├── config.py
│ │ │ ├── dataset_mapper.py
│ │ │ ├── deit.py
│ │ │ ├── icdar_evaluation.py
│ │ │ ├── mycheckpointer.py
│ │ │ ├── mytrainer.py
│ │ │ └── table_evaluation/
│ │ │ ├── __init__.py
│ │ │ ├── data_structure.py
│ │ │ └── evaluate.py
│ │ ├── icdar19_configs/
│ │ │ ├── Base-RCNN-FPN.yaml
│ │ │ ├── cascade/
│ │ │ │ ├── cascade_dit_base.yaml
│ │ │ │ └── cascade_dit_large.yaml
│ │ │ └── maskrcnn/
│ │ │ ├── maskrcnn_dit_base.yaml
│ │ │ └── maskrcnn_dit_large.yaml
│ │ ├── inference.py
│ │ ├── publaynet_configs/
│ │ │ ├── Base-RCNN-FPN.yaml
│ │ │ ├── cascade/
│ │ │ │ ├── cascade_dit_base.yaml
│ │ │ │ └── cascade_dit_large.yaml
│ │ │ └── maskrcnn/
│ │ │ ├── maskrcnn_dit_base.yaml
│ │ │ └── maskrcnn_dit_large.yaml
│ │ └── train_net.py
│ ├── requirements.txt
│ └── text_detection/
│ ├── README.md
│ ├── configs/
│ │ ├── Base-RCNN-FPN.yaml
│ │ ├── mask_rcnn_dit_base.yaml
│ │ └── mask_rcnn_dit_large.yaml
│ ├── ditod/
│ │ ├── __init__.py
│ │ ├── backbone.py
│ │ ├── beit.py
│ │ ├── concern/
│ │ │ ├── __init__.py
│ │ │ ├── average_meter.py
│ │ │ ├── box2seg.py
│ │ │ ├── config.py
│ │ │ ├── convert.py
│ │ │ ├── icdar2015_eval/
│ │ │ │ ├── __init__.py
│ │ │ │ └── detection/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── deteval.py
│ │ │ │ ├── icdar2013.py
│ │ │ │ ├── iou.py
│ │ │ │ └── mtwi2018.py
│ │ │ ├── log.py
│ │ │ ├── signal_monitor.py
│ │ │ ├── visualizer.py
│ │ │ └── webcv2/
│ │ │ ├── __init__.py
│ │ │ ├── manager.py
│ │ │ ├── server.py
│ │ │ └── templates/
│ │ │ └── index.html
│ │ ├── config.py
│ │ ├── dataset_mapper.py
│ │ ├── deit.py
│ │ ├── funsd_evaluation.py
│ │ ├── mycheckpointer.py
│ │ └── mytrainer.py
│ └── train_net.py
├── e5/
│ ├── README.md
│ ├── model_config.py
│ ├── mteb_beir_eval.py
│ ├── mteb_except_retrieval_eval.py
│ ├── requirements.txt
│ ├── scripts/
│ │ ├── eval_mteb_beir.sh
│ │ └── eval_mteb_except_retrieval.sh
│ └── utils.py
├── edgelm/
│ ├── CODE_OF_CONDUCT.md
│ ├── CONTRIBUTING.md
│ ├── LICENSE
│ ├── README.md
│ ├── docs/
│ │ ├── Makefile
│ │ ├── _static/
│ │ │ └── theme_overrides.css
│ │ ├── command_line_tools.rst
│ │ ├── conf.py
│ │ ├── criterions.rst
│ │ ├── data.rst
│ │ ├── docutils.conf
│ │ ├── getting_started.rst
│ │ ├── hydra_integration.md
│ │ ├── index.rst
│ │ ├── lr_scheduler.rst
│ │ ├── make.bat
│ │ ├── models.rst
│ │ ├── modules.rst
│ │ ├── optim.rst
│ │ ├── overview.rst
│ │ ├── requirements.txt
│ │ ├── tasks.rst
│ │ ├── tutorial_classifying_names.rst
│ │ └── tutorial_simple_lstm.rst
│ ├── examples/
│ │ ├── .gitignore
│ │ ├── MMPT/
│ │ │ ├── .gitignore
│ │ │ ├── CONFIG.md
│ │ │ ├── DATASET.md
│ │ │ ├── README.md
│ │ │ ├── endtask.md
│ │ │ ├── locallaunch.py
│ │ │ ├── mmpt/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── datasets/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── fairseqmmdataset.py
│ │ │ │ │ └── mmdataset.py
│ │ │ │ ├── evaluators/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── evaluator.py
│ │ │ │ │ ├── metric.py
│ │ │ │ │ └── predictor.py
│ │ │ │ ├── losses/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── fairseqmmloss.py
│ │ │ │ │ ├── loss.py
│ │ │ │ │ └── nce.py
│ │ │ │ ├── models/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── fairseqmmmodel.py
│ │ │ │ │ ├── mmfusion.py
│ │ │ │ │ ├── mmfusionnlg.py
│ │ │ │ │ └── transformermodel.py
│ │ │ │ ├── modules/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── mm.py
│ │ │ │ │ ├── retri.py
│ │ │ │ │ └── vectorpool.py
│ │ │ │ ├── processors/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── dedupprocessor.py
│ │ │ │ │ ├── dsprocessor.py
│ │ │ │ │ ├── how2processor.py
│ │ │ │ │ ├── how2retriprocessor.py
│ │ │ │ │ ├── models/
│ │ │ │ │ │ └── s3dg.py
│ │ │ │ │ └── processor.py
│ │ │ │ ├── tasks/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── fairseqmmtask.py
│ │ │ │ │ ├── milncetask.py
│ │ │ │ │ ├── retritask.py
│ │ │ │ │ ├── task.py
│ │ │ │ │ └── vlmtask.py
│ │ │ │ └── utils/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── load_config.py
│ │ │ │ └── shardedtensor.py
│ │ │ ├── mmpt_cli/
│ │ │ │ ├── localjob.py
│ │ │ │ └── predict.py
│ │ │ ├── pretraining.md
│ │ │ ├── projects/
│ │ │ │ ├── mfmmlm.yaml
│ │ │ │ ├── mtm/
│ │ │ │ │ ├── mmfusionmtm.yaml
│ │ │ │ │ ├── vlm/
│ │ │ │ │ │ ├── coin.yaml
│ │ │ │ │ │ ├── crosstask.yaml
│ │ │ │ │ │ ├── how2.yaml
│ │ │ │ │ │ ├── test_coin.yaml
│ │ │ │ │ │ ├── test_crosstask.yaml
│ │ │ │ │ │ ├── test_crosstask_zs.yaml
│ │ │ │ │ │ ├── test_vtt.yaml
│ │ │ │ │ │ ├── test_vttqa.yaml
│ │ │ │ │ │ ├── test_youcook.yaml
│ │ │ │ │ │ ├── test_youcookcap.yaml
│ │ │ │ │ │ ├── vtt.yaml
│ │ │ │ │ │ ├── vttqa.yaml
│ │ │ │ │ │ ├── youcook.yaml
│ │ │ │ │ │ └── youcookcap.yaml
│ │ │ │ │ └── vlm.yaml
│ │ │ │ ├── retri/
│ │ │ │ │ ├── videoclip/
│ │ │ │ │ │ ├── coin_videoclip.yaml
│ │ │ │ │ │ ├── crosstask_videoclip.yaml
│ │ │ │ │ │ ├── how2.yaml
│ │ │ │ │ │ ├── test_coin_videoclip.yaml
│ │ │ │ │ │ ├── test_coin_zs.yaml
│ │ │ │ │ │ ├── test_crosstask_videoclip.yaml
│ │ │ │ │ │ ├── test_crosstask_zs_videoclip.yaml
│ │ │ │ │ │ ├── test_didemo_zs.yaml
│ │ │ │ │ │ ├── test_vtt_videoclip.yaml
│ │ │ │ │ │ ├── test_vtt_zs.yaml
│ │ │ │ │ │ ├── test_vttqa_videoclip.yaml
│ │ │ │ │ │ ├── test_vttqa_zs.yaml
│ │ │ │ │ │ ├── test_youcook_videoclip.yaml
│ │ │ │ │ │ ├── test_youcook_zs.yaml
│ │ │ │ │ │ ├── vtt_videoclip.yaml
│ │ │ │ │ │ ├── vttqa_videoclip.yaml
│ │ │ │ │ │ └── youcook_videoclip.yaml
│ │ │ │ │ ├── videoclip.yaml
│ │ │ │ │ └── videoretri.yaml
│ │ │ │ └── task/
│ │ │ │ ├── coin.yaml
│ │ │ │ ├── coin_videoclip.yaml
│ │ │ │ ├── crosstask.yaml
│ │ │ │ ├── crosstask_videoclip.yaml
│ │ │ │ ├── default.yaml
│ │ │ │ ├── ft.yaml
│ │ │ │ ├── how2.yaml
│ │ │ │ ├── test.yaml
│ │ │ │ ├── test_coin.yaml
│ │ │ │ ├── test_coin_videoclip.yaml
│ │ │ │ ├── test_coin_zs.yaml
│ │ │ │ ├── test_crosstask.yaml
│ │ │ │ ├── test_crosstask_videoclip.yaml
│ │ │ │ ├── test_crosstask_zs.yaml
│ │ │ │ ├── test_crosstask_zs_videoclip.yaml
│ │ │ │ ├── test_didemo_zs.yaml
│ │ │ │ ├── test_vtt.yaml
│ │ │ │ ├── test_vtt_videoclip.yaml
│ │ │ │ ├── test_vtt_zs.yaml
│ │ │ │ ├── test_vttqa.yaml
│ │ │ │ ├── test_vttqa_videoclip.yaml
│ │ │ │ ├── test_vttqa_zs.yaml
│ │ │ │ ├── test_youcook.yaml
│ │ │ │ ├── test_youcook_videoclip.yaml
│ │ │ │ ├── test_youcook_zs.yaml
│ │ │ │ ├── test_youcookcap.yaml
│ │ │ │ ├── vtt.yaml
│ │ │ │ ├── vtt_videoclip.yaml
│ │ │ │ ├── vttqa.yaml
│ │ │ │ ├── vttqa_videoclip.yaml
│ │ │ │ ├── youcook.yaml
│ │ │ │ ├── youcook_videoclip.yaml
│ │ │ │ └── youcookcap.yaml
│ │ │ ├── scripts/
│ │ │ │ ├── text_token_extractor/
│ │ │ │ │ ├── configs/
│ │ │ │ │ │ └── bert-base-uncased.yaml
│ │ │ │ │ └── pretokenization.py
│ │ │ │ └── video_feature_extractor/
│ │ │ │ ├── extract.py
│ │ │ │ ├── how2/
│ │ │ │ │ └── s3d.sh
│ │ │ │ ├── model.py
│ │ │ │ ├── pathbuilder.py
│ │ │ │ ├── preprocessing.py
│ │ │ │ ├── random_sequence_shuffler.py
│ │ │ │ ├── shard_feature.py
│ │ │ │ └── videoreader.py
│ │ │ └── setup.py
│ │ ├── __init__.py
│ │ ├── adaptive_span/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── adagrad_with_grad_clip.py
│ │ │ ├── adaptive_span_attention.py
│ │ │ ├── adaptive_span_loss.py
│ │ │ ├── adaptive_span_model.py
│ │ │ ├── adaptive_span_model_wrapper.py
│ │ │ └── truncated_bptt_lm_task.py
│ │ ├── backtranslation/
│ │ │ ├── README.md
│ │ │ ├── deduplicate_lines.py
│ │ │ ├── extract_bt_data.py
│ │ │ ├── prepare-de-monolingual.sh
│ │ │ ├── prepare-wmt18en2de.sh
│ │ │ ├── sacrebleu.sh
│ │ │ └── tokenized_bleu.sh
│ │ ├── bart/
│ │ │ ├── README.glue.md
│ │ │ ├── README.md
│ │ │ ├── README.summarization.md
│ │ │ └── summarize.py
│ │ ├── byte_level_bpe/
│ │ │ ├── README.md
│ │ │ ├── get_bitext.py
│ │ │ ├── get_data.sh
│ │ │ └── gru_transformer.py
│ │ ├── camembert/
│ │ │ └── README.md
│ │ ├── constrained_decoding/
│ │ │ ├── README.md
│ │ │ ├── normalize.py
│ │ │ └── tok.py
│ │ ├── conv_seq2seq/
│ │ │ └── README.md
│ │ ├── criss/
│ │ │ ├── README.md
│ │ │ ├── download_and_preprocess_flores_test.sh
│ │ │ ├── download_and_preprocess_tatoeba.sh
│ │ │ ├── mining/
│ │ │ │ ├── mine.py
│ │ │ │ └── mine_example.sh
│ │ │ ├── save_encoder.py
│ │ │ ├── sentence_retrieval/
│ │ │ │ ├── encoder_analysis.py
│ │ │ │ └── sentence_retrieval_tatoeba.sh
│ │ │ └── unsupervised_mt/
│ │ │ └── eval.sh
│ │ ├── cross_lingual_language_model/
│ │ │ └── README.md
│ │ ├── discriminative_reranking_nmt/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── config/
│ │ │ │ └── deen.yaml
│ │ │ ├── criterions/
│ │ │ │ ├── __init__.py
│ │ │ │ └── discriminative_reranking_criterion.py
│ │ │ ├── drnmt_rerank.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ └── discriminative_reranking_model.py
│ │ │ ├── scripts/
│ │ │ │ └── prep_data.py
│ │ │ └── tasks/
│ │ │ ├── __init__.py
│ │ │ └── discriminative_reranking_task.py
│ │ ├── fast_noisy_channel/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── noisy_channel_beam_search.py
│ │ │ ├── noisy_channel_sequence_generator.py
│ │ │ └── noisy_channel_translation.py
│ │ ├── flores101/
│ │ │ └── README.md
│ │ ├── fully_sharded_data_parallel/
│ │ │ └── README.md
│ │ ├── gottbert/
│ │ │ └── README.md
│ │ ├── hubert/
│ │ │ ├── README.md
│ │ │ ├── config/
│ │ │ │ ├── decode/
│ │ │ │ │ ├── ax_sweep/
│ │ │ │ │ │ ├── ngram.yaml
│ │ │ │ │ │ └── transformer.yaml
│ │ │ │ │ ├── infer_fsqlm.yaml
│ │ │ │ │ ├── infer_kenlm.yaml
│ │ │ │ │ ├── infer_viterbi.yaml
│ │ │ │ │ └── run/
│ │ │ │ │ ├── submitit_slurm.yaml
│ │ │ │ │ └── submitit_slurm_8gpu.yaml
│ │ │ │ ├── finetune/
│ │ │ │ │ ├── base_10h.yaml
│ │ │ │ │ ├── ckpt/
│ │ │ │ │ │ └── it1.yaml
│ │ │ │ │ ├── lm/
│ │ │ │ │ │ └── ls_4gram.yaml
│ │ │ │ │ └── run/
│ │ │ │ │ └── submitit_reg.yaml
│ │ │ │ └── pretrain/
│ │ │ │ ├── data/
│ │ │ │ │ ├── iter1.yaml
│ │ │ │ │ └── iter2.yaml
│ │ │ │ ├── hubert_base_librispeech.yaml
│ │ │ │ ├── hubert_large_librivox.yaml
│ │ │ │ ├── hubert_xlarge_librivox.yaml
│ │ │ │ └── run/
│ │ │ │ └── submitit_reg.yaml
│ │ │ ├── measure_teacher_quality.py
│ │ │ ├── simple_kmeans/
│ │ │ │ ├── README.md
│ │ │ │ ├── dump_hubert_feature.py
│ │ │ │ ├── dump_hubert_feature_s2t.py
│ │ │ │ ├── dump_km_label.py
│ │ │ │ ├── dump_mfcc_feature.py
│ │ │ │ ├── dump_w2v2_feature.py
│ │ │ │ ├── feature_utils.py
│ │ │ │ └── learn_kmeans.py
│ │ │ └── update_ckpt.py
│ │ ├── joint_alignment_translation/
│ │ │ ├── README.md
│ │ │ └── prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh
│ │ ├── language_model/
│ │ │ ├── README.adaptive_inputs.md
│ │ │ ├── README.conv.md
│ │ │ ├── README.md
│ │ │ └── prepare-wikitext-103.sh
│ │ ├── laser/
│ │ │ ├── README.md
│ │ │ └── laser_src/
│ │ │ ├── __init__.py
│ │ │ ├── laser_lstm.py
│ │ │ ├── laser_task.py
│ │ │ ├── laser_transformer.py
│ │ │ └── multitask_data_utils.py
│ │ ├── latent_depth/
│ │ │ ├── README.md
│ │ │ └── latent_depth_src/
│ │ │ ├── __init__.py
│ │ │ ├── loss/
│ │ │ │ ├── __init__.py
│ │ │ │ └── latent_depth.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── latent_multilingual_transformer.py
│ │ │ │ └── latent_transformer.py
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ └── latent_layers.py
│ │ │ └── multilingual_translation_latent_depth.py
│ │ ├── layerdrop/
│ │ │ └── README.md
│ │ ├── linformer/
│ │ │ ├── README.md
│ │ │ └── linformer_src/
│ │ │ ├── __init__.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ └── linformer_roberta.py
│ │ │ └── modules/
│ │ │ ├── __init__.py
│ │ │ ├── linformer_sentence_encoder.py
│ │ │ ├── linformer_sentence_encoder_layer.py
│ │ │ └── multihead_linear_attention.py
│ │ ├── m2m_100/
│ │ │ ├── README.md
│ │ │ ├── install_dependecies.sh
│ │ │ ├── process_data/
│ │ │ │ ├── clean_histogram.py
│ │ │ │ ├── dedup_data.py
│ │ │ │ └── remove_too_much_punc.py
│ │ │ ├── tok.sh
│ │ │ └── tokenizers/
│ │ │ ├── README.md
│ │ │ ├── seg_ja.sh
│ │ │ ├── seg_ko.sh
│ │ │ ├── thirdparty/
│ │ │ │ └── .gitignore
│ │ │ ├── tokenize_indic.py
│ │ │ ├── tokenize_thai.py
│ │ │ ├── tokenize_zh.py
│ │ │ └── tokenizer_ar.sh
│ │ ├── mbart/
│ │ │ └── README.md
│ │ ├── megatron_11b/
│ │ │ ├── README.md
│ │ │ └── detok.py
│ │ ├── multilingual/
│ │ │ ├── ML50_langs.txt
│ │ │ ├── README.md
│ │ │ ├── data_scripts/
│ │ │ │ ├── README.md
│ │ │ │ ├── binarize.py
│ │ │ │ ├── check_iswlt_test_data.py
│ │ │ │ ├── check_self_overlaps.py
│ │ │ │ ├── check_valid_test_overlaps.py
│ │ │ │ ├── dedup_all.py
│ │ │ │ ├── download_ML50_v1.sh
│ │ │ │ ├── download_af_xh.sh
│ │ │ │ ├── download_flores_data.sh
│ │ │ │ ├── download_iitb.sh
│ │ │ │ ├── download_iwslt_and_extract.sh
│ │ │ │ ├── download_lotus.sh
│ │ │ │ ├── download_ted_and_extract.py
│ │ │ │ ├── download_wat19_my.sh
│ │ │ │ ├── download_wmt19_and_before.py
│ │ │ │ ├── download_wmt20.sh
│ │ │ │ ├── preprocess_ML50_v1.sh
│ │ │ │ ├── remove_valid_test_in_train.py
│ │ │ │ ├── requirement.txt
│ │ │ │ └── utils/
│ │ │ │ ├── dedup.py
│ │ │ │ ├── fasttext_multi_filter.py
│ │ │ │ └── strip_sgm.sh
│ │ │ ├── finetune_multilingual_model.sh
│ │ │ ├── multilingual_fairseq_gen.sh
│ │ │ └── train_multilingual_model.sh
│ │ ├── noisychannel/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── rerank.py
│ │ │ ├── rerank_generate.py
│ │ │ ├── rerank_options.py
│ │ │ ├── rerank_score_bw.py
│ │ │ ├── rerank_score_lm.py
│ │ │ ├── rerank_tune.py
│ │ │ └── rerank_utils.py
│ │ ├── nonautoregressive_translation/
│ │ │ ├── README.md
│ │ │ └── scripts.md
│ │ ├── normformer/
│ │ │ ├── README.md
│ │ │ └── train_lm.sh
│ │ ├── operators/
│ │ │ ├── alignment_train_cpu.cpp
│ │ │ ├── alignment_train_cuda.cpp
│ │ │ ├── alignment_train_cuda.h
│ │ │ ├── alignment_train_kernel.cu
│ │ │ └── utils.h
│ │ ├── paraphraser/
│ │ │ ├── README.md
│ │ │ └── paraphrase.py
│ │ ├── pay_less_attention_paper/
│ │ │ └── README.md
│ │ ├── pointer_generator/
│ │ │ ├── README.md
│ │ │ ├── README.xsum.md
│ │ │ ├── pointer_generator_src/
│ │ │ │ ├── __init__.py
│ │ │ │ └── transformer_pg.py
│ │ │ ├── postprocess.py
│ │ │ └── preprocess.py
│ │ ├── quant_noise/
│ │ │ ├── README.md
│ │ │ └── transformer_quantization_config.yaml
│ │ ├── roberta/
│ │ │ ├── README.custom_classification.md
│ │ │ ├── README.glue.md
│ │ │ ├── README.md
│ │ │ ├── README.pretraining.md
│ │ │ ├── README.race.md
│ │ │ ├── commonsense_qa/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── commonsense_qa_task.py
│ │ │ │ └── download_cqa_data.sh
│ │ │ ├── config/
│ │ │ │ ├── finetuning/
│ │ │ │ │ ├── cola.yaml
│ │ │ │ │ ├── mnli.yaml
│ │ │ │ │ ├── mrpc.yaml
│ │ │ │ │ ├── qnli.yaml
│ │ │ │ │ ├── qqp.yaml
│ │ │ │ │ ├── rte.yaml
│ │ │ │ │ ├── sst_2.yaml
│ │ │ │ │ └── sts_b.yaml
│ │ │ │ └── pretraining/
│ │ │ │ └── base.yaml
│ │ │ ├── multiprocessing_bpe_encoder.py
│ │ │ ├── preprocess_GLUE_tasks.sh
│ │ │ ├── preprocess_RACE.py
│ │ │ ├── preprocess_RACE.sh
│ │ │ └── wsc/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── wsc_criterion.py
│ │ │ ├── wsc_task.py
│ │ │ └── wsc_utils.py
│ │ ├── rxf/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── rxf_src/
│ │ │ ├── __init__.py
│ │ │ ├── label_smoothed_cross_entropy_r3f.py
│ │ │ └── sentence_prediction_r3f.py
│ │ ├── scaling_nmt/
│ │ │ └── README.md
│ │ ├── shuffled_word_order/
│ │ │ ├── README.finetuning.md
│ │ │ └── README.md
│ │ ├── simultaneous_translation/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── docs/
│ │ │ │ ├── ende-mma.md
│ │ │ │ └── enja-waitk.md
│ │ │ ├── eval/
│ │ │ │ └── agents/
│ │ │ │ └── simul_t2t_enja.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── convtransformer_simul_trans.py
│ │ │ │ └── transformer_monotonic_attention.py
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── fixed_pre_decision.py
│ │ │ │ ├── monotonic_multihead_attention.py
│ │ │ │ └── monotonic_transformer_layer.py
│ │ │ ├── tests/
│ │ │ │ ├── test_alignment_train.py
│ │ │ │ └── test_text_models.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ ├── functions.py
│ │ │ ├── monotonic_attention.py
│ │ │ └── p_choose_strategy.py
│ │ ├── speech_recognition/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── criterions/
│ │ │ │ ├── ASG_loss.py
│ │ │ │ ├── __init__.py
│ │ │ │ └── cross_entropy_acc.py
│ │ │ ├── data/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── asr_dataset.py
│ │ │ │ ├── collaters.py
│ │ │ │ ├── data_utils.py
│ │ │ │ └── replabels.py
│ │ │ ├── datasets/
│ │ │ │ ├── asr_prep_json.py
│ │ │ │ └── prepare-librispeech.sh
│ │ │ ├── infer.py
│ │ │ ├── kaldi/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── add-self-loop-simple.cc
│ │ │ │ ├── config/
│ │ │ │ │ └── kaldi_initializer.yaml
│ │ │ │ ├── kaldi_decoder.py
│ │ │ │ └── kaldi_initializer.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── vggtransformer.py
│ │ │ │ └── w2l_conv_glu_enc.py
│ │ │ ├── new/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conf/
│ │ │ │ │ ├── hydra/
│ │ │ │ │ │ └── sweeper/
│ │ │ │ │ │ └── ax.yaml
│ │ │ │ │ └── infer.yaml
│ │ │ │ ├── decoders/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base_decoder.py
│ │ │ │ │ ├── decoder.py
│ │ │ │ │ ├── decoder_config.py
│ │ │ │ │ ├── flashlight_decoder.py
│ │ │ │ │ └── viterbi_decoder.py
│ │ │ │ └── infer.py
│ │ │ ├── tasks/
│ │ │ │ ├── __init__.py
│ │ │ │ └── speech_recognition.py
│ │ │ ├── utils/
│ │ │ │ └── wer_utils.py
│ │ │ └── w2l_decoder.py
│ │ ├── speech_synthesis/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── data_utils.py
│ │ │ ├── docs/
│ │ │ │ ├── common_voice_example.md
│ │ │ │ ├── ljspeech_example.md
│ │ │ │ └── vctk_example.md
│ │ │ ├── evaluation/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── eval_asr.py
│ │ │ │ ├── eval_f0.py
│ │ │ │ ├── eval_sp.py
│ │ │ │ └── get_eval_manifest.py
│ │ │ ├── generate_waveform.py
│ │ │ ├── preprocessing/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── denoise_and_vad_audio.py
│ │ │ │ ├── denoiser/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── demucs.py
│ │ │ │ │ ├── pretrained.py
│ │ │ │ │ ├── resample.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── get_common_voice_audio_manifest.py
│ │ │ │ ├── get_feature_manifest.py
│ │ │ │ ├── get_ljspeech_audio_manifest.py
│ │ │ │ ├── get_speaker_embedding.py
│ │ │ │ ├── get_vctk_audio_manifest.py
│ │ │ │ ├── speaker_embedder/
│ │ │ │ │ └── __init__.py
│ │ │ │ └── vad/
│ │ │ │ └── __init__.py
│ │ │ └── utils.py
│ │ ├── speech_text_joint_to_text/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── configs/
│ │ │ │ └── mustc_noise.list
│ │ │ ├── criterions/
│ │ │ │ ├── __init__.py
│ │ │ │ └── text_guide_cross_entropy_acc.py
│ │ │ ├── docs/
│ │ │ │ ├── ende-mustc.md
│ │ │ │ └── iwslt2021.md
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── s2t_dualinputtransformer.py
│ │ │ │ └── s2t_dualinputxmtransformer.py
│ │ │ ├── scripts/
│ │ │ │ └── g2p_encode.py
│ │ │ └── tasks/
│ │ │ ├── __init__.py
│ │ │ └── speech_text_joint.py
│ │ ├── speech_to_text/
│ │ │ ├── README.md
│ │ │ ├── data_utils.py
│ │ │ ├── docs/
│ │ │ │ ├── covost_example.md
│ │ │ │ ├── librispeech_example.md
│ │ │ │ ├── mtedx_example.md
│ │ │ │ ├── mustc_example.md
│ │ │ │ └── simulst_mustc_example.md
│ │ │ ├── prep_covost_data.py
│ │ │ ├── prep_librispeech_data.py
│ │ │ ├── prep_mtedx_data.py
│ │ │ ├── prep_mustc_data.py
│ │ │ ├── seg_mustc_data.py
│ │ │ └── simultaneous_translation/
│ │ │ └── agents/
│ │ │ └── fairseq_simul_st_agent.py
│ │ ├── stories/
│ │ │ └── README.md
│ │ ├── textless_nlp/
│ │ │ └── gslm/
│ │ │ ├── README.md
│ │ │ ├── metrics/
│ │ │ │ ├── README.md
│ │ │ │ ├── abx_metrics/
│ │ │ │ │ ├── README.md
│ │ │ │ │ └── dump_abx_feats.py
│ │ │ │ └── asr_metrics/
│ │ │ │ ├── README.md
│ │ │ │ ├── continuation_eval.py
│ │ │ │ ├── misc/
│ │ │ │ │ ├── bleu_utils.py
│ │ │ │ │ ├── cut_as.py
│ │ │ │ │ └── dict.ltr.txt
│ │ │ │ ├── ppx.py
│ │ │ │ └── self_auto_bleu.py
│ │ │ ├── speech2unit/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── clustering/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cluster_kmeans.py
│ │ │ │ │ ├── dump_feats.py
│ │ │ │ │ ├── quantize_with_kmeans.py
│ │ │ │ │ └── utils.py
│ │ │ │ └── pretrained/
│ │ │ │ ├── cpc_feature_reader.py
│ │ │ │ ├── hubert_feature_reader.py
│ │ │ │ ├── logmel_feature_reader.py
│ │ │ │ ├── utils.py
│ │ │ │ └── w2v2_feature_reader.py
│ │ │ ├── tools/
│ │ │ │ ├── README.md
│ │ │ │ └── resynthesize_speech.py
│ │ │ ├── ulm/
│ │ │ │ ├── README.md
│ │ │ │ └── sample.py
│ │ │ └── unit2speech/
│ │ │ ├── README.md
│ │ │ ├── convert_to_16k.py
│ │ │ ├── glow.py
│ │ │ ├── multiproc.py
│ │ │ ├── synthesize_audio_from_units.py
│ │ │ ├── tacotron2/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── audio_processing.py
│ │ │ │ ├── cleaners.py
│ │ │ │ ├── cmudict.py
│ │ │ │ ├── layers.py
│ │ │ │ ├── model.py
│ │ │ │ ├── numbers.py
│ │ │ │ ├── stft.py
│ │ │ │ ├── symbols.py
│ │ │ │ ├── text.py
│ │ │ │ ├── utils.py
│ │ │ │ └── waveglow_denoiser.py
│ │ │ ├── tts_data.py
│ │ │ └── utils.py
│ │ ├── translation/
│ │ │ ├── README.md
│ │ │ ├── prepare-iwslt14.sh
│ │ │ ├── prepare-iwslt17-multilingual.sh
│ │ │ ├── prepare-wmt14en2de.sh
│ │ │ └── prepare-wmt14en2fr.sh
│ │ ├── translation_moe/
│ │ │ ├── README.md
│ │ │ ├── score.py
│ │ │ └── translation_moe_src/
│ │ │ ├── __init__.py
│ │ │ ├── logsumexp_moe.py
│ │ │ ├── mean_pool_gating_network.py
│ │ │ └── translation_moe.py
│ │ ├── truncated_bptt/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── transformer_xl_model.py
│ │ │ └── truncated_bptt_lm_task.py
│ │ ├── unsupervised_quality_estimation/
│ │ │ ├── README.md
│ │ │ ├── aggregate_scores.py
│ │ │ ├── meteor.py
│ │ │ └── repeat_lines.py
│ │ ├── wav2vec/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── config/
│ │ │ │ ├── finetuning/
│ │ │ │ │ ├── base_100h.yaml
│ │ │ │ │ ├── base_10h.yaml
│ │ │ │ │ ├── base_10m.yaml
│ │ │ │ │ ├── base_1h.yaml
│ │ │ │ │ ├── base_960h.yaml
│ │ │ │ │ ├── vox_100h.yaml
│ │ │ │ │ ├── vox_10h.yaml
│ │ │ │ │ ├── vox_10m.yaml
│ │ │ │ │ ├── vox_1h.yaml
│ │ │ │ │ └── vox_960h.yaml
│ │ │ │ └── pretraining/
│ │ │ │ ├── wav2vec2_base_librispeech.yaml
│ │ │ │ ├── wav2vec2_large_librivox.yaml
│ │ │ │ ├── wav2vec2_large_librivox_tpu-pod.yaml
│ │ │ │ └── wav2vec2_large_librivox_tpu.yaml
│ │ │ ├── libri_labels.py
│ │ │ ├── scripts/
│ │ │ │ └── binarize_manifest.sh
│ │ │ ├── unsupervised/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── config/
│ │ │ │ │ ├── finetuning/
│ │ │ │ │ │ └── w2v_finetune.yaml
│ │ │ │ │ ├── gan/
│ │ │ │ │ │ └── w2vu.yaml
│ │ │ │ │ ├── generate/
│ │ │ │ │ │ └── viterbi.yaml
│ │ │ │ │ ├── timit_matched/
│ │ │ │ │ │ ├── test.uid
│ │ │ │ │ │ ├── train.uid
│ │ │ │ │ │ ├── train_text.uid
│ │ │ │ │ │ └── valid.uid
│ │ │ │ │ └── timit_unmatched/
│ │ │ │ │ ├── test.uid
│ │ │ │ │ ├── train.uid
│ │ │ │ │ ├── train_text.uid
│ │ │ │ │ └── valid.uid
│ │ │ │ ├── data/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── extracted_features_dataset.py
│ │ │ │ │ └── random_input_dataset.py
│ │ │ │ ├── kaldi_self_train/
│ │ │ │ │ ├── README.md
│ │ │ │ │ └── st/
│ │ │ │ │ ├── cmd.sh
│ │ │ │ │ ├── decode_phone.sh
│ │ │ │ │ ├── decode_word_step1.sh
│ │ │ │ │ ├── decode_word_step2.sh
│ │ │ │ │ ├── local/
│ │ │ │ │ │ ├── copy_aligned_text.py
│ │ │ │ │ │ ├── decode.sh
│ │ │ │ │ │ ├── prepare_data_from_w2v.py
│ │ │ │ │ │ ├── prepare_lang.sh
│ │ │ │ │ │ ├── prepare_lang_word.sh
│ │ │ │ │ │ ├── prepare_lm.sh
│ │ │ │ │ │ ├── score.sh
│ │ │ │ │ │ ├── show_wer.sh
│ │ │ │ │ │ ├── train_subset_lgbeam.sh
│ │ │ │ │ │ ├── unsup_select.py
│ │ │ │ │ │ ├── unsup_select_decode.sh
│ │ │ │ │ │ └── unsup_select_decode_word.sh
│ │ │ │ │ ├── path.sh
│ │ │ │ │ ├── steps
│ │ │ │ │ ├── steps_gan/
│ │ │ │ │ │ ├── train_deltas.sh
│ │ │ │ │ │ ├── train_lda_mllt.sh
│ │ │ │ │ │ └── train_sat.sh
│ │ │ │ │ ├── train.sh
│ │ │ │ │ └── utils
│ │ │ │ ├── models/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── wav2vec_u.py
│ │ │ │ ├── scripts/
│ │ │ │ │ ├── apply_pca.py
│ │ │ │ │ ├── copy_labels.py
│ │ │ │ │ ├── filter_lexicon.py
│ │ │ │ │ ├── filter_tsv.py
│ │ │ │ │ ├── g2p_wrd_to_phn.py
│ │ │ │ │ ├── ltr_to_wrd.py
│ │ │ │ │ ├── mean_pool.py
│ │ │ │ │ ├── merge_clusters.py
│ │ │ │ │ ├── normalize_and_filter_text.py
│ │ │ │ │ ├── normalize_text.py
│ │ │ │ │ ├── pca.py
│ │ │ │ │ ├── phonemize_with_sil.py
│ │ │ │ │ ├── prepare_audio.sh
│ │ │ │ │ ├── prepare_text.sh
│ │ │ │ │ ├── prepare_timit.sh
│ │ │ │ │ ├── remove_silence.py
│ │ │ │ │ ├── vads.py
│ │ │ │ │ ├── wav2vec_apply_cluster_faiss.py
│ │ │ │ │ ├── wav2vec_cluster_faiss.py
│ │ │ │ │ ├── wav2vec_extract_features.py
│ │ │ │ │ ├── wer.py
│ │ │ │ │ └── wrd_to_ltr.py
│ │ │ │ ├── tasks/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── unpaired_audio_text.py
│ │ │ │ └── w2vu_generate.py
│ │ │ ├── vq-wav2vec_featurize.py
│ │ │ ├── wav2vec_featurize.py
│ │ │ └── wav2vec_manifest.py
│ │ ├── wmt19/
│ │ │ └── README.md
│ │ ├── wmt20/
│ │ │ └── README.md
│ │ └── xlmr/
│ │ └── README.md
│ ├── fairseq/
│ │ ├── __init__.py
│ │ ├── benchmark/
│ │ │ ├── __init__.py
│ │ │ ├── dummy_dataset.py
│ │ │ ├── dummy_lm.py
│ │ │ ├── dummy_masked_lm.py
│ │ │ ├── dummy_model.py
│ │ │ └── dummy_mt.py
│ │ ├── binarizer.py
│ │ ├── checkpoint_utils.py
│ │ ├── clib/
│ │ │ ├── cuda/
│ │ │ │ ├── ngram_repeat_block_cuda.cpp
│ │ │ │ └── ngram_repeat_block_cuda_kernel.cu
│ │ │ ├── libbase/
│ │ │ │ └── balanced_assignment.cpp
│ │ │ ├── libbleu/
│ │ │ │ ├── libbleu.cpp
│ │ │ │ └── module.cpp
│ │ │ ├── libnat/
│ │ │ │ └── edit_dist.cpp
│ │ │ └── libnat_cuda/
│ │ │ ├── binding.cpp
│ │ │ ├── edit_dist.cu
│ │ │ └── edit_dist.h
│ │ ├── config/
│ │ │ ├── __init__.py
│ │ │ ├── config.yaml
│ │ │ └── model/
│ │ │ ├── transformer_lm/
│ │ │ │ ├── transformer_lm_baevski_gbw.yaml
│ │ │ │ ├── transformer_lm_baevski_wiki103.yaml
│ │ │ │ ├── transformer_lm_big.yaml
│ │ │ │ ├── transformer_lm_gbw.yaml
│ │ │ │ ├── transformer_lm_gpt.yaml
│ │ │ │ ├── transformer_lm_gpt2_big.yaml
│ │ │ │ ├── transformer_lm_gpt2_medium.yaml
│ │ │ │ ├── transformer_lm_gpt2_small.yaml
│ │ │ │ └── transformer_lm_wiki103.yaml
│ │ │ ├── wav2vec/
│ │ │ │ └── vq_wav2vec_gumbel.yaml
│ │ │ └── wav2vec2/
│ │ │ ├── wav2vec2_base.yaml
│ │ │ └── wav2vec2_large.yaml
│ │ ├── criterions/
│ │ │ ├── __init__.py
│ │ │ ├── adaptive_loss.py
│ │ │ ├── composite_loss.py
│ │ │ ├── cross_entropy.py
│ │ │ ├── ctc.py
│ │ │ ├── fairseq_criterion.py
│ │ │ ├── fastspeech2_loss.py
│ │ │ ├── hubert_criterion.py
│ │ │ ├── label_smoothed_cross_entropy.py
│ │ │ ├── label_smoothed_cross_entropy_latency_augmented.py
│ │ │ ├── label_smoothed_cross_entropy_with_alignment.py
│ │ │ ├── legacy_masked_lm.py
│ │ │ ├── masked_lm.py
│ │ │ ├── model_criterion.py
│ │ │ ├── nat_loss.py
│ │ │ ├── sentence_prediction.py
│ │ │ ├── sentence_ranking.py
│ │ │ ├── tacotron2_loss.py
│ │ │ └── wav2vec_criterion.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── add_target_dataset.py
│ │ │ ├── append_token_dataset.py
│ │ │ ├── audio/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── audio_utils.py
│ │ │ │ ├── data_cfg.py
│ │ │ │ ├── feature_transforms/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── global_cmvn.py
│ │ │ │ │ ├── specaugment.py
│ │ │ │ │ └── utterance_cmvn.py
│ │ │ │ ├── frm_text_to_speech_dataset.py
│ │ │ │ ├── hubert_dataset.py
│ │ │ │ ├── multi_modality_dataset.py
│ │ │ │ ├── raw_audio_dataset.py
│ │ │ │ ├── speech_to_text_dataset.py
│ │ │ │ ├── speech_to_text_joint_dataset.py
│ │ │ │ └── text_to_speech_dataset.py
│ │ │ ├── backtranslation_dataset.py
│ │ │ ├── base_wrapper_dataset.py
│ │ │ ├── bucket_pad_length_dataset.py
│ │ │ ├── colorize_dataset.py
│ │ │ ├── concat_dataset.py
│ │ │ ├── concat_sentences_dataset.py
│ │ │ ├── data_utils.py
│ │ │ ├── data_utils_fast.cpp
│ │ │ ├── data_utils_fast.pyx
│ │ │ ├── denoising_dataset.py
│ │ │ ├── dictionary.py
│ │ │ ├── encoders/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── byte_bpe.py
│ │ │ │ ├── byte_utils.py
│ │ │ │ ├── bytes.py
│ │ │ │ ├── characters.py
│ │ │ │ ├── fastbpe.py
│ │ │ │ ├── gpt2_bpe.py
│ │ │ │ ├── gpt2_bpe_utils.py
│ │ │ │ ├── hf_bert_bpe.py
│ │ │ │ ├── hf_byte_bpe.py
│ │ │ │ ├── moses_tokenizer.py
│ │ │ │ ├── nltk_tokenizer.py
│ │ │ │ ├── sentencepiece_bpe.py
│ │ │ │ ├── space_tokenizer.py
│ │ │ │ ├── subword_nmt_bpe.py
│ │ │ │ └── utils.py
│ │ │ ├── fairseq_dataset.py
│ │ │ ├── fasta_dataset.py
│ │ │ ├── huffman/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── huffman_coder.py
│ │ │ │ └── huffman_mmap_indexed_dataset.py
│ │ │ ├── id_dataset.py
│ │ │ ├── indexed_dataset.py
│ │ │ ├── iterators.py
│ │ │ ├── language_pair_dataset.py
│ │ │ ├── legacy/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── block_pair_dataset.py
│ │ │ │ ├── masked_lm_dataset.py
│ │ │ │ └── masked_lm_dictionary.py
│ │ │ ├── list_dataset.py
│ │ │ ├── lm_context_window_dataset.py
│ │ │ ├── lru_cache_dataset.py
│ │ │ ├── mask_tokens_dataset.py
│ │ │ ├── monolingual_dataset.py
│ │ │ ├── multi_corpus_dataset.py
│ │ │ ├── multi_corpus_sampled_dataset.py
│ │ │ ├── multilingual/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── multilingual_data_manager.py
│ │ │ │ ├── multilingual_utils.py
│ │ │ │ ├── sampled_multi_dataset.py
│ │ │ │ ├── sampled_multi_epoch_dataset.py
│ │ │ │ └── sampling_method.py
│ │ │ ├── nested_dictionary_dataset.py
│ │ │ ├── noising.py
│ │ │ ├── num_samples_dataset.py
│ │ │ ├── numel_dataset.py
│ │ │ ├── offset_tokens_dataset.py
│ │ │ ├── pad_dataset.py
│ │ │ ├── plasma_utils.py
│ │ │ ├── prepend_dataset.py
│ │ │ ├── prepend_token_dataset.py
│ │ │ ├── raw_label_dataset.py
│ │ │ ├── replace_dataset.py
│ │ │ ├── resampling_dataset.py
│ │ │ ├── roll_dataset.py
│ │ │ ├── round_robin_zip_datasets.py
│ │ │ ├── shorten_dataset.py
│ │ │ ├── sort_dataset.py
│ │ │ ├── squad/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── basic_tokenizer.py
│ │ │ │ ├── squad_extractor.py
│ │ │ │ └── squad_metrics.py
│ │ │ ├── strip_token_dataset.py
│ │ │ ├── subsample_dataset.py
│ │ │ ├── text_compressor.py
│ │ │ ├── token_block_dataset.py
│ │ │ ├── token_block_utils_fast.cpp
│ │ │ ├── token_block_utils_fast.pyx
│ │ │ ├── transform_eos_dataset.py
│ │ │ └── transform_eos_lang_pair_dataset.py
│ │ ├── dataclass/
│ │ │ ├── __init__.py
│ │ │ ├── configs.py
│ │ │ ├── constants.py
│ │ │ ├── initialize.py
│ │ │ └── utils.py
│ │ ├── distributed/
│ │ │ ├── __init__.py
│ │ │ ├── distributed_timeout_wrapper.py
│ │ │ ├── fully_sharded_data_parallel.py
│ │ │ ├── legacy_distributed_data_parallel.py
│ │ │ ├── module_proxy_wrapper.py
│ │ │ ├── tpu_distributed_data_parallel.py
│ │ │ └── utils.py
│ │ ├── file_chunker_utils.py
│ │ ├── file_io.py
│ │ ├── file_utils.py
│ │ ├── hub_utils.py
│ │ ├── incremental_decoding_utils.py
│ │ ├── iterative_refinement_generator.py
│ │ ├── logging/
│ │ │ ├── __init__.py
│ │ │ ├── meters.py
│ │ │ ├── metrics.py
│ │ │ └── progress_bar.py
│ │ ├── model_parallel/
│ │ │ ├── __init__.py
│ │ │ ├── criterions/
│ │ │ │ ├── __init__.py
│ │ │ │ └── vocab_parallel_cross_entropy.py
│ │ │ ├── megatron_trainer.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pipeline_parallel_transformer/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── layers.py
│ │ │ │ │ └── model.py
│ │ │ │ ├── roberta/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── model.py
│ │ │ │ ├── transformer.py
│ │ │ │ └── transformer_lm.py
│ │ │ └── modules/
│ │ │ ├── __init__.py
│ │ │ ├── multihead_attention.py
│ │ │ └── transformer_layer.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── bart/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── hub_interface.py
│ │ │ │ └── model.py
│ │ │ ├── composite_encoder.py
│ │ │ ├── distributed_fairseq_model.py
│ │ │ ├── ema/
│ │ │ │ ├── __init__.py
│ │ │ │ └── ema.py
│ │ │ ├── fairseq_decoder.py
│ │ │ ├── fairseq_encoder.py
│ │ │ ├── fairseq_incremental_decoder.py
│ │ │ ├── fairseq_model.py
│ │ │ ├── fconv.py
│ │ │ ├── fconv_lm.py
│ │ │ ├── fconv_self_att.py
│ │ │ ├── hubert/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── hubert.py
│ │ │ │ └── hubert_asr.py
│ │ │ ├── huggingface/
│ │ │ │ ├── __init__.py
│ │ │ │ └── hf_gpt2.py
│ │ │ ├── lightconv.py
│ │ │ ├── lightconv_lm.py
│ │ │ ├── lstm.py
│ │ │ ├── lstm_lm.py
│ │ │ ├── masked_lm.py
│ │ │ ├── model_utils.py
│ │ │ ├── multilingual_transformer.py
│ │ │ ├── nat/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cmlm_transformer.py
│ │ │ │ ├── fairseq_nat_model.py
│ │ │ │ ├── insertion_transformer.py
│ │ │ │ ├── iterative_nonautoregressive_transformer.py
│ │ │ │ ├── levenshtein_transformer.py
│ │ │ │ ├── levenshtein_utils.py
│ │ │ │ ├── nat_crf_transformer.py
│ │ │ │ ├── nonautoregressive_ensembles.py
│ │ │ │ └── nonautoregressive_transformer.py
│ │ │ ├── roberta/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── alignment_utils.py
│ │ │ │ ├── enc_dec.py
│ │ │ │ ├── hub_interface.py
│ │ │ │ ├── model.py
│ │ │ │ ├── model_camembert.py
│ │ │ │ ├── model_gottbert.py
│ │ │ │ └── model_xlmr.py
│ │ │ ├── speech_to_text/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── berard.py
│ │ │ │ ├── convtransformer.py
│ │ │ │ ├── modules/
│ │ │ │ │ ├── augmented_memory_attention.py
│ │ │ │ │ └── emformer.py
│ │ │ │ ├── s2t_transformer.py
│ │ │ │ ├── utils.py
│ │ │ │ └── xm_transformer.py
│ │ │ ├── text_to_speech/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── fastspeech2.py
│ │ │ │ ├── hifigan.py
│ │ │ │ ├── tacotron2.py
│ │ │ │ ├── tts_transformer.py
│ │ │ │ └── vocoder.py
│ │ │ ├── transformer/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── transformer_base.py
│ │ │ │ ├── transformer_config.py
│ │ │ │ ├── transformer_decoder.py
│ │ │ │ ├── transformer_encoder.py
│ │ │ │ └── transformer_legacy.py
│ │ │ ├── transformer_align.py
│ │ │ ├── transformer_from_pretrained_xlm.py
│ │ │ ├── transformer_lm.py
│ │ │ └── wav2vec/
│ │ │ ├── __init__.py
│ │ │ ├── wav2vec.py
│ │ │ ├── wav2vec2.py
│ │ │ └── wav2vec2_asr.py
│ │ ├── modules/
│ │ │ ├── __init__.py
│ │ │ ├── adaptive_input.py
│ │ │ ├── adaptive_softmax.py
│ │ │ ├── base_layer.py
│ │ │ ├── beamable_mm.py
│ │ │ ├── character_token_embedder.py
│ │ │ ├── checkpoint_activations.py
│ │ │ ├── conv_tbc.py
│ │ │ ├── cross_entropy.py
│ │ │ ├── cuda_utils.cu
│ │ │ ├── downsampled_multihead_attention.py
│ │ │ ├── dynamic_convolution.py
│ │ │ ├── dynamic_crf_layer.py
│ │ │ ├── dynamicconv_layer/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cuda_function_gen.py
│ │ │ │ ├── dynamicconv_cuda.cpp
│ │ │ │ ├── dynamicconv_cuda.cuh
│ │ │ │ ├── dynamicconv_cuda_kernel.cu
│ │ │ │ ├── dynamicconv_layer.py
│ │ │ │ ├── dynamiconv_cpu.cpp
│ │ │ │ └── setup.py
│ │ │ ├── fairseq_dropout.py
│ │ │ ├── fp32_group_norm.py
│ │ │ ├── gelu.py
│ │ │ ├── grad_multiply.py
│ │ │ ├── gumbel_vector_quantizer.py
│ │ │ ├── kmeans_attention.py
│ │ │ ├── kmeans_vector_quantizer.py
│ │ │ ├── layer_drop.py
│ │ │ ├── layer_norm.py
│ │ │ ├── learned_positional_embedding.py
│ │ │ ├── lightconv_layer/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cuda_function_gen.py
│ │ │ │ ├── lightconv_cuda.cpp
│ │ │ │ ├── lightconv_cuda.cuh
│ │ │ │ ├── lightconv_cuda_kernel.cu
│ │ │ │ ├── lightconv_layer.py
│ │ │ │ └── setup.py
│ │ │ ├── lightweight_convolution.py
│ │ │ ├── linearized_convolution.py
│ │ │ ├── location_attention.py
│ │ │ ├── lora.py
│ │ │ ├── lstm_cell_with_zoneout.py
│ │ │ ├── multihead_attention.py
│ │ │ ├── positional_embedding.py
│ │ │ ├── quant_noise.py
│ │ │ ├── quantization/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pq/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── em.py
│ │ │ │ │ ├── modules/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── qconv.py
│ │ │ │ │ │ ├── qemb.py
│ │ │ │ │ │ └── qlinear.py
│ │ │ │ │ ├── pq.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── quantization_options.py
│ │ │ │ └── scalar/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── modules/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── qact.py
│ │ │ │ │ ├── qconv.py
│ │ │ │ │ ├── qemb.py
│ │ │ │ │ └── qlinear.py
│ │ │ │ ├── ops.py
│ │ │ │ └── utils.py
│ │ │ ├── same_pad.py
│ │ │ ├── scalar_bias.py
│ │ │ ├── sinusoidal_positional_embedding.py
│ │ │ ├── sparse_multihead_attention.py
│ │ │ ├── sparse_transformer_sentence_encoder.py
│ │ │ ├── sparse_transformer_sentence_encoder_layer.py
│ │ │ ├── transformer_layer.py
│ │ │ ├── transformer_sentence_encoder.py
│ │ │ ├── transformer_sentence_encoder_layer.py
│ │ │ ├── transpose_last.py
│ │ │ ├── unfold.py
│ │ │ └── vggblock.py
│ │ ├── nan_detector.py
│ │ ├── ngram_repeat_block.py
│ │ ├── optim/
│ │ │ ├── __init__.py
│ │ │ ├── adadelta.py
│ │ │ ├── adafactor.py
│ │ │ ├── adagrad.py
│ │ │ ├── adam.py
│ │ │ ├── adamax.py
│ │ │ ├── amp_optimizer.py
│ │ │ ├── bmuf.py
│ │ │ ├── composite.py
│ │ │ ├── cpu_adam.py
│ │ │ ├── dynamic_loss_scaler.py
│ │ │ ├── fairseq_optimizer.py
│ │ │ ├── fp16_optimizer.py
│ │ │ ├── fused_adam.py
│ │ │ ├── fused_lamb.py
│ │ │ ├── lr_scheduler/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cosine_lr_scheduler.py
│ │ │ │ ├── fairseq_lr_scheduler.py
│ │ │ │ ├── fixed_schedule.py
│ │ │ │ ├── inverse_square_root_schedule.py
│ │ │ │ ├── manual_lr_scheduler.py
│ │ │ │ ├── pass_through.py
│ │ │ │ ├── polynomial_decay_schedule.py
│ │ │ │ ├── reduce_lr_on_plateau.py
│ │ │ │ ├── step_lr_scheduler.py
│ │ │ │ ├── tri_stage_lr_scheduler.py
│ │ │ │ └── triangular_lr_scheduler.py
│ │ │ ├── nag.py
│ │ │ ├── sgd.py
│ │ │ └── shard.py
│ │ ├── options.py
│ │ ├── pdb.py
│ │ ├── quantization_utils.py
│ │ ├── registry.py
│ │ ├── scoring/
│ │ │ ├── __init__.py
│ │ │ ├── bleu.py
│ │ │ ├── chrf.py
│ │ │ ├── tokenizer.py
│ │ │ └── wer.py
│ │ ├── search.py
│ │ ├── sequence_generator.py
│ │ ├── sequence_scorer.py
│ │ ├── speech_generator.py
│ │ ├── tasks/
│ │ │ ├── __init__.py
│ │ │ ├── audio_finetuning.py
│ │ │ ├── audio_pretraining.py
│ │ │ ├── cross_lingual_lm.py
│ │ │ ├── denoising.py
│ │ │ ├── fairseq_task.py
│ │ │ ├── frm_text_to_speech.py
│ │ │ ├── hubert_pretraining.py
│ │ │ ├── language_modeling.py
│ │ │ ├── legacy_masked_lm.py
│ │ │ ├── masked_lm.py
│ │ │ ├── multilingual_denoising.py
│ │ │ ├── multilingual_masked_lm.py
│ │ │ ├── multilingual_translation.py
│ │ │ ├── online_backtranslation.py
│ │ │ ├── semisupervised_translation.py
│ │ │ ├── sentence_prediction.py
│ │ │ ├── sentence_ranking.py
│ │ │ ├── simultaneous_translation.py
│ │ │ ├── speech_to_text.py
│ │ │ ├── text_to_speech.py
│ │ │ ├── translation.py
│ │ │ ├── translation_from_pretrained_bart.py
│ │ │ ├── translation_from_pretrained_xlm.py
│ │ │ ├── translation_lev.py
│ │ │ └── translation_multi_simple_epoch.py
│ │ ├── token_generation_constraints.py
│ │ ├── tokenizer.py
│ │ ├── trainer.py
│ │ ├── utils.py
│ │ ├── version.py
│ │ └── version.txt
│ ├── fairseq_cli/
│ │ ├── __init__.py
│ │ ├── eval_lm.py
│ │ ├── generate.py
│ │ ├── hydra_train.py
│ │ ├── interactive.py
│ │ ├── preprocess.py
│ │ ├── score.py
│ │ ├── train.py
│ │ └── validate.py
│ ├── hubconf.py
│ ├── pyproject.toml
│ ├── scripts/
│ │ ├── __init__.py
│ │ ├── average_checkpoints.py
│ │ ├── build_sym_alignment.py
│ │ ├── compare_namespaces.py
│ │ ├── compound_split_bleu.sh
│ │ ├── constraints/
│ │ │ ├── extract.py
│ │ │ └── validate.py
│ │ ├── convert_dictionary.lua
│ │ ├── convert_model.lua
│ │ ├── count_docs.py
│ │ ├── read_binarized.py
│ │ ├── rm_pt.py
│ │ ├── sacrebleu.sh
│ │ ├── shard_docs.py
│ │ ├── split_train_valid_docs.py
│ │ ├── spm_decode.py
│ │ ├── spm_encode.py
│ │ ├── spm_train.py
│ │ └── test_fsdp.sh
│ ├── setup.py
│ ├── tests/
│ │ ├── __init__.py
│ │ ├── distributed/
│ │ │ ├── __init__.py
│ │ │ ├── test_bmuf.py
│ │ │ ├── test_distributed_timeout_wrapper.py
│ │ │ ├── test_module_proxy_wrapper.py
│ │ │ ├── test_utils.py
│ │ │ └── utils.py
│ │ ├── gpu/
│ │ │ ├── __init__.py
│ │ │ ├── test_binaries_gpu.py
│ │ │ ├── test_ema_gpu.py
│ │ │ └── transformer_quantization_config.yaml
│ │ ├── speech_recognition/
│ │ │ ├── __init__.py
│ │ │ ├── asr_test_base.py
│ │ │ ├── test_collaters.py
│ │ │ ├── test_cross_entropy.py
│ │ │ ├── test_data_utils.py
│ │ │ └── test_vggtransformer.py
│ │ ├── test_activation_checkpointing.py
│ │ ├── test_amp_optimizer.py
│ │ ├── test_average_checkpoints.py
│ │ ├── test_backtranslation_dataset.py
│ │ ├── test_binaries.py
│ │ ├── test_character_token_embedder.py
│ │ ├── test_checkpoint_utils.py
│ │ ├── test_concat_dataset.py
│ │ ├── test_constraints.py
│ │ ├── test_convtbc.py
│ │ ├── test_data_utils.py
│ │ ├── test_dataclass_utils.py
│ │ ├── test_dataset.py
│ │ ├── test_dictionary.py
│ │ ├── test_ema.py
│ │ ├── test_export.py
│ │ ├── test_file_chunker_utils.py
│ │ ├── test_file_io.py
│ │ ├── test_fp16_optimizer.py
│ │ ├── test_huffman.py
│ │ ├── test_inference_dropout.py
│ │ ├── test_iopath.py
│ │ ├── test_iterators.py
│ │ ├── test_label_smoothing.py
│ │ ├── test_lm_context_window.py
│ │ ├── test_lstm_jitable.py
│ │ ├── test_memory_efficient_fp16.py
│ │ ├── test_metrics.py
│ │ ├── test_multi_corpus_dataset.py
│ │ ├── test_multi_corpus_sampled_dataset.py
│ │ ├── test_multihead_attention.py
│ │ ├── test_noising.py
│ │ ├── test_online_backtranslation.py
│ │ ├── test_plasma_utils.py
│ │ ├── test_reproducibility.py
│ │ ├── test_resampling_dataset.py
│ │ ├── test_roberta.py
│ │ ├── test_sequence_generator.py
│ │ ├── test_sequence_scorer.py
│ │ ├── test_sparse_multihead_attention.py
│ │ ├── test_token_block_dataset.py
│ │ ├── test_train.py
│ │ ├── test_transformer.py
│ │ ├── test_utils.py
│ │ ├── test_valid_subset_checks.py
│ │ └── utils.py
│ └── train.py
├── glan/
│ └── README.md
├── infoxlm/
│ ├── README.md
│ ├── fairseq/
│ │ ├── .gitignore
│ │ ├── CODE_OF_CONDUCT.md
│ │ ├── CONTRIBUTING.md
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── docs/
│ │ │ ├── Makefile
│ │ │ ├── _static/
│ │ │ │ └── theme_overrides.css
│ │ │ ├── command_line_tools.rst
│ │ │ ├── conf.py
│ │ │ ├── criterions.rst
│ │ │ ├── data.rst
│ │ │ ├── docutils.conf
│ │ │ ├── getting_started.rst
│ │ │ ├── index.rst
│ │ │ ├── lr_scheduler.rst
│ │ │ ├── make.bat
│ │ │ ├── models.rst
│ │ │ ├── modules.rst
│ │ │ ├── optim.rst
│ │ │ ├── overview.rst
│ │ │ ├── requirements.txt
│ │ │ ├── tasks.rst
│ │ │ ├── tutorial_classifying_names.rst
│ │ │ └── tutorial_simple_lstm.rst
│ │ ├── eval_lm.py
│ │ ├── examples/
│ │ │ ├── .gitignore
│ │ │ ├── __init__.py
│ │ │ ├── backtranslation/
│ │ │ │ └── README.md
│ │ │ ├── bart/
│ │ │ │ ├── README.cnn.md
│ │ │ │ ├── README.glue.md
│ │ │ │ └── README.md
│ │ │ ├── camembert/
│ │ │ │ └── README.md
│ │ │ ├── conv_seq2seq/
│ │ │ │ └── README.md
│ │ │ ├── cross_lingual_language_model/
│ │ │ │ └── README.md
│ │ │ ├── joint_alignment_translation/
│ │ │ │ ├── README.md
│ │ │ │ └── prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh
│ │ │ ├── language_model/
│ │ │ │ ├── README.md
│ │ │ │ ├── conv_lm/
│ │ │ │ │ └── README.md
│ │ │ │ ├── prepare-wikitext-103.sh
│ │ │ │ └── transformer_lm/
│ │ │ │ └── README.md
│ │ │ ├── layerdrop/
│ │ │ │ └── README.md
│ │ │ ├── noisychannel/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── rerank.py
│ │ │ │ ├── rerank_generate.py
│ │ │ │ ├── rerank_options.py
│ │ │ │ ├── rerank_score_bw.py
│ │ │ │ ├── rerank_score_lm.py
│ │ │ │ ├── rerank_tune.py
│ │ │ │ └── rerank_utils.py
│ │ │ ├── nonautoregressive_translation/
│ │ │ │ ├── README.md
│ │ │ │ └── scripts.md
│ │ │ ├── pay_less_attention_paper/
│ │ │ │ └── README.md
│ │ │ ├── roberta/
│ │ │ │ ├── README.custom_classification.md
│ │ │ │ ├── README.glue.md
│ │ │ │ ├── README.md
│ │ │ │ ├── README.pretraining.md
│ │ │ │ ├── README.race.md
│ │ │ │ ├── commonsense_qa/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── commonsense_qa_task.py
│ │ │ │ │ └── download_cqa_data.sh
│ │ │ │ ├── multiprocessing_bpe_encoder.py
│ │ │ │ ├── preprocess_GLUE_tasks.sh
│ │ │ │ ├── preprocess_RACE.py
│ │ │ │ ├── preprocess_RACE.sh
│ │ │ │ └── wsc/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── wsc_criterion.py
│ │ │ │ ├── wsc_task.py
│ │ │ │ └── wsc_utils.py
│ │ │ ├── scaling_nmt/
│ │ │ │ └── README.md
│ │ │ ├── speech_recognition/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── criterions/
│ │ │ │ │ ├── ASG_loss.py
│ │ │ │ │ ├── CTC_loss.py
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── cross_entropy_acc.py
│ │ │ │ ├── data/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── asr_dataset.py
│ │ │ │ │ ├── collaters.py
│ │ │ │ │ ├── data_utils.py
│ │ │ │ │ └── replabels.py
│ │ │ │ ├── datasets/
│ │ │ │ │ ├── asr_prep_json.py
│ │ │ │ │ └── prepare-librispeech.sh
│ │ │ │ ├── infer.py
│ │ │ │ ├── models/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── vggtransformer.py
│ │ │ │ │ └── w2l_conv_glu_enc.py
│ │ │ │ ├── tasks/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── speech_recognition.py
│ │ │ │ ├── utils/
│ │ │ │ │ └── wer_utils.py
│ │ │ │ └── w2l_decoder.py
│ │ │ ├── stories/
│ │ │ │ └── README.md
│ │ │ ├── translation/
│ │ │ │ ├── README.md
│ │ │ │ ├── prepare-iwslt14.sh
│ │ │ │ ├── prepare-iwslt17-multilingual.sh
│ │ │ │ ├── prepare-wmt14en2de.sh
│ │ │ │ └── prepare-wmt14en2fr.sh
│ │ │ ├── translation_moe/
│ │ │ │ ├── README.md
│ │ │ │ └── score.py
│ │ │ ├── wav2vec/
│ │ │ │ └── README.md
│ │ │ ├── wmt19/
│ │ │ │ └── README.md
│ │ │ └── xlmr/
│ │ │ └── README.md
│ │ ├── fairseq/
│ │ │ ├── __init__.py
│ │ │ ├── binarizer.py
│ │ │ ├── bleu.py
│ │ │ ├── checkpoint_utils.py
│ │ │ ├── clib/
│ │ │ │ ├── libbleu/
│ │ │ │ │ ├── libbleu.cpp
│ │ │ │ │ └── module.cpp
│ │ │ │ └── libnat/
│ │ │ │ └── edit_dist.cpp
│ │ │ ├── criterions/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adaptive_loss.py
│ │ │ │ ├── binary_cross_entropy.py
│ │ │ │ ├── composite_loss.py
│ │ │ │ ├── cross_entropy.py
│ │ │ │ ├── fairseq_criterion.py
│ │ │ │ ├── label_smoothed_cross_entropy.py
│ │ │ │ ├── label_smoothed_cross_entropy_with_alignment.py
│ │ │ │ ├── legacy_masked_lm.py
│ │ │ │ ├── masked_lm.py
│ │ │ │ ├── nat_loss.py
│ │ │ │ ├── sentence_prediction.py
│ │ │ │ └── sentence_ranking.py
│ │ │ ├── data/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── append_token_dataset.py
│ │ │ │ ├── audio/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── raw_audio_dataset.py
│ │ │ │ ├── backtranslation_dataset.py
│ │ │ │ ├── base_wrapper_dataset.py
│ │ │ │ ├── colorize_dataset.py
│ │ │ │ ├── concat_dataset.py
│ │ │ │ ├── concat_sentences_dataset.py
│ │ │ │ ├── data_utils.py
│ │ │ │ ├── data_utils_fast.pyx
│ │ │ │ ├── denoising_dataset.py
│ │ │ │ ├── dictionary.py
│ │ │ │ ├── encoders/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── fastbpe.py
│ │ │ │ │ ├── gpt2_bpe.py
│ │ │ │ │ ├── gpt2_bpe_utils.py
│ │ │ │ │ ├── hf_bert_bpe.py
│ │ │ │ │ ├── moses_tokenizer.py
│ │ │ │ │ ├── nltk_tokenizer.py
│ │ │ │ │ ├── sentencepiece_bpe.py
│ │ │ │ │ ├── space_tokenizer.py
│ │ │ │ │ ├── subword_nmt_bpe.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── fairseq_dataset.py
│ │ │ │ ├── id_dataset.py
│ │ │ │ ├── indexed_dataset.py
│ │ │ │ ├── iterators.py
│ │ │ │ ├── language_pair_dataset.py
│ │ │ │ ├── legacy/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── block_pair_dataset.py
│ │ │ │ │ ├── masked_lm_dataset.py
│ │ │ │ │ └── masked_lm_dictionary.py
│ │ │ │ ├── list_dataset.py
│ │ │ │ ├── lm_context_window_dataset.py
│ │ │ │ ├── lru_cache_dataset.py
│ │ │ │ ├── mask_tokens_dataset.py
│ │ │ │ ├── monolingual_dataset.py
│ │ │ │ ├── multi_corpus_sampled_dataset.py
│ │ │ │ ├── nested_dictionary_dataset.py
│ │ │ │ ├── noising.py
│ │ │ │ ├── num_samples_dataset.py
│ │ │ │ ├── numel_dataset.py
│ │ │ │ ├── offset_tokens_dataset.py
│ │ │ │ ├── pad_dataset.py
│ │ │ │ ├── plasma_utils.py
│ │ │ │ ├── prepend_dataset.py
│ │ │ │ ├── prepend_token_dataset.py
│ │ │ │ ├── raw_label_dataset.py
│ │ │ │ ├── replace_dataset.py
│ │ │ │ ├── resampling_dataset.py
│ │ │ │ ├── roll_dataset.py
│ │ │ │ ├── round_robin_zip_datasets.py
│ │ │ │ ├── sharded_dataset.py
│ │ │ │ ├── sort_dataset.py
│ │ │ │ ├── strip_token_dataset.py
│ │ │ │ ├── subsample_dataset.py
│ │ │ │ ├── token_block_dataset.py
│ │ │ │ ├── token_block_utils_fast.pyx
│ │ │ │ ├── transform_eos_dataset.py
│ │ │ │ ├── transform_eos_lang_pair_dataset.py
│ │ │ │ └── truncate_dataset.py
│ │ │ ├── distributed_utils.py
│ │ │ ├── file_utils.py
│ │ │ ├── hub_utils.py
│ │ │ ├── iterative_refinement_generator.py
│ │ │ ├── legacy_distributed_data_parallel.py
│ │ │ ├── meters.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bart/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── hub_interface.py
│ │ │ │ │ └── model.py
│ │ │ │ ├── cmlm_transformer.py
│ │ │ │ ├── composite_encoder.py
│ │ │ │ ├── distributed_fairseq_model.py
│ │ │ │ ├── fairseq_decoder.py
│ │ │ │ ├── fairseq_encoder.py
│ │ │ │ ├── fairseq_incremental_decoder.py
│ │ │ │ ├── fairseq_model.py
│ │ │ │ ├── fconv.py
│ │ │ │ ├── fconv_lm.py
│ │ │ │ ├── fconv_self_att.py
│ │ │ │ ├── insertion_transformer.py
│ │ │ │ ├── iterative_nonautoregressive_transformer.py
│ │ │ │ ├── levenshtein_transformer.py
│ │ │ │ ├── lightconv.py
│ │ │ │ ├── lightconv_lm.py
│ │ │ │ ├── lstm.py
│ │ │ │ ├── masked_lm.py
│ │ │ │ ├── model_utils.py
│ │ │ │ ├── multilingual_transformer.py
│ │ │ │ ├── nonautoregressive_ensembles.py
│ │ │ │ ├── nonautoregressive_transformer.py
│ │ │ │ ├── roberta/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── alignment_utils.py
│ │ │ │ │ ├── hub_interface.py
│ │ │ │ │ └── model.py
│ │ │ │ ├── transformer.py
│ │ │ │ ├── transformer_from_pretrained_xlm.py
│ │ │ │ ├── transformer_lm.py
│ │ │ │ └── wav2vec.py
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adaptive_input.py
│ │ │ │ ├── adaptive_softmax.py
│ │ │ │ ├── beamable_mm.py
│ │ │ │ ├── character_token_embedder.py
│ │ │ │ ├── conv_tbc.py
│ │ │ │ ├── cuda_utils.cu
│ │ │ │ ├── downsampled_multihead_attention.py
│ │ │ │ ├── dynamic_convolution.py
│ │ │ │ ├── dynamicconv_layer/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cuda_function_gen.py
│ │ │ │ │ ├── dynamicconv_cuda.cpp
│ │ │ │ │ ├── dynamicconv_cuda.cuh
│ │ │ │ │ ├── dynamicconv_cuda_kernel.cu
│ │ │ │ │ ├── dynamicconv_layer.py
│ │ │ │ │ ├── dynamiconv_cpu.cpp
│ │ │ │ │ └── setup.py
│ │ │ │ ├── gelu.py
│ │ │ │ ├── grad_multiply.py
│ │ │ │ ├── highway.py
│ │ │ │ ├── layer_norm.py
│ │ │ │ ├── learned_positional_embedding.py
│ │ │ │ ├── lightconv_layer/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cuda_function_gen.py
│ │ │ │ │ ├── lightconv_cuda.cpp
│ │ │ │ │ ├── lightconv_cuda.cuh
│ │ │ │ │ ├── lightconv_cuda_kernel.cu
│ │ │ │ │ ├── lightconv_layer.py
│ │ │ │ │ └── setup.py
│ │ │ │ ├── lightweight_convolution.py
│ │ │ │ ├── linearized_convolution.py
│ │ │ │ ├── logsumexp_moe.py
│ │ │ │ ├── mean_pool_gating_network.py
│ │ │ │ ├── multihead_attention.py
│ │ │ │ ├── positional_embedding.py
│ │ │ │ ├── scalar_bias.py
│ │ │ │ ├── sinusoidal_positional_embedding.py
│ │ │ │ ├── sparse_multihead_attention.py
│ │ │ │ ├── sparse_transformer_sentence_encoder.py
│ │ │ │ ├── sparse_transformer_sentence_encoder_layer.py
│ │ │ │ ├── transformer_layer.py
│ │ │ │ ├── transformer_sentence_encoder.py
│ │ │ │ ├── transformer_sentence_encoder_layer.py
│ │ │ │ ├── unfold.py
│ │ │ │ └── vggblock.py
│ │ │ ├── optim/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adadelta.py
│ │ │ │ ├── adafactor.py
│ │ │ │ ├── adagrad.py
│ │ │ │ ├── adam.py
│ │ │ │ ├── adamax.py
│ │ │ │ ├── bmuf.py
│ │ │ │ ├── fairseq_optimizer.py
│ │ │ │ ├── fp16_optimizer.py
│ │ │ │ ├── lr_scheduler/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cosine_lr_scheduler.py
│ │ │ │ │ ├── fairseq_lr_scheduler.py
│ │ │ │ │ ├── fixed_schedule.py
│ │ │ │ │ ├── inverse_square_root_schedule.py
│ │ │ │ │ ├── polynomial_decay_schedule.py
│ │ │ │ │ ├── reduce_lr_on_plateau.py
│ │ │ │ │ ├── tri_stage_lr_scheduler.py
│ │ │ │ │ └── triangular_lr_scheduler.py
│ │ │ │ ├── nag.py
│ │ │ │ └── sgd.py
│ │ │ ├── options.py
│ │ │ ├── pdb.py
│ │ │ ├── progress_bar.py
│ │ │ ├── registry.py
│ │ │ ├── search.py
│ │ │ ├── sequence_generator.py
│ │ │ ├── sequence_scorer.py
│ │ │ ├── tasks/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── audio_pretraining.py
│ │ │ │ ├── cross_lingual_lm.py
│ │ │ │ ├── denoising.py
│ │ │ │ ├── fairseq_task.py
│ │ │ │ ├── language_modeling.py
│ │ │ │ ├── legacy_masked_lm.py
│ │ │ │ ├── masked_lm.py
│ │ │ │ ├── multilingual_masked_lm.py
│ │ │ │ ├── multilingual_translation.py
│ │ │ │ ├── semisupervised_translation.py
│ │ │ │ ├── sentence_prediction.py
│ │ │ │ ├── sentence_ranking.py
│ │ │ │ ├── translation.py
│ │ │ │ ├── translation_from_pretrained_xlm.py
│ │ │ │ ├── translation_lev.py
│ │ │ │ └── translation_moe.py
│ │ │ ├── tokenizer.py
│ │ │ ├── trainer.py
│ │ │ └── utils.py
│ │ ├── fairseq_cli/
│ │ │ └── __init__.py
│ │ ├── generate.py
│ │ ├── hubconf.py
│ │ ├── interactive.py
│ │ ├── preprocess.py
│ │ ├── score.py
│ │ ├── scripts/
│ │ │ ├── __init__.py
│ │ │ ├── average_checkpoints.py
│ │ │ ├── build_sym_alignment.py
│ │ │ ├── compare_namespaces.py
│ │ │ ├── compound_split_bleu.sh
│ │ │ ├── convert_dictionary.lua
│ │ │ ├── convert_model.lua
│ │ │ ├── count_docs.py
│ │ │ ├── read_binarized.py
│ │ │ ├── rm_pt.py
│ │ │ ├── sacrebleu_pregen.sh
│ │ │ ├── shard_docs.py
│ │ │ ├── split_train_valid_docs.py
│ │ │ ├── spm_decode.py
│ │ │ ├── spm_encode.py
│ │ │ ├── spm_train.py
│ │ │ ├── wav2vec_featurize.py
│ │ │ └── wav2vec_manifest.py
│ │ ├── setup.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ ├── speech_recognition/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── asr_test_base.py
│ │ │ │ ├── test_collaters.py
│ │ │ │ ├── test_cross_entropy.py
│ │ │ │ └── test_vggtransformer.py
│ │ │ ├── test_average_checkpoints.py
│ │ │ ├── test_backtranslation_dataset.py
│ │ │ ├── test_binaries.py
│ │ │ ├── test_bmuf.py
│ │ │ ├── test_character_token_embedder.py
│ │ │ ├── test_concat_dataset.py
│ │ │ ├── test_convtbc.py
│ │ │ ├── test_dictionary.py
│ │ │ ├── test_iterators.py
│ │ │ ├── test_label_smoothing.py
│ │ │ ├── test_memory_efficient_fp16.py
│ │ │ ├── test_multi_corpus_sampled_dataset.py
│ │ │ ├── test_multihead_attention.py
│ │ │ ├── test_noising.py
│ │ │ ├── test_reproducibility.py
│ │ │ ├── test_resampling_dataset.py
│ │ │ ├── test_sequence_generator.py
│ │ │ ├── test_sequence_scorer.py
│ │ │ ├── test_sparse_multihead_attention.py
│ │ │ ├── test_token_block_dataset.py
│ │ │ ├── test_train.py
│ │ │ ├── test_utils.py
│ │ │ └── utils.py
│ │ ├── train.py
│ │ └── validate.py
│ ├── src-infoxlm/
│ │ ├── infoxlm/
│ │ │ ├── __init__.py
│ │ │ ├── criterions/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── xlco.py
│ │ │ │ └── xlm_align.py
│ │ │ ├── data/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dict_dataset.py
│ │ │ │ ├── mlm_utils.py
│ │ │ │ ├── offset_dataset.py
│ │ │ │ ├── tlm_dataset.py
│ │ │ │ ├── xlco_dataset.py
│ │ │ │ └── xlm_align.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── infoxlm.py
│ │ │ │ ├── roberta.py
│ │ │ │ └── xlm_align.py
│ │ │ ├── tasks/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── infoxlm.py
│ │ │ │ ├── mlm.py
│ │ │ │ ├── tlm.py
│ │ │ │ └── xlm_align.py
│ │ │ └── utils.py
│ │ ├── setup.py
│ │ └── train.py
│ └── tools/
│ ├── para2bin.py
│ ├── para2bin4xlco.py
│ └── txt2bin.py
├── kosmos-1/
│ └── README.md
├── kosmos-2/
│ ├── README.md
│ ├── data/
│ │ ├── dict.txt
│ │ ├── generate_config.py
│ │ ├── prepare_grit.py
│ │ ├── sentencepiece.bpe.model
│ │ └── visualize_grit.py
│ ├── demo/
│ │ ├── decode_string.py
│ │ ├── draw_box.py
│ │ └── gradio_app.py
│ ├── docs/
│ │ └── install.md
│ ├── evaluation/
│ │ ├── caption_obj_few_shot.py
│ │ ├── caption_obj_qa.py
│ │ ├── flickr_entities/
│ │ │ ├── README.md
│ │ │ ├── cook_data.py
│ │ │ ├── decode_string.py
│ │ │ └── flickr_entities_evaluate.py
│ │ ├── grd-zeroshot-flickr.sh
│ │ ├── grd-zeroshot-refcoco.sh
│ │ ├── refcoco/
│ │ │ ├── README.md
│ │ │ ├── box_ops.py
│ │ │ ├── cook_data.py
│ │ │ ├── decode_string.py
│ │ │ └── refexp_evaluate.py
│ │ ├── seed-bench/
│ │ │ ├── README.md
│ │ │ ├── cook_image_data.py
│ │ │ └── eval_ppl.py
│ │ └── zeroshot-seed-bench.sh
│ ├── fairseq/
│ │ ├── .circleci/
│ │ │ └── config.yml
│ │ ├── .github/
│ │ │ ├── ISSUE_TEMPLATE/
│ │ │ │ ├── bug_report.md
│ │ │ │ ├── documentation.md
│ │ │ │ ├── feature_request.md
│ │ │ │ └── how-to-question.md
│ │ │ ├── ISSUE_TEMPLATE.md
│ │ │ ├── PULL_REQUEST_TEMPLATE.md
│ │ │ ├── stale.yml
│ │ │ └── workflows/
│ │ │ ├── build.yml
│ │ │ └── build_wheels.yml
│ │ ├── .gitignore
│ │ ├── .gitmodules
│ │ ├── .isort.cfg
│ │ ├── .pre-commit-config.yaml
│ │ ├── CODE_OF_CONDUCT.md
│ │ ├── CONTRIBUTING.md
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── docs/
│ │ │ ├── Makefile
│ │ │ ├── _static/
│ │ │ │ └── theme_overrides.css
│ │ │ ├── command_line_tools.rst
│ │ │ ├── conf.py
│ │ │ ├── criterions.rst
│ │ │ ├── data.rst
│ │ │ ├── docutils.conf
│ │ │ ├── getting_started.rst
│ │ │ ├── hydra_integration.md
│ │ │ ├── index.rst
│ │ │ ├── lr_scheduler.rst
│ │ │ ├── make.bat
│ │ │ ├── models.rst
│ │ │ ├── modules.rst
│ │ │ ├── optim.rst
│ │ │ ├── overview.rst
│ │ │ ├── requirements.txt
│ │ │ ├── tasks.rst
│ │ │ ├── tutorial_classifying_names.rst
│ │ │ └── tutorial_simple_lstm.rst
│ │ ├── examples/
│ │ │ ├── .gitignore
│ │ │ ├── MMPT/
│ │ │ │ ├── .gitignore
│ │ │ │ ├── CONFIG.md
│ │ │ │ ├── DATASET.md
│ │ │ │ ├── README.md
│ │ │ │ ├── endtask.md
│ │ │ │ ├── locallaunch.py
│ │ │ │ ├── mmpt/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── datasets/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── fairseqmmdataset.py
│ │ │ │ │ │ └── mmdataset.py
│ │ │ │ │ ├── evaluators/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── evaluator.py
│ │ │ │ │ │ ├── metric.py
│ │ │ │ │ │ └── predictor.py
│ │ │ │ │ ├── losses/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── fairseqmmloss.py
│ │ │ │ │ │ ├── loss.py
│ │ │ │ │ │ └── nce.py
│ │ │ │ │ ├── models/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── fairseqmmmodel.py
│ │ │ │ │ │ ├── mmfusion.py
│ │ │ │ │ │ ├── mmfusionnlg.py
│ │ │ │ │ │ └── transformermodel.py
│ │ │ │ │ ├── modules/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── mm.py
│ │ │ │ │ │ ├── retri.py
│ │ │ │ │ │ └── vectorpool.py
│ │ │ │ │ ├── processors/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── dedupprocessor.py
│ │ │ │ │ │ ├── dsprocessor.py
│ │ │ │ │ │ ├── how2processor.py
│ │ │ │ │ │ ├── how2retriprocessor.py
│ │ │ │ │ │ ├── models/
│ │ │ │ │ │ │ └── s3dg.py
│ │ │ │ │ │ └── processor.py
│ │ │ │ │ ├── tasks/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── fairseqmmtask.py
│ │ │ │ │ │ ├── milncetask.py
│ │ │ │ │ │ ├── retritask.py
│ │ │ │ │ │ ├── task.py
│ │ │ │ │ │ └── vlmtask.py
│ │ │ │ │ └── utils/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── load_config.py
│ │ │ │ │ └── shardedtensor.py
│ │ │ │ ├── mmpt_cli/
│ │ │ │ │ ├── localjob.py
│ │ │ │ │ └── predict.py
│ │ │ │ ├── pretraining.md
│ │ │ │ ├── projects/
│ │ │ │ │ ├── mfmmlm.yaml
│ │ │ │ │ ├── mtm/
│ │ │ │ │ │ ├── mmfusionmtm.yaml
│ │ │ │ │ │ ├── vlm/
│ │ │ │ │ │ │ ├── coin.yaml
│ │ │ │ │ │ │ ├── crosstask.yaml
│ │ │ │ │ │ │ ├── how2.yaml
│ │ │ │ │ │ │ ├── test_coin.yaml
│ │ │ │ │ │ │ ├── test_crosstask.yaml
│ │ │ │ │ │ │ ├── test_crosstask_zs.yaml
│ │ │ │ │ │ │ ├── test_vtt.yaml
│ │ │ │ │ │ │ ├── test_vttqa.yaml
│ │ │ │ │ │ │ ├── test_youcook.yaml
│ │ │ │ │ │ │ ├── test_youcookcap.yaml
│ │ │ │ │ │ │ ├── vtt.yaml
│ │ │ │ │ │ │ ├── vttqa.yaml
│ │ │ │ │ │ │ ├── youcook.yaml
│ │ │ │ │ │ │ └── youcookcap.yaml
│ │ │ │ │ │ └── vlm.yaml
│ │ │ │ │ ├── retri/
│ │ │ │ │ │ ├── videoclip/
│ │ │ │ │ │ │ ├── coin_videoclip.yaml
│ │ │ │ │ │ │ ├── crosstask_videoclip.yaml
│ │ │ │ │ │ │ ├── how2.yaml
│ │ │ │ │ │ │ ├── test_coin_videoclip.yaml
│ │ │ │ │ │ │ ├── test_coin_zs.yaml
│ │ │ │ │ │ │ ├── test_crosstask_videoclip.yaml
│ │ │ │ │ │ │ ├── test_crosstask_zs_videoclip.yaml
│ │ │ │ │ │ │ ├── test_didemo_zs.yaml
│ │ │ │ │ │ │ ├── test_vtt_videoclip.yaml
│ │ │ │ │ │ │ ├── test_vtt_zs.yaml
│ │ │ │ │ │ │ ├── test_vttqa_videoclip.yaml
│ │ │ │ │ │ │ ├── test_vttqa_zs.yaml
│ │ │ │ │ │ │ ├── test_youcook_videoclip.yaml
│ │ │ │ │ │ │ ├── test_youcook_zs.yaml
│ │ │ │ │ │ │ ├── vtt_videoclip.yaml
│ │ │ │ │ │ │ ├── vttqa_videoclip.yaml
│ │ │ │ │ │ │ └── youcook_videoclip.yaml
│ │ │ │ │ │ ├── videoclip.yaml
│ │ │ │ │ │ └── videoretri.yaml
│ │ │ │ │ └── task/
│ │ │ │ │ ├── coin.yaml
│ │ │ │ │ ├── coin_videoclip.yaml
│ │ │ │ │ ├── crosstask.yaml
│ │ │ │ │ ├── crosstask_videoclip.yaml
│ │ │ │ │ ├── default.yaml
│ │ │ │ │ ├── ft.yaml
│ │ │ │ │ ├── how2.yaml
│ │ │ │ │ ├── test.yaml
│ │ │ │ │ ├── test_coin.yaml
│ │ │ │ │ ├── test_coin_videoclip.yaml
│ │ │ │ │ ├── test_coin_zs.yaml
│ │ │ │ │ ├── test_crosstask.yaml
│ │ │ │ │ ├── test_crosstask_videoclip.yaml
│ │ │ │ │ ├── test_crosstask_zs.yaml
│ │ │ │ │ ├── test_crosstask_zs_videoclip.yaml
│ │ │ │ │ ├── test_didemo_zs.yaml
│ │ │ │ │ ├── test_vtt.yaml
│ │ │ │ │ ├── test_vtt_videoclip.yaml
│ │ │ │ │ ├── test_vtt_zs.yaml
│ │ │ │ │ ├── test_vttqa.yaml
│ │ │ │ │ ├── test_vttqa_videoclip.yaml
│ │ │ │ │ ├── test_vttqa_zs.yaml
│ │ │ │ │ ├── test_youcook.yaml
│ │ │ │ │ ├── test_youcook_videoclip.yaml
│ │ │ │ │ ├── test_youcook_zs.yaml
│ │ │ │ │ ├── test_youcookcap.yaml
│ │ │ │ │ ├── vtt.yaml
│ │ │ │ │ ├── vtt_videoclip.yaml
│ │ │ │ │ ├── vttqa.yaml
│ │ │ │ │ ├── vttqa_videoclip.yaml
│ │ │ │ │ ├── youcook.yaml
│ │ │ │ │ ├── youcook_videoclip.yaml
│ │ │ │ │ └── youcookcap.yaml
│ │ │ │ ├── scripts/
│ │ │ │ │ ├── text_token_extractor/
│ │ │ │ │ │ ├── configs/
│ │ │ │ │ │ │ └── bert-base-uncased.yaml
│ │ │ │ │ │ └── pretokenization.py
│ │ │ │ │ └── video_feature_extractor/
│ │ │ │ │ ├── extract.py
│ │ │ │ │ ├── how2/
│ │ │ │ │ │ └── s3d.sh
│ │ │ │ │ ├── model.py
│ │ │ │ │ ├── pathbuilder.py
│ │ │ │ │ ├── preprocessing.py
│ │ │ │ │ ├── random_sequence_shuffler.py
│ │ │ │ │ ├── shard_feature.py
│ │ │ │ │ └── videoreader.py
│ │ │ │ └── setup.py
│ │ │ ├── __init__.py
│ │ │ ├── adaptive_span/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adagrad_with_grad_clip.py
│ │ │ │ ├── adaptive_span_attention.py
│ │ │ │ ├── adaptive_span_loss.py
│ │ │ │ ├── adaptive_span_model.py
│ │ │ │ ├── adaptive_span_model_wrapper.py
│ │ │ │ └── truncated_bptt_lm_task.py
│ │ │ ├── attention_head_selection/
│ │ │ │ ├── README.md
│ │ │ │ └── src/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── data/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── speech_to_text_dataset_with_domain.py
│ │ │ │ ├── loss/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── attention_head_selection.py
│ │ │ │ ├── models/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── head_selection_s2t_transformer.py
│ │ │ │ │ └── head_selection_transformer.py
│ │ │ │ ├── modules/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── attn_head_selector.py
│ │ │ │ │ ├── head_selection_transformer_layer.py
│ │ │ │ │ ├── multihead_attention_selection.py
│ │ │ │ │ └── multihead_functional.py
│ │ │ │ └── speech_to_text_head_selection.py
│ │ │ ├── backtranslation/
│ │ │ │ ├── README.md
│ │ │ │ ├── deduplicate_lines.py
│ │ │ │ ├── extract_bt_data.py
│ │ │ │ ├── prepare-de-monolingual.sh
│ │ │ │ ├── prepare-wmt18en2de.sh
│ │ │ │ ├── sacrebleu.sh
│ │ │ │ └── tokenized_bleu.sh
│ │ │ ├── bart/
│ │ │ │ ├── README.glue.md
│ │ │ │ ├── README.md
│ │ │ │ ├── README.summarization.md
│ │ │ │ └── summarize.py
│ │ │ ├── byte_level_bpe/
│ │ │ │ ├── README.md
│ │ │ │ ├── get_bitext.py
│ │ │ │ ├── get_data.sh
│ │ │ │ └── gru_transformer.py
│ │ │ ├── camembert/
│ │ │ │ └── README.md
│ │ │ ├── constrained_decoding/
│ │ │ │ ├── README.md
│ │ │ │ ├── normalize.py
│ │ │ │ └── tok.py
│ │ │ ├── conv_seq2seq/
│ │ │ │ └── README.md
│ │ │ ├── criss/
│ │ │ │ ├── README.md
│ │ │ │ ├── download_and_preprocess_flores_test.sh
│ │ │ │ ├── download_and_preprocess_tatoeba.sh
│ │ │ │ ├── mining/
│ │ │ │ │ ├── mine.py
│ │ │ │ │ └── mine_example.sh
│ │ │ │ ├── save_encoder.py
│ │ │ │ ├── sentence_retrieval/
│ │ │ │ │ ├── encoder_analysis.py
│ │ │ │ │ └── sentence_retrieval_tatoeba.sh
│ │ │ │ └── unsupervised_mt/
│ │ │ │ └── eval.sh
│ │ │ ├── cross_lingual_language_model/
│ │ │ │ └── README.md
│ │ │ ├── discriminative_reranking_nmt/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── config/
│ │ │ │ │ └── deen.yaml
│ │ │ │ ├── criterions/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── discriminative_reranking_criterion.py
│ │ │ │ ├── drnmt_rerank.py
│ │ │ │ ├── models/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── discriminative_reranking_model.py
│ │ │ │ ├── scripts/
│ │ │ │ │ └── prep_data.py
│ │ │ │ └── tasks/
│ │ │ │ ├── __init__.py
│ │ │ │ └── discriminative_reranking_task.py
│ │ │ ├── fast_noisy_channel/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── noisy_channel_beam_search.py
│ │ │ │ ├── noisy_channel_sequence_generator.py
│ │ │ │ └── noisy_channel_translation.py
│ │ │ ├── flores101/
│ │ │ │ └── README.md
│ │ │ ├── fully_sharded_data_parallel/
│ │ │ │ └── README.md
│ │ │ ├── gottbert/
│ │ │ │ └── README.md
│ │ │ ├── hubert/
│ │ │ │ ├── README.md
│ │ │ │ ├── config/
│ │ │ │ │ ├── decode/
│ │ │ │ │ │ ├── ax_sweep/
│ │ │ │ │ │ │ ├── ngram.yaml
│ │ │ │ │ │ │ └── transformer.yaml
│ │ │ │ │ │ ├── infer_fsqlm.yaml
│ │ │ │ │ │ ├── infer_kenlm.yaml
│ │ │ │ │ │ ├── infer_viterbi.yaml
│ │ │ │ │ │ └── run/
│ │ │ │ │ │ ├── submitit_slurm.yaml
│ │ │ │ │ │ └── submitit_slurm_8gpu.yaml
│ │ │ │ │ ├── finetune/
│ │ │ │ │ │ ├── base_10h.yaml
│ │ │ │ │ │ ├── ckpt/
│ │ │ │ │ │ │ └── it1.yaml
│ │ │ │ │ │ ├── lm/
│ │ │ │ │ │ │ └── ls_4gram.yaml
│ │ │ │ │ │ └── run/
│ │ │ │ │ │ └── submitit_reg.yaml
│ │ │ │ │ └── pretrain/
│ │ │ │ │ ├── data/
│ │ │ │ │ │ ├── iter1.yaml
│ │ │ │ │ │ └── iter2.yaml
│ │ │ │ │ ├── hubert_base_librispeech.yaml
│ │ │ │ │ ├── hubert_large_librivox.yaml
│ │ │ │ │ ├── hubert_xlarge_librivox.yaml
│ │ │ │ │ └── run/
│ │ │ │ │ └── submitit_reg.yaml
│ │ │ │ ├── measure_teacher_quality.py
│ │ │ │ ├── simple_kmeans/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── dump_hubert_feature.py
│ │ │ │ │ ├── dump_hubert_feature_s2t.py
│ │ │ │ │ ├── dump_km_label.py
│ │ │ │ │ ├── dump_mfcc_feature.py
│ │ │ │ │ ├── dump_w2v2_feature.py
│ │ │ │ │ ├── feature_utils.py
│ │ │ │ │ └── learn_kmeans.py
│ │ │ │ └── update_ckpt.py
│ │ │ ├── joint_alignment_translation/
│ │ │ │ ├── README.md
│ │ │ │ └── prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh
│ │ │ ├── language_model/
│ │ │ │ ├── README.adaptive_inputs.md
│ │ │ │ ├── README.conv.md
│ │ │ │ ├── README.md
│ │ │ │ └── prepare-wikitext-103.sh
│ │ │ ├── laser/
│ │ │ │ ├── README.md
│ │ │ │ └── laser_src/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── laser_lstm.py
│ │ │ │ ├── laser_task.py
│ │ │ │ ├── laser_transformer.py
│ │ │ │ └── multitask_data_utils.py
│ │ │ ├── latent_depth/
│ │ │ │ ├── README.md
│ │ │ │ └── latent_depth_src/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── loss/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── latent_depth.py
│ │ │ │ ├── models/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── latent_multilingual_transformer.py
│ │ │ │ │ └── latent_transformer.py
│ │ │ │ ├── modules/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── latent_layers.py
│ │ │ │ └── multilingual_translation_latent_depth.py
│ │ │ ├── layerdrop/
│ │ │ │ └── README.md
│ │ │ ├── linformer/
│ │ │ │ ├── README.md
│ │ │ │ └── linformer_src/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── models/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── linformer_roberta.py
│ │ │ │ └── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── linformer_sentence_encoder.py
│ │ │ │ ├── linformer_sentence_encoder_layer.py
│ │ │ │ └── multihead_linear_attention.py
│ │ │ ├── m2m_100/
│ │ │ │ ├── README.md
│ │ │ │ ├── install_dependecies.sh
│ │ │ │ ├── process_data/
│ │ │ │ │ ├── clean_histogram.py
│ │ │ │ │ ├── dedup_data.py
│ │ │ │ │ └── remove_too_much_punc.py
│ │ │ │ ├── tok.sh
│ │ │ │ └── tokenizers/
│ │ │ │ ├── README.md
│ │ │ │ ├── seg_ja.sh
│ │ │ │ ├── seg_ko.sh
│ │ │ │ ├── thirdparty/
│ │ │ │ │ └── .gitignore
│ │ │ │ ├── tokenize_indic.py
│ │ │ │ ├── tokenize_thai.py
│ │ │ │ ├── tokenize_zh.py
│ │ │ │ └── tokenizer_ar.sh
│ │ │ ├── mbart/
│ │ │ │ └── README.md
│ │ │ ├── megatron_11b/
│ │ │ │ ├── README.md
│ │ │ │ └── detok.py
│ │ │ ├── moe_lm/
│ │ │ │ ├── README.md
│ │ │ │ ├── data_card.md
│ │ │ │ └── model_card.md
│ │ │ ├── multilingual/
│ │ │ │ ├── ML50_langs.txt
│ │ │ │ ├── README.md
│ │ │ │ ├── data_scripts/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── binarize.py
│ │ │ │ │ ├── check_iswlt_test_data.py
│ │ │ │ │ ├── check_self_overlaps.py
│ │ │ │ │ ├── check_valid_test_overlaps.py
│ │ │ │ │ ├── dedup_all.py
│ │ │ │ │ ├── download_ML50_v1.sh
│ │ │ │ │ ├── download_af_xh.sh
│ │ │ │ │ ├── download_flores_data.sh
│ │ │ │ │ ├── download_iitb.sh
│ │ │ │ │ ├── download_iwslt_and_extract.sh
│ │ │ │ │ ├── download_lotus.sh
│ │ │ │ │ ├── download_ted_and_extract.py
│ │ │ │ │ ├── download_wat19_my.sh
│ │ │ │ │ ├── download_wmt19_and_before.py
│ │ │ │ │ ├── download_wmt20.sh
│ │ │ │ │ ├── preprocess_ML50_v1.sh
│ │ │ │ │ ├── remove_valid_test_in_train.py
│ │ │ │ │ ├── requirement.txt
│ │ │ │ │ └── utils/
│ │ │ │ │ ├── dedup.py
│ │ │ │ │ ├── fasttext_multi_filter.py
│ │ │ │ │ └── strip_sgm.sh
│ │ │ │ ├── finetune_multilingual_model.sh
│ │ │ │ ├── multilingual_fairseq_gen.sh
│ │ │ │ └── train_multilingual_model.sh
│ │ │ ├── noisychannel/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── rerank.py
│ │ │ │ ├── rerank_generate.py
│ │ │ │ ├── rerank_options.py
│ │ │ │ ├── rerank_score_bw.py
│ │ │ │ ├── rerank_score_lm.py
│ │ │ │ ├── rerank_tune.py
│ │ │ │ └── rerank_utils.py
│ │ │ ├── nonautoregressive_translation/
│ │ │ │ ├── README.md
│ │ │ │ └── scripts.md
│ │ │ ├── normformer/
│ │ │ │ ├── README.md
│ │ │ │ └── train_lm.sh
│ │ │ ├── operators/
│ │ │ │ ├── alignment_train_cpu.cpp
│ │ │ │ ├── alignment_train_cuda.cpp
│ │ │ │ ├── alignment_train_cuda.h
│ │ │ │ ├── alignment_train_kernel.cu
│ │ │ │ └── utils.h
│ │ │ ├── paraphraser/
│ │ │ │ ├── README.md
│ │ │ │ └── paraphrase.py
│ │ │ ├── pay_less_attention_paper/
│ │ │ │ └── README.md
│ │ │ ├── pointer_generator/
│ │ │ │ ├── README.md
│ │ │ │ ├── README.xsum.md
│ │ │ │ ├── pointer_generator_src/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── transformer_pg.py
│ │ │ │ ├── postprocess.py
│ │ │ │ └── preprocess.py
│ │ │ ├── quant_noise/
│ │ │ │ ├── README.md
│ │ │ │ └── transformer_quantization_config.yaml
│ │ │ ├── roberta/
│ │ │ │ ├── README.custom_classification.md
│ │ │ │ ├── README.glue.md
│ │ │ │ ├── README.md
│ │ │ │ ├── README.pretraining.md
│ │ │ │ ├── README.race.md
│ │ │ │ ├── commonsense_qa/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── commonsense_qa_task.py
│ │ │ │ │ └── download_cqa_data.sh
│ │ │ │ ├── config/
│ │ │ │ │ ├── finetuning/
│ │ │ │ │ │ ├── cola.yaml
│ │ │ │ │ │ ├── mnli.yaml
│ │ │ │ │ │ ├── mrpc.yaml
│ │ │ │ │ │ ├── qnli.yaml
│ │ │ │ │ │ ├── qqp.yaml
│ │ │ │ │ │ ├── rte.yaml
│ │ │ │ │ │ ├── sst_2.yaml
│ │ │ │ │ │ └── sts_b.yaml
│ │ │ │ │ └── pretraining/
│ │ │ │ │ └── base.yaml
│ │ │ │ ├── multiprocessing_bpe_encoder.py
│ │ │ │ ├── preprocess_GLUE_tasks.sh
│ │ │ │ ├── preprocess_RACE.py
│ │ │ │ ├── preprocess_RACE.sh
│ │ │ │ └── wsc/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── wsc_criterion.py
│ │ │ │ ├── wsc_task.py
│ │ │ │ └── wsc_utils.py
│ │ │ ├── rxf/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ └── rxf_src/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── label_smoothed_cross_entropy_r3f.py
│ │ │ │ └── sentence_prediction_r3f.py
│ │ │ ├── scaling_nmt/
│ │ │ │ └── README.md
│ │ │ ├── shuffled_word_order/
│ │ │ │ ├── README.finetuning.md
│ │ │ │ └── README.md
│ │ │ ├── simultaneous_translation/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── docs/
│ │ │ │ │ ├── ende-mma.md
│ │ │ │ │ └── enja-waitk.md
│ │ │ │ ├── eval/
│ │ │ │ │ └── agents/
│ │ │ │ │ └── simul_t2t_enja.py
│ │ │ │ ├── models/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── convtransformer_simul_trans.py
│ │ │ │ │ └── transformer_monotonic_attention.py
│ │ │ │ ├── modules/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── fixed_pre_decision.py
│ │ │ │ │ ├── monotonic_multihead_attention.py
│ │ │ │ │ └── monotonic_transformer_layer.py
│ │ │ │ └── utils/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── functions.py
│ │ │ │ ├── monotonic_attention.py
│ │ │ │ └── p_choose_strategy.py
│ │ │ ├── speech_recognition/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── criterions/
│ │ │ │ │ ├── ASG_loss.py
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── cross_entropy_acc.py
│ │ │ │ ├── data/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── asr_dataset.py
│ │ │ │ │ ├── collaters.py
│ │ │ │ │ ├── data_utils.py
│ │ │ │ │ └── replabels.py
│ │ │ │ ├── datasets/
│ │ │ │ │ ├── asr_prep_json.py
│ │ │ │ │ └── prepare-librispeech.sh
│ │ │ │ ├── infer.py
│ │ │ │ ├── kaldi/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── add-self-loop-simple.cc
│ │ │ │ │ ├── config/
│ │ │ │ │ │ └── kaldi_initializer.yaml
│ │ │ │ │ ├── kaldi_decoder.py
│ │ │ │ │ └── kaldi_initializer.py
│ │ │ │ ├── models/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── vggtransformer.py
│ │ │ │ │ └── w2l_conv_glu_enc.py
│ │ │ │ ├── new/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── conf/
│ │ │ │ │ │ ├── hydra/
│ │ │ │ │ │ │ └── sweeper/
│ │ │ │ │ │ │ └── ax.yaml
│ │ │ │ │ │ └── infer.yaml
│ │ │ │ │ ├── decoders/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── base_decoder.py
│ │ │ │ │ │ ├── decoder.py
│ │ │ │ │ │ ├── decoder_config.py
│ │ │ │ │ │ ├── flashlight_decoder.py
│ │ │ │ │ │ └── viterbi_decoder.py
│ │ │ │ │ └── infer.py
│ │ │ │ ├── tasks/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── speech_recognition.py
│ │ │ │ ├── utils/
│ │ │ │ │ └── wer_utils.py
│ │ │ │ └── w2l_decoder.py
│ │ │ ├── speech_synthesis/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── data_utils.py
│ │ │ │ ├── docs/
│ │ │ │ │ ├── common_voice_example.md
│ │ │ │ │ ├── ljspeech_example.md
│ │ │ │ │ └── vctk_example.md
│ │ │ │ ├── evaluation/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── eval_asr.py
│ │ │ │ │ ├── eval_f0.py
│ │ │ │ │ ├── eval_sp.py
│ │ │ │ │ └── get_eval_manifest.py
│ │ │ │ ├── generate_waveform.py
│ │ │ │ ├── preprocessing/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── denoise_and_vad_audio.py
│ │ │ │ │ ├── denoiser/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── demucs.py
│ │ │ │ │ │ ├── pretrained.py
│ │ │ │ │ │ ├── resample.py
│ │ │ │ │ │ └── utils.py
│ │ │ │ │ ├── get_common_voice_audio_manifest.py
│ │ │ │ │ ├── get_feature_manifest.py
│ │ │ │ │ ├── get_ljspeech_audio_manifest.py
│ │ │ │ │ ├── get_speaker_embedding.py
│ │ │ │ │ ├── get_vctk_audio_manifest.py
│ │ │ │ │ ├── speaker_embedder/
│ │ │ │ │ │ └── __init__.py
│ │ │ │ │ └── vad/
│ │ │ │ │ └── __init__.py
│ │ │ │ └── utils.py
│ │ │ ├── speech_text_joint_to_text/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── configs/
│ │ │ │ │ └── mustc_noise.list
│ │ │ │ ├── criterions/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── text_guide_cross_entropy_acc.py
│ │ │ │ ├── docs/
│ │ │ │ │ ├── ende-mustc.md
│ │ │ │ │ └── iwslt2021.md
│ │ │ │ ├── models/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── s2t_dualinputtransformer.py
│ │ │ │ │ └── s2t_dualinputxmtransformer.py
│ │ │ │ ├── scripts/
│ │ │ │ │ └── g2p_encode.py
│ │ │ │ └── tasks/
│ │ │ │ ├── __init__.py
│ │ │ │ └── speech_text_joint.py
│ │ │ ├── speech_to_speech/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── benchmarking/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── configs/
│ │ │ │ │ │ ├── 2StageS2ST.yaml
│ │ │ │ │ │ ├── 3StageS2ST.yaml
│ │ │ │ │ │ ├── DirectS2U.yaml
│ │ │ │ │ │ └── S2T.yaml
│ │ │ │ │ ├── core.py
│ │ │ │ │ ├── data_utils.py
│ │ │ │ │ └── get_metrics.py
│ │ │ │ ├── generate_waveform_from_code.py
│ │ │ │ └── preprocessing/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── data_utils.py
│ │ │ │ ├── prep_s2spect_data.py
│ │ │ │ └── prep_s2ut_data.py
│ │ │ ├── speech_to_text/
│ │ │ │ ├── README.md
│ │ │ │ ├── data_utils.py
│ │ │ │ ├── docs/
│ │ │ │ │ ├── covost_example.md
│ │ │ │ │ ├── librispeech_example.md
│ │ │ │ │ ├── mtedx_example.md
│ │ │ │ │ ├── mustc_example.md
│ │ │ │ │ └── simulst_mustc_example.md
│ │ │ │ ├── prep_covost_data.py
│ │ │ │ ├── prep_librispeech_data.py
│ │ │ │ ├── prep_mtedx_data.py
│ │ │ │ ├── prep_mustc_data.py
│ │ │ │ ├── seg_mustc_data.py
│ │ │ │ └── simultaneous_translation/
│ │ │ │ └── agents/
│ │ │ │ └── fairseq_simul_st_agent.py
│ │ │ ├── stories/
│ │ │ │ └── README.md
│ │ │ ├── textless_nlp/
│ │ │ │ ├── gslm/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── metrics/
│ │ │ │ │ │ ├── README.md
│ │ │ │ │ │ ├── abx_metrics/
│ │ │ │ │ │ │ ├── README.md
│ │ │ │ │ │ │ └── dump_abx_feats.py
│ │ │ │ │ │ └── asr_metrics/
│ │ │ │ │ │ ├── README.md
│ │ │ │ │ │ ├── continuation_eval.py
│ │ │ │ │ │ ├── misc/
│ │ │ │ │ │ │ ├── bleu_utils.py
│ │ │ │ │ │ │ ├── cut_as.py
│ │ │ │ │ │ │ └── dict.ltr.txt
│ │ │ │ │ │ ├── ppx.py
│ │ │ │ │ │ └── self_auto_bleu.py
│ │ │ │ │ ├── speech2unit/
│ │ │ │ │ │ ├── README.md
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── clustering/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── cluster_kmeans.py
│ │ │ │ │ │ │ ├── dump_feats.py
│ │ │ │ │ │ │ ├── quantize_with_kmeans.py
│ │ │ │ │ │ │ └── utils.py
│ │ │ │ │ │ └── pretrained/
│ │ │ │ │ │ ├── cpc_feature_reader.py
│ │ │ │ │ │ ├── hubert_feature_reader.py
│ │ │ │ │ │ ├── logmel_feature_reader.py
│ │ │ │ │ │ ├── utils.py
│ │ │ │ │ │ └── w2v2_feature_reader.py
│ │ │ │ │ ├── tools/
│ │ │ │ │ │ ├── README.md
│ │ │ │ │ │ └── resynthesize_speech.py
│ │ │ │ │ ├── ulm/
│ │ │ │ │ │ ├── README.md
│ │ │ │ │ │ └── sample.py
│ │ │ │ │ └── unit2speech/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── convert_to_16k.py
│ │ │ │ │ ├── glow.py
│ │ │ │ │ ├── multiproc.py
│ │ │ │ │ ├── synthesize_audio_from_units.py
│ │ │ │ │ ├── tacotron2/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── audio_processing.py
│ │ │ │ │ │ ├── cleaners.py
│ │ │ │ │ │ ├── cmudict.py
│ │ │ │ │ │ ├── layers.py
│ │ │ │ │ │ ├── model.py
│ │ │ │ │ │ ├── numbers.py
│ │ │ │ │ │ ├── stft.py
│ │ │ │ │ │ ├── symbols.py
│ │ │ │ │ │ ├── text.py
│ │ │ │ │ │ ├── utils.py
│ │ │ │ │ │ └── waveglow_denoiser.py
│ │ │ │ │ ├── tts_data.py
│ │ │ │ │ └── utils.py
│ │ │ │ └── speech-resynth/
│ │ │ │ └── README.md
│ │ │ ├── translation/
│ │ │ │ ├── README.md
│ │ │ │ ├── prepare-iwslt14.sh
│ │ │ │ ├── prepare-iwslt17-multilingual.sh
│ │ │ │ ├── prepare-wmt14en2de.sh
│ │ │ │ └── prepare-wmt14en2fr.sh
│ │ │ ├── translation_moe/
│ │ │ │ ├── README.md
│ │ │ │ ├── score.py
│ │ │ │ └── translation_moe_src/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── logsumexp_moe.py
│ │ │ │ ├── mean_pool_gating_network.py
│ │ │ │ └── translation_moe.py
│ │ │ ├── truncated_bptt/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── transformer_xl_model.py
│ │ │ │ └── truncated_bptt_lm_task.py
│ │ │ ├── unsupervised_quality_estimation/
│ │ │ │ ├── README.md
│ │ │ │ ├── aggregate_scores.py
│ │ │ │ ├── meteor.py
│ │ │ │ └── repeat_lines.py
│ │ │ ├── wav2vec/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── config/
│ │ │ │ │ ├── finetuning/
│ │ │ │ │ │ ├── base_100h.yaml
│ │ │ │ │ │ ├── base_10h.yaml
│ │ │ │ │ │ ├── base_10m.yaml
│ │ │ │ │ │ ├── base_1h.yaml
│ │ │ │ │ │ ├── base_960h.yaml
│ │ │ │ │ │ ├── vox_100h.yaml
│ │ │ │ │ │ ├── vox_10h.yaml
│ │ │ │ │ │ ├── vox_10m.yaml
│ │ │ │ │ │ ├── vox_1h.yaml
│ │ │ │ │ │ └── vox_960h.yaml
│ │ │ │ │ └── pretraining/
│ │ │ │ │ ├── wav2vec2_base_librispeech.yaml
│ │ │ │ │ ├── wav2vec2_large_librivox.yaml
│ │ │ │ │ ├── wav2vec2_large_librivox_tpu-pod.yaml
│ │ │ │ │ └── wav2vec2_large_librivox_tpu.yaml
│ │ │ │ ├── libri_labels.py
│ │ │ │ ├── scripts/
│ │ │ │ │ └── binarize_manifest.sh
│ │ │ │ ├── unsupervised/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── config/
│ │ │ │ │ │ ├── finetuning/
│ │ │ │ │ │ │ └── w2v_finetune.yaml
│ │ │ │ │ │ ├── gan/
│ │ │ │ │ │ │ └── w2vu.yaml
│ │ │ │ │ │ ├── generate/
│ │ │ │ │ │ │ └── viterbi.yaml
│ │ │ │ │ │ ├── timit_matched/
│ │ │ │ │ │ │ ├── test.uid
│ │ │ │ │ │ │ ├── train.uid
│ │ │ │ │ │ │ ├── train_text.uid
│ │ │ │ │ │ │ └── valid.uid
│ │ │ │ │ │ └── timit_unmatched/
│ │ │ │ │ │ ├── test.uid
│ │ │ │ │ │ ├── train.uid
│ │ │ │ │ │ ├── train_text.uid
│ │ │ │ │ │ └── valid.uid
│ │ │ │ │ ├── data/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── extracted_features_dataset.py
│ │ │ │ │ │ └── random_input_dataset.py
│ │ │ │ │ ├── kaldi_self_train/
│ │ │ │ │ │ ├── README.md
│ │ │ │ │ │ └── st/
│ │ │ │ │ │ ├── cmd.sh
│ │ │ │ │ │ ├── decode_phone.sh
│ │ │ │ │ │ ├── decode_word_step1.sh
│ │ │ │ │ │ ├── decode_word_step2.sh
│ │ │ │ │ │ ├── local/
│ │ │ │ │ │ │ ├── copy_aligned_text.py
│ │ │ │ │ │ │ ├── decode.sh
│ │ │ │ │ │ │ ├── prepare_data_from_w2v.py
│ │ │ │ │ │ │ ├── prepare_lang.sh
│ │ │ │ │ │ │ ├── prepare_lang_word.sh
│ │ │ │ │ │ │ ├── prepare_lm.sh
│ │ │ │ │ │ │ ├── score.sh
│ │ │ │ │ │ │ ├── show_wer.sh
│ │ │ │ │ │ │ ├── train_subset_lgbeam.sh
│ │ │ │ │ │ │ ├── unsup_select.py
│ │ │ │ │ │ │ ├── unsup_select_decode.sh
│ │ │ │ │ │ │ └── unsup_select_decode_word.sh
│ │ │ │ │ │ ├── path.sh
│ │ │ │ │ │ ├── steps
│ │ │ │ │ │ ├── steps_gan/
│ │ │ │ │ │ │ ├── train_deltas.sh
│ │ │ │ │ │ │ ├── train_lda_mllt.sh
│ │ │ │ │ │ │ └── train_sat.sh
│ │ │ │ │ │ ├── train.sh
│ │ │ │ │ │ └── utils
│ │ │ │ │ ├── models/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── wav2vec_u.py
│ │ │ │ │ ├── scripts/
│ │ │ │ │ │ ├── apply_pca.py
│ │ │ │ │ │ ├── copy_labels.py
│ │ │ │ │ │ ├── filter_lexicon.py
│ │ │ │ │ │ ├── filter_tsv.py
│ │ │ │ │ │ ├── g2p_wrd_to_phn.py
│ │ │ │ │ │ ├── ltr_to_wrd.py
│ │ │ │ │ │ ├── mean_pool.py
│ │ │ │ │ │ ├── merge_clusters.py
│ │ │ │ │ │ ├── normalize_and_filter_text.py
│ │ │ │ │ │ ├── normalize_text.py
│ │ │ │ │ │ ├── pca.py
│ │ │ │ │ │ ├── phonemize_with_sil.py
│ │ │ │ │ │ ├── prepare_audio.sh
│ │ │ │ │ │ ├── prepare_text.sh
│ │ │ │ │ │ ├── prepare_timit.sh
│ │ │ │ │ │ ├── remove_silence.py
│ │ │ │ │ │ ├── vads.py
│ │ │ │ │ │ ├── wav2vec_apply_cluster_faiss.py
│ │ │ │ │ │ ├── wav2vec_cluster_faiss.py
│ │ │ │ │ │ ├── wav2vec_extract_features.py
│ │ │ │ │ │ ├── wer.py
│ │ │ │ │ │ └── wrd_to_ltr.py
│ │ │ │ │ ├── tasks/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── unpaired_audio_text.py
│ │ │ │ │ └── w2vu_generate.py
│ │ │ │ ├── vq-wav2vec_featurize.py
│ │ │ │ ├── wav2vec_featurize.py
│ │ │ │ ├── wav2vec_manifest.py
│ │ │ │ └── xlsr/
│ │ │ │ ├── README.md
│ │ │ │ └── config/
│ │ │ │ └── finetune.yaml
│ │ │ ├── wmt19/
│ │ │ │ └── README.md
│ │ │ ├── wmt20/
│ │ │ │ └── README.md
│ │ │ ├── wmt21/
│ │ │ │ ├── README.md
│ │ │ │ ├── eval.sh
│ │ │ │ └── scripts/
│ │ │ │ ├── normalize-punctuation.perl
│ │ │ │ └── replace-unicode-punctuation.perl
│ │ │ ├── xglm/
│ │ │ │ ├── README.md
│ │ │ │ └── model_card.md
│ │ │ └── xlmr/
│ │ │ └── README.md
│ │ ├── fairseq/
│ │ │ ├── __init__.py
│ │ │ ├── benchmark/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dummy_dataset.py
│ │ │ │ ├── dummy_lm.py
│ │ │ │ ├── dummy_masked_lm.py
│ │ │ │ ├── dummy_model.py
│ │ │ │ └── dummy_mt.py
│ │ │ ├── binarizer.py
│ │ │ ├── checkpoint_utils.py
│ │ │ ├── clib/
│ │ │ │ ├── cuda/
│ │ │ │ │ ├── ngram_repeat_block_cuda.cpp
│ │ │ │ │ └── ngram_repeat_block_cuda_kernel.cu
│ │ │ │ ├── libbase/
│ │ │ │ │ └── balanced_assignment.cpp
│ │ │ │ ├── libbleu/
│ │ │ │ │ ├── libbleu.cpp
│ │ │ │ │ └── module.cpp
│ │ │ │ ├── libnat/
│ │ │ │ │ └── edit_dist.cpp
│ │ │ │ └── libnat_cuda/
│ │ │ │ ├── binding.cpp
│ │ │ │ ├── edit_dist.cu
│ │ │ │ └── edit_dist.h
│ │ │ ├── config/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── config.yaml
│ │ │ │ └── model/
│ │ │ │ ├── transformer_lm/
│ │ │ │ │ ├── transformer_lm_baevski_gbw.yaml
│ │ │ │ │ ├── transformer_lm_baevski_wiki103.yaml
│ │ │ │ │ ├── transformer_lm_big.yaml
│ │ │ │ │ ├── transformer_lm_gbw.yaml
│ │ │ │ │ ├── transformer_lm_gpt.yaml
│ │ │ │ │ ├── transformer_lm_gpt2_big.yaml
│ │ │ │ │ ├── transformer_lm_gpt2_medium.yaml
│ │ │ │ │ ├── transformer_lm_gpt2_small.yaml
│ │ │ │ │ └── transformer_lm_wiki103.yaml
│ │ │ │ ├── wav2vec/
│ │ │ │ │ └── vq_wav2vec_gumbel.yaml
│ │ │ │ └── wav2vec2/
│ │ │ │ ├── wav2vec2_base.yaml
│ │ │ │ └── wav2vec2_large.yaml
│ │ │ ├── criterions/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adaptive_loss.py
│ │ │ │ ├── composite_loss.py
│ │ │ │ ├── cross_entropy.py
│ │ │ │ ├── ctc.py
│ │ │ │ ├── fairseq_criterion.py
│ │ │ │ ├── fastspeech2_loss.py
│ │ │ │ ├── hubert_criterion.py
│ │ │ │ ├── label_smoothed_cross_entropy.py
│ │ │ │ ├── label_smoothed_cross_entropy_latency_augmented.py
│ │ │ │ ├── label_smoothed_cross_entropy_with_alignment.py
│ │ │ │ ├── legacy_masked_lm.py
│ │ │ │ ├── masked_lm.py
│ │ │ │ ├── model_criterion.py
│ │ │ │ ├── nat_loss.py
│ │ │ │ ├── sentence_prediction.py
│ │ │ │ ├── sentence_ranking.py
│ │ │ │ ├── speech_to_speech_criterion.py
│ │ │ │ ├── tacotron2_loss.py
│ │ │ │ └── wav2vec_criterion.py
│ │ │ ├── data/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── add_target_dataset.py
│ │ │ │ ├── append_token_dataset.py
│ │ │ │ ├── audio/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── audio_utils.py
│ │ │ │ │ ├── data_cfg.py
│ │ │ │ │ ├── feature_transforms/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── delta_deltas.py
│ │ │ │ │ │ ├── global_cmvn.py
│ │ │ │ │ │ ├── specaugment.py
│ │ │ │ │ │ └── utterance_cmvn.py
│ │ │ │ │ ├── frm_text_to_speech_dataset.py
│ │ │ │ │ ├── hubert_dataset.py
│ │ │ │ │ ├── multi_modality_dataset.py
│ │ │ │ │ ├── raw_audio_dataset.py
│ │ │ │ │ ├── speech_to_speech_dataset.py
│ │ │ │ │ ├── speech_to_text_dataset.py
│ │ │ │ │ ├── speech_to_text_joint_dataset.py
│ │ │ │ │ └── text_to_speech_dataset.py
│ │ │ │ ├── backtranslation_dataset.py
│ │ │ │ ├── base_wrapper_dataset.py
│ │ │ │ ├── bucket_pad_length_dataset.py
│ │ │ │ ├── colorize_dataset.py
│ │ │ │ ├── concat_dataset.py
│ │ │ │ ├── concat_sentences_dataset.py
│ │ │ │ ├── data_utils.py
│ │ │ │ ├── data_utils_fast.pyx
│ │ │ │ ├── denoising_dataset.py
│ │ │ │ ├── dictionary.py
│ │ │ │ ├── encoders/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── byte_bpe.py
│ │ │ │ │ ├── byte_utils.py
│ │ │ │ │ ├── bytes.py
│ │ │ │ │ ├── characters.py
│ │ │ │ │ ├── fastbpe.py
│ │ │ │ │ ├── gpt2_bpe.py
│ │ │ │ │ ├── gpt2_bpe_utils.py
│ │ │ │ │ ├── hf_bert_bpe.py
│ │ │ │ │ ├── hf_byte_bpe.py
│ │ │ │ │ ├── moses_tokenizer.py
│ │ │ │ │ ├── nltk_tokenizer.py
│ │ │ │ │ ├── sentencepiece_bpe.py
│ │ │ │ │ ├── space_tokenizer.py
│ │ │ │ │ ├── subword_nmt_bpe.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── fairseq_dataset.py
│ │ │ │ ├── fasta_dataset.py
│ │ │ │ ├── huffman/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── huffman_coder.py
│ │ │ │ │ └── huffman_mmap_indexed_dataset.py
│ │ │ │ ├── id_dataset.py
│ │ │ │ ├── indexed_dataset.py
│ │ │ │ ├── iterators.py
│ │ │ │ ├── language_pair_dataset.py
│ │ │ │ ├── legacy/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── block_pair_dataset.py
│ │ │ │ │ ├── masked_lm_dataset.py
│ │ │ │ │ └── masked_lm_dictionary.py
│ │ │ │ ├── list_dataset.py
│ │ │ │ ├── lm_context_window_dataset.py
│ │ │ │ ├── lru_cache_dataset.py
│ │ │ │ ├── mask_tokens_dataset.py
│ │ │ │ ├── monolingual_dataset.py
│ │ │ │ ├── multi_corpus_dataset.py
│ │ │ │ ├── multi_corpus_sampled_dataset.py
│ │ │ │ ├── multilingual/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── multilingual_data_manager.py
│ │ │ │ │ ├── multilingual_utils.py
│ │ │ │ │ ├── sampled_multi_dataset.py
│ │ │ │ │ ├── sampled_multi_epoch_dataset.py
│ │ │ │ │ └── sampling_method.py
│ │ │ │ ├── nested_dictionary_dataset.py
│ │ │ │ ├── noising.py
│ │ │ │ ├── num_samples_dataset.py
│ │ │ │ ├── numel_dataset.py
│ │ │ │ ├── offset_tokens_dataset.py
│ │ │ │ ├── pad_dataset.py
│ │ │ │ ├── plasma_utils.py
│ │ │ │ ├── prepend_dataset.py
│ │ │ │ ├── prepend_token_dataset.py
│ │ │ │ ├── raw_label_dataset.py
│ │ │ │ ├── replace_dataset.py
│ │ │ │ ├── resampling_dataset.py
│ │ │ │ ├── roll_dataset.py
│ │ │ │ ├── round_robin_zip_datasets.py
│ │ │ │ ├── shorten_dataset.py
│ │ │ │ ├── sort_dataset.py
│ │ │ │ ├── strip_token_dataset.py
│ │ │ │ ├── subsample_dataset.py
│ │ │ │ ├── text_compressor.py
│ │ │ │ ├── token_block_dataset.py
│ │ │ │ ├── token_block_utils_fast.pyx
│ │ │ │ ├── transform_eos_concat_langpair_dataset.py
│ │ │ │ ├── transform_eos_dataset.py
│ │ │ │ └── transform_eos_lang_pair_dataset.py
│ │ │ ├── dataclass/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── configs.py
│ │ │ │ ├── constants.py
│ │ │ │ ├── initialize.py
│ │ │ │ └── utils.py
│ │ │ ├── distributed/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── distributed_timeout_wrapper.py
│ │ │ │ ├── fully_sharded_data_parallel.py
│ │ │ │ ├── legacy_distributed_data_parallel.py
│ │ │ │ ├── module_proxy_wrapper.py
│ │ │ │ ├── tpu_distributed_data_parallel.py
│ │ │ │ └── utils.py
│ │ │ ├── ds_trainer.py
│ │ │ ├── file_chunker_utils.py
│ │ │ ├── file_io.py
│ │ │ ├── file_utils.py
│ │ │ ├── hub_utils.py
│ │ │ ├── incremental_decoding_utils.py
│ │ │ ├── iterative_refinement_generator.py
│ │ │ ├── logging/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── meters.py
│ │ │ │ ├── metrics.py
│ │ │ │ └── progress_bar.py
│ │ │ ├── model_parallel/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── criterions/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── vocab_parallel_cross_entropy.py
│ │ │ │ ├── megatron_trainer.py
│ │ │ │ ├── models/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── pipeline_parallel_transformer/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── layers.py
│ │ │ │ │ │ └── model.py
│ │ │ │ │ ├── roberta/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── model.py
│ │ │ │ │ ├── transformer.py
│ │ │ │ │ └── transformer_lm.py
│ │ │ │ └── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── multihead_attention.py
│ │ │ │ └── transformer_layer.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bart/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── hub_interface.py
│ │ │ │ │ └── model.py
│ │ │ │ ├── composite_encoder.py
│ │ │ │ ├── distributed_fairseq_model.py
│ │ │ │ ├── ema/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── ema.py
│ │ │ │ ├── fairseq_decoder.py
│ │ │ │ ├── fairseq_encoder.py
│ │ │ │ ├── fairseq_incremental_decoder.py
│ │ │ │ ├── fairseq_model.py
│ │ │ │ ├── fconv.py
│ │ │ │ ├── fconv_lm.py
│ │ │ │ ├── fconv_self_att.py
│ │ │ │ ├── hubert/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── hubert.py
│ │ │ │ │ └── hubert_asr.py
│ │ │ │ ├── huggingface/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── hf_gpt2.py
│ │ │ │ ├── lightconv.py
│ │ │ │ ├── lightconv_lm.py
│ │ │ │ ├── lstm.py
│ │ │ │ ├── lstm_lm.py
│ │ │ │ ├── masked_lm.py
│ │ │ │ ├── model_utils.py
│ │ │ │ ├── multilingual_transformer.py
│ │ │ │ ├── nat/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cmlm_transformer.py
│ │ │ │ │ ├── fairseq_nat_model.py
│ │ │ │ │ ├── insertion_transformer.py
│ │ │ │ │ ├── iterative_nonautoregressive_transformer.py
│ │ │ │ │ ├── levenshtein_transformer.py
│ │ │ │ │ ├── levenshtein_utils.py
│ │ │ │ │ ├── nat_crf_transformer.py
│ │ │ │ │ ├── nonautoregressive_ensembles.py
│ │ │ │ │ └── nonautoregressive_transformer.py
│ │ │ │ ├── roberta/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── alignment_utils.py
│ │ │ │ │ ├── enc_dec.py
│ │ │ │ │ ├── hub_interface.py
│ │ │ │ │ ├── model.py
│ │ │ │ │ ├── model_camembert.py
│ │ │ │ │ ├── model_gottbert.py
│ │ │ │ │ └── model_xlmr.py
│ │ │ │ ├── speech_to_speech/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── modules.py
│ │ │ │ │ └── s2s_transformer.py
│ │ │ │ ├── speech_to_text/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── berard.py
│ │ │ │ │ ├── convtransformer.py
│ │ │ │ │ ├── hub_interface.py
│ │ │ │ │ ├── modules/
│ │ │ │ │ │ ├── augmented_memory_attention.py
│ │ │ │ │ │ └── emformer.py
│ │ │ │ │ ├── s2t_conformer.py
│ │ │ │ │ ├── s2t_transformer.py
│ │ │ │ │ ├── utils.py
│ │ │ │ │ └── xm_transformer.py
│ │ │ │ ├── text_to_speech/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── codehifigan.py
│ │ │ │ │ ├── fastspeech2.py
│ │ │ │ │ ├── hifigan.py
│ │ │ │ │ ├── hub_interface.py
│ │ │ │ │ ├── tacotron2.py
│ │ │ │ │ ├── tts_transformer.py
│ │ │ │ │ └── vocoder.py
│ │ │ │ ├── transformer/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── transformer_base.py
│ │ │ │ │ ├── transformer_config.py
│ │ │ │ │ ├── transformer_decoder.py
│ │ │ │ │ ├── transformer_encoder.py
│ │ │ │ │ └── transformer_legacy.py
│ │ │ │ ├── transformer_align.py
│ │ │ │ ├── transformer_from_pretrained_xlm.py
│ │ │ │ ├── transformer_lm.py
│ │ │ │ └── wav2vec/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── utils.py
│ │ │ │ ├── wav2vec.py
│ │ │ │ ├── wav2vec2.py
│ │ │ │ └── wav2vec2_asr.py
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adaptive_input.py
│ │ │ │ ├── adaptive_softmax.py
│ │ │ │ ├── base_layer.py
│ │ │ │ ├── beamable_mm.py
│ │ │ │ ├── character_token_embedder.py
│ │ │ │ ├── checkpoint_activations.py
│ │ │ │ ├── conformer_layer.py
│ │ │ │ ├── conv_tbc.py
│ │ │ │ ├── cross_entropy.py
│ │ │ │ ├── cuda_utils.cu
│ │ │ │ ├── downsampled_multihead_attention.py
│ │ │ │ ├── dynamic_convolution.py
│ │ │ │ ├── dynamic_crf_layer.py
│ │ │ │ ├── dynamicconv_layer/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cuda_function_gen.py
│ │ │ │ │ ├── dynamicconv_cuda.cpp
│ │ │ │ │ ├── dynamicconv_cuda.cuh
│ │ │ │ │ ├── dynamicconv_cuda_kernel.cu
│ │ │ │ │ ├── dynamicconv_layer.py
│ │ │ │ │ ├── dynamiconv_cpu.cpp
│ │ │ │ │ └── setup.py
│ │ │ │ ├── espnet_multihead_attention.py
│ │ │ │ ├── fairseq_dropout.py
│ │ │ │ ├── fp32_batch_norm.py
│ │ │ │ ├── fp32_group_norm.py
│ │ │ │ ├── fp32_instance_norm.py
│ │ │ │ ├── gelu.py
│ │ │ │ ├── grad_multiply.py
│ │ │ │ ├── gumbel_vector_quantizer.py
│ │ │ │ ├── kmeans_attention.py
│ │ │ │ ├── kmeans_vector_quantizer.py
│ │ │ │ ├── layer_drop.py
│ │ │ │ ├── layer_norm.py
│ │ │ │ ├── learned_positional_embedding.py
│ │ │ │ ├── lightconv_layer/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cuda_function_gen.py
│ │ │ │ │ ├── lightconv_cuda.cpp
│ │ │ │ │ ├── lightconv_cuda.cuh
│ │ │ │ │ ├── lightconv_cuda_kernel.cu
│ │ │ │ │ ├── lightconv_layer.py
│ │ │ │ │ └── setup.py
│ │ │ │ ├── lightweight_convolution.py
│ │ │ │ ├── linearized_convolution.py
│ │ │ │ ├── location_attention.py
│ │ │ │ ├── lstm_cell_with_zoneout.py
│ │ │ │ ├── multihead_attention.py
│ │ │ │ ├── positional_embedding.py
│ │ │ │ ├── positional_encoding.py
│ │ │ │ ├── quant_noise.py
│ │ │ │ ├── quantization/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── pq/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── em.py
│ │ │ │ │ │ ├── modules/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── qconv.py
│ │ │ │ │ │ │ ├── qemb.py
│ │ │ │ │ │ │ └── qlinear.py
│ │ │ │ │ │ ├── pq.py
│ │ │ │ │ │ └── utils.py
│ │ │ │ │ ├── quantization_options.py
│ │ │ │ │ └── scalar/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── modules/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── qact.py
│ │ │ │ │ │ ├── qconv.py
│ │ │ │ │ │ ├── qemb.py
│ │ │ │ │ │ └── qlinear.py
│ │ │ │ │ ├── ops.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── rotary_positional_embedding.py
│ │ │ │ ├── same_pad.py
│ │ │ │ ├── scalar_bias.py
│ │ │ │ ├── sinusoidal_positional_embedding.py
│ │ │ │ ├── sparse_multihead_attention.py
│ │ │ │ ├── sparse_transformer_sentence_encoder.py
│ │ │ │ ├── sparse_transformer_sentence_encoder_layer.py
│ │ │ │ ├── transformer_layer.py
│ │ │ │ ├── transformer_sentence_encoder.py
│ │ │ │ ├── transformer_sentence_encoder_layer.py
│ │ │ │ ├── transpose_last.py
│ │ │ │ ├── unfold.py
│ │ │ │ └── vggblock.py
│ │ │ ├── nan_detector.py
│ │ │ ├── ngram_repeat_block.py
│ │ │ ├── optim/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adadelta.py
│ │ │ │ ├── adafactor.py
│ │ │ │ ├── adagrad.py
│ │ │ │ ├── adam.py
│ │ │ │ ├── adamax.py
│ │ │ │ ├── amp_optimizer.py
│ │ │ │ ├── bmuf.py
│ │ │ │ ├── composite.py
│ │ │ │ ├── cpu_adam.py
│ │ │ │ ├── dynamic_loss_scaler.py
│ │ │ │ ├── fairseq_optimizer.py
│ │ │ │ ├── fp16_optimizer.py
│ │ │ │ ├── fused_adam.py
│ │ │ │ ├── fused_lamb.py
│ │ │ │ ├── lr_scheduler/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cosine_lr_scheduler.py
│ │ │ │ │ ├── fairseq_lr_scheduler.py
│ │ │ │ │ ├── fixed_schedule.py
│ │ │ │ │ ├── inverse_square_root_schedule.py
│ │ │ │ │ ├── manual_lr_scheduler.py
│ │ │ │ │ ├── pass_through.py
│ │ │ │ │ ├── polynomial_decay_schedule.py
│ │ │ │ │ ├── reduce_lr_on_plateau.py
│ │ │ │ │ ├── step_lr_scheduler.py
│ │ │ │ │ ├── tri_stage_lr_scheduler.py
│ │ │ │ │ └── triangular_lr_scheduler.py
│ │ │ │ ├── nag.py
│ │ │ │ ├── sgd.py
│ │ │ │ └── shard.py
│ │ │ ├── options.py
│ │ │ ├── pdb.py
│ │ │ ├── quantization_utils.py
│ │ │ ├── registry.py
│ │ │ ├── scoring/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bleu.py
│ │ │ │ ├── chrf.py
│ │ │ │ ├── meteor.py
│ │ │ │ ├── tokenizer.py
│ │ │ │ └── wer.py
│ │ │ ├── search.py
│ │ │ ├── sequence_generator.py
│ │ │ ├── sequence_scorer.py
│ │ │ ├── speech_generator.py
│ │ │ ├── tasks/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── audio_finetuning.py
│ │ │ │ ├── audio_pretraining.py
│ │ │ │ ├── cross_lingual_lm.py
│ │ │ │ ├── denoising.py
│ │ │ │ ├── fairseq_task.py
│ │ │ │ ├── frm_text_to_speech.py
│ │ │ │ ├── hubert_pretraining.py
│ │ │ │ ├── language_modeling.py
│ │ │ │ ├── legacy_masked_lm.py
│ │ │ │ ├── masked_lm.py
│ │ │ │ ├── multilingual_denoising.py
│ │ │ │ ├── multilingual_language_modeling.py
│ │ │ │ ├── multilingual_masked_lm.py
│ │ │ │ ├── multilingual_translation.py
│ │ │ │ ├── online_backtranslation.py
│ │ │ │ ├── semisupervised_translation.py
│ │ │ │ ├── sentence_prediction.py
│ │ │ │ ├── sentence_ranking.py
│ │ │ │ ├── simultaneous_translation.py
│ │ │ │ ├── speech_to_speech.py
│ │ │ │ ├── speech_to_text.py
│ │ │ │ ├── text_to_speech.py
│ │ │ │ ├── translation.py
│ │ │ │ ├── translation_from_pretrained_bart.py
│ │ │ │ ├── translation_from_pretrained_xlm.py
│ │ │ │ ├── translation_lev.py
│ │ │ │ └── translation_multi_simple_epoch.py
│ │ │ ├── token_generation_constraints.py
│ │ │ ├── tokenizer.py
│ │ │ ├── trainer.py
│ │ │ ├── utils.py
│ │ │ └── version.txt
│ │ ├── fairseq_cli/
│ │ │ ├── __init__.py
│ │ │ ├── eval_lm.py
│ │ │ ├── generate.py
│ │ │ ├── hydra_train.py
│ │ │ ├── interactive.py
│ │ │ ├── preprocess.py
│ │ │ ├── score.py
│ │ │ ├── train.py
│ │ │ └── validate.py
│ │ ├── hubconf.py
│ │ ├── pyproject.toml
│ │ ├── scripts/
│ │ │ ├── __init__.py
│ │ │ ├── average_checkpoints.py
│ │ │ ├── build_sym_alignment.py
│ │ │ ├── compare_namespaces.py
│ │ │ ├── compound_split_bleu.sh
│ │ │ ├── constraints/
│ │ │ │ ├── extract.py
│ │ │ │ └── validate.py
│ │ │ ├── convert_dictionary.lua
│ │ │ ├── convert_model.lua
│ │ │ ├── count_docs.py
│ │ │ ├── read_binarized.py
│ │ │ ├── rm_pt.py
│ │ │ ├── sacrebleu.sh
│ │ │ ├── shard_docs.py
│ │ │ ├── split_train_valid_docs.py
│ │ │ ├── spm_decode.py
│ │ │ ├── spm_encode.py
│ │ │ ├── spm_train.py
│ │ │ └── test_fsdp.sh
│ │ ├── setup.cfg
│ │ ├── setup.py
│ │ └── train.py
│ ├── generate.py
│ ├── infinibatch/
│ │ ├── .gitattributes
│ │ ├── .github/
│ │ │ └── workflows/
│ │ │ ├── gh-pages.yml
│ │ │ └── unit_tests.yml
│ │ ├── .gitignore
│ │ ├── CODE_OF_CONDUCT.md
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── SECURITY.md
│ │ ├── docs/
│ │ │ └── config.mako
│ │ ├── infinibatch/
│ │ │ ├── __init__.py
│ │ │ ├── datasets.py
│ │ │ └── iterators.py
│ │ ├── pyproject.toml
│ │ ├── requirements.txt
│ │ ├── setup.py
│ │ └── test/
│ │ ├── test_datasets.py
│ │ ├── test_doctests.py
│ │ └── test_iterators.py
│ ├── interactive.py
│ ├── open_clip/
│ │ ├── .github/
│ │ │ └── workflows/
│ │ │ ├── ci.yml
│ │ │ └── python-publish.yml
│ │ ├── .gitignore
│ │ ├── CITATION.cff
│ │ ├── HISTORY.md
│ │ ├── LICENSE
│ │ ├── MANIFEST.in
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── docs/
│ │ │ ├── Interacting_with_open_clip.ipynb
│ │ │ └── clip_conceptual_captions.md
│ │ ├── requirements-test.txt
│ │ ├── requirements-training.txt
│ │ ├── requirements.txt
│ │ ├── setup.py
│ │ └── src/
│ │ ├── data/
│ │ │ └── gather_cc.py
│ │ ├── open_clip/
│ │ │ ├── __init__.py
│ │ │ ├── factory.py
│ │ │ ├── loss.py
│ │ │ ├── model.py
│ │ │ ├── model_configs/
│ │ │ │ ├── RN101-quickgelu.json
│ │ │ │ ├── RN101.json
│ │ │ │ ├── RN50-quickgelu.json
│ │ │ │ ├── RN50.json
│ │ │ │ ├── RN50x16.json
│ │ │ │ ├── RN50x4.json
│ │ │ │ ├── ViT-B-16-plus-240.json
│ │ │ │ ├── ViT-B-16-plus.json
│ │ │ │ ├── ViT-B-16.json
│ │ │ │ ├── ViT-B-32-plus-256.json
│ │ │ │ ├── ViT-B-32-quickgelu.json
│ │ │ │ ├── ViT-B-32.json
│ │ │ │ ├── ViT-H-14.json
│ │ │ │ ├── ViT-H-16.json
│ │ │ │ ├── ViT-L-14-280.json
│ │ │ │ ├── ViT-L-14-336.json
│ │ │ │ ├── ViT-L-14.json
│ │ │ │ ├── ViT-L-16-320.json
│ │ │ │ ├── ViT-L-16.json
│ │ │ │ ├── ViT-g-14.json
│ │ │ │ ├── timm-efficientnetv2_rw_s.json
│ │ │ │ ├── timm-resnet50d.json
│ │ │ │ ├── timm-resnetaa50d.json
│ │ │ │ ├── timm-resnetblur50.json
│ │ │ │ ├── timm-swin_base_patch4_window7_224.json
│ │ │ │ ├── timm-vit_base_patch16_224.json
│ │ │ │ ├── timm-vit_base_patch32_224.json
│ │ │ │ └── timm-vit_small_patch16_224.json
│ │ │ ├── openai.py
│ │ │ ├── pretrained.py
│ │ │ ├── timm_model.py
│ │ │ ├── tokenizer.py
│ │ │ ├── transform.py
│ │ │ ├── utils.py
│ │ │ └── version.py
│ │ └── training/
│ │ ├── .gitignore
│ │ ├── __init__.py
│ │ ├── data.py
│ │ ├── distributed.py
│ │ ├── imagenet_zeroshot_data.py
│ │ ├── logger.py
│ │ ├── main.py
│ │ ├── params.py
│ │ ├── scheduler.py
│ │ ├── train.py
│ │ └── zero_shot.py
│ ├── preprocess.py
│ ├── requirements.txt
│ ├── run_gradio.sh
│ ├── torchscale/
│ │ ├── .gitignore
│ │ ├── CODE_OF_CONDUCT.md
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── SECURITY.md
│ │ ├── SUPPORT.md
│ │ ├── examples/
│ │ │ ├── __init__.py
│ │ │ └── fairseq/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── generate.py
│ │ │ ├── interactive.py
│ │ │ ├── laion-token-base.sh
│ │ │ ├── laion-wild-token-base.sh
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bert.py
│ │ │ │ ├── language_modeling.py
│ │ │ │ └── machine_translation.py
│ │ │ ├── tasks/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── data/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── basic_loader.py
│ │ │ │ │ ├── laion_loader.py
│ │ │ │ │ ├── laion_loader_test.py
│ │ │ │ │ ├── lm_loader.py
│ │ │ │ │ ├── mlm_loader.py
│ │ │ │ │ ├── spm_lm_loader.py
│ │ │ │ │ ├── utils.py
│ │ │ │ │ ├── wild_loader.py
│ │ │ │ │ ├── wild_loader_test.py
│ │ │ │ │ └── wild_loader_test_2.py
│ │ │ │ ├── gpt_base.py
│ │ │ │ ├── pretraining.py
│ │ │ │ └── vl_gpt_base.py
│ │ │ ├── train.py
│ │ │ ├── utils/
│ │ │ │ ├── __init__.py
│ │ │ │ └── sparse_clip.py
│ │ │ └── wild-token-base.sh
│ │ ├── setup.py
│ │ └── torchscale/
│ │ ├── __init__.py
│ │ ├── architecture/
│ │ │ ├── __init__.py
│ │ │ ├── config.py
│ │ │ ├── decoder.py
│ │ │ ├── encoder.py
│ │ │ ├── encoder_decoder.py
│ │ │ └── utils.py
│ │ ├── component/
│ │ │ ├── __init__.py
│ │ │ ├── droppath.py
│ │ │ ├── embedding.py
│ │ │ ├── feedforward_network.py
│ │ │ ├── multihead_attention.py
│ │ │ ├── multiway_network.py
│ │ │ ├── relative_position_bias.py
│ │ │ ├── sope_relative_position.py
│ │ │ └── xmoe/
│ │ │ ├── __init__.py
│ │ │ ├── moe_layer.py
│ │ │ └── routing.py
│ │ └── model/
│ │ ├── BEiT3.py
│ │ └── __init__.py
│ ├── train.py
│ ├── train.sh
│ ├── unilm/
│ │ ├── __init__.py
│ │ ├── criterions/
│ │ │ ├── __init__.py
│ │ │ └── unigpt.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── basic_loader.py
│ │ │ ├── lm_loader.py
│ │ │ ├── spm_lm_loader.py
│ │ │ ├── utils.py
│ │ │ └── vl/
│ │ │ ├── Interleaved_loader.py
│ │ │ ├── laion2b_loader.py
│ │ │ ├── laion2b_obj_loader.py
│ │ │ ├── obj_utils.py
│ │ │ ├── vl_base_loader.py
│ │ │ └── vl_loader.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── connector.py
│ │ │ ├── gpt.py
│ │ │ ├── gpt_eval.py
│ │ │ ├── unigpt.py
│ │ │ └── vl/
│ │ │ ├── __init__.py
│ │ │ ├── clip.py
│ │ │ ├── openai.py
│ │ │ └── vlm_generator.py
│ │ └── tasks/
│ │ ├── __init__.py
│ │ ├── generation_obj.py
│ │ ├── gpt_base.py
│ │ └── gpt_interleaved_laion_obj.py
│ ├── validate.py
│ └── vl_setup_xl.sh
├── kosmos-2.5/
│ ├── CASES.md
│ ├── CODE_OF_CONDUCT.md
│ ├── LICENSE
│ ├── README.md
│ ├── SECURITY.md
│ ├── SUPPORT.md
│ ├── __init.py
│ ├── dict.txt
│ ├── draw_bbox.py
│ ├── inference.py
│ ├── kosmos2_5/
│ │ ├── __init__.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ └── utils.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── connector.py
│ │ │ ├── gpt.py
│ │ │ └── unigpt.py
│ │ └── tasks/
│ │ ├── __init__.py
│ │ └── generation.py
│ └── requirements.txt
├── layoutlm/
│ ├── README.md
│ └── deprecated/
│ ├── .flake8
│ ├── .gitignore
│ ├── .isort.cfg
│ ├── .pre-commit-config.yaml
│ ├── README.md
│ ├── examples/
│ │ ├── classification/
│ │ │ └── run_classification.py
│ │ └── seq_labeling/
│ │ ├── preprocess.py
│ │ ├── preprocess.sh
│ │ └── run_seq_labeling.py
│ ├── layoutlm/
│ │ ├── __init__.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── funsd.py
│ │ │ └── rvl_cdip.py
│ │ └── modeling/
│ │ ├── __init__.py
│ │ └── layoutlm.py
│ ├── mypy.ini
│ └── setup.py
├── layoutlmft/
│ ├── .gitignore
│ ├── Makefile
│ ├── README.md
│ ├── examples/
│ │ ├── run_funsd.py
│ │ ├── run_xfun_re.py
│ │ └── run_xfun_ser.py
│ ├── layoutlmft/
│ │ ├── __init__.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── data_args.py
│ │ │ ├── data_collator.py
│ │ │ ├── datasets/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── funsd.py
│ │ │ │ └── xfun.py
│ │ │ └── utils.py
│ │ ├── evaluation.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── layoutlm/
│ │ │ │ └── __init__.py
│ │ │ ├── layoutlmv2/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── configuration_layoutlmv2.py
│ │ │ │ ├── detectron2_config.py
│ │ │ │ ├── modeling_layoutlmv2.py
│ │ │ │ ├── tokenization_layoutlmv2.py
│ │ │ │ └── tokenization_layoutlmv2_fast.py
│ │ │ ├── layoutxlm/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── configuration_layoutxlm.py
│ │ │ │ ├── modeling_layoutxlm.py
│ │ │ │ ├── tokenization_layoutxlm.py
│ │ │ │ └── tokenization_layoutxlm_fast.py
│ │ │ └── model_args.py
│ │ ├── modules/
│ │ │ ├── __init__.py
│ │ │ └── decoders/
│ │ │ ├── __init__.py
│ │ │ └── re.py
│ │ ├── trainers/
│ │ │ ├── __init__.py
│ │ │ ├── funsd_trainer.py
│ │ │ └── xfun_trainer.py
│ │ └── utils.py
│ ├── pyproject.toml
│ ├── requirements.txt
│ ├── setup.cfg
│ └── setup.py
├── layoutlmv2/
│ └── README.md
├── layoutlmv3/
│ ├── .gitignore
│ ├── README.md
│ ├── examples/
│ │ ├── object_detection/
│ │ │ ├── adaptive_binarize.py
│ │ │ ├── cascade_layoutlmv3.yaml
│ │ │ ├── convert_to_coco_format.py
│ │ │ ├── ditod/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── backbone.py
│ │ │ │ ├── beit.py
│ │ │ │ ├── config.py
│ │ │ │ ├── dataset_mapper.py
│ │ │ │ ├── deit.py
│ │ │ │ ├── icdar_evaluation.py
│ │ │ │ ├── mycheckpointer.py
│ │ │ │ ├── mytrainer.py
│ │ │ │ ├── rcnn_vl.py
│ │ │ │ └── table_evaluation/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── data_structure.py
│ │ │ │ └── evaluate.py
│ │ │ └── train_net.py
│ │ ├── run_funsd_cord.py
│ │ └── run_xfund.py
│ ├── layoutlmft/
│ │ ├── __init__.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── cord.py
│ │ │ ├── data_collator.py
│ │ │ ├── funsd.py
│ │ │ ├── image_utils.py
│ │ │ └── xfund.py
│ │ └── models/
│ │ ├── __init__.py
│ │ └── layoutlmv3/
│ │ ├── __init__.py
│ │ ├── configuration_layoutlmv3.py
│ │ ├── modeling_layoutlmv3.py
│ │ ├── tokenization_layoutlmv3.py
│ │ └── tokenization_layoutlmv3_fast.py
│ ├── requirements.txt
│ ├── setup.cfg
│ └── setup.py
├── layoutreader/
│ ├── README.md
│ ├── decode_seq2seq.py
│ ├── run_seq2seq.py
│ ├── s2s_ft/
│ │ ├── config.py
│ │ ├── configuration_minilm.py
│ │ ├── configuration_unilm.py
│ │ ├── convert_state_dict.py
│ │ ├── modeling.py
│ │ ├── modeling_decoding.py
│ │ ├── s2s_loader.py
│ │ ├── tokenization_minilm.py
│ │ ├── tokenization_unilm.py
│ │ └── utils.py
│ └── setup.py
├── layoutxlm/
│ └── README.md
├── longnet/
│ └── README.md
├── longvit/
│ └── README.md
├── markuplm/
│ ├── README.md
│ ├── examples/
│ │ └── fine_tuning/
│ │ ├── run_swde/
│ │ │ ├── constants.py
│ │ │ ├── eval_utils.py
│ │ │ ├── pack_data.py
│ │ │ ├── prepare_data.py
│ │ │ ├── run.py
│ │ │ └── utils.py
│ │ └── run_websrc/
│ │ ├── dataset_generation.py
│ │ ├── draft.py
│ │ ├── run.py
│ │ ├── utils.py
│ │ └── utils_evaluate.py
│ ├── markuplmft/
│ │ ├── __init__.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ └── tag_utils.py
│ │ └── models/
│ │ ├── __init__.py
│ │ └── markuplm/
│ │ ├── __init__.py
│ │ ├── configuration_markuplm.py
│ │ ├── modeling_markuplm.py
│ │ ├── tokenization_markuplm.py
│ │ └── tokenization_markuplm_fast.py
│ ├── requirements.txt
│ └── setup.py
├── mathscale/
│ ├── MWPBench/
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ ├── data/
│ │ │ ├── fresh_gaokao_math_2023.json
│ │ │ ├── full_test.json
│ │ │ └── full_train.json
│ │ ├── eval_openai/
│ │ │ └── driver.py
│ │ ├── eval_vllm/
│ │ │ ├── driver.py
│ │ │ └── util.py
│ │ ├── requirements.txt
│ │ └── scripts/
│ │ ├── eval_openai.alpaca_force_ans.sh
│ │ ├── eval_openai.freshgaokao.alpaca_force_ans.sh
│ │ ├── eval_vllm.alpaca.4gpus.sh
│ │ └── eval_vllm.freshgaokao.alpaca.4gpus.sh
│ └── README.md
├── metalm/
│ └── README.md
├── minilm/
│ ├── README.md
│ └── examples/
│ └── run_xnli.py
├── retnet/
│ └── README.md
├── s2s-ft/
│ ├── .gitignore
│ ├── README.md
│ ├── decode_seq2seq.py
│ ├── evaluations/
│ │ ├── bs_pyrouge.py
│ │ ├── eval_for_cnndm.py
│ │ ├── eval_for_gigaword.py
│ │ └── eval_for_xsum.py
│ ├── gen_seq_from_trace.py
│ ├── run_seq2seq.py
│ ├── s2s_ft/
│ │ ├── config.py
│ │ ├── configuration_minilm.py
│ │ ├── configuration_unilm.py
│ │ ├── convert_state_dict.py
│ │ ├── modeling.py
│ │ ├── modeling_decoding.py
│ │ ├── s2s_loader.py
│ │ ├── tokenization_minilm.py
│ │ ├── tokenization_unilm.py
│ │ └── utils.py
│ └── setup.py
├── simlm/
│ ├── README.md
│ ├── ds_config.json
│ ├── misc/
│ │ ├── compute_metrics_marco.py
│ │ ├── dpr/
│ │ │ ├── evaluate_dpr_retrieval.py
│ │ │ ├── format_and_evaluate.py
│ │ │ └── mine_hard_negatives.py
│ │ ├── marco_pred_to_cases.py
│ │ └── prepare_msmarco_data.py
│ ├── requirements.txt
│ ├── scripts/
│ │ ├── download_msmarco_data.sh
│ │ ├── dpr/
│ │ │ ├── encode_wiki.sh
│ │ │ ├── eval_dpr.sh
│ │ │ ├── nq_gen_kd_teacher_scores.sh
│ │ │ ├── rerank_nq.sh
│ │ │ ├── search_dpr.sh
│ │ │ ├── train_nq_biencoder.sh
│ │ │ ├── train_nq_kd.sh
│ │ │ └── train_nq_reranker.sh
│ │ ├── encode_marco.sh
│ │ ├── gen_kd_teacher_scores.sh
│ │ ├── rerank_marco.sh
│ │ ├── search_marco.sh
│ │ ├── train_biencoder_marco.sh
│ │ ├── train_kd_biencoder.sh
│ │ ├── train_reranker_marco.sh
│ │ └── train_rlm.sh
│ └── src/
│ ├── __init__.py
│ ├── collators/
│ │ ├── __init__.py
│ │ ├── biencoder_collator.py
│ │ ├── collator_utils.py
│ │ ├── cross_encoder_collator.py
│ │ └── rlm_collator.py
│ ├── config.py
│ ├── data_utils.py
│ ├── inference/
│ │ ├── __init__.py
│ │ ├── encode_main.py
│ │ ├── gen_teacher_scores.py
│ │ ├── rerank_main.py
│ │ └── search_main.py
│ ├── loaders/
│ │ ├── __init__.py
│ │ ├── biencoder_dataloader.py
│ │ ├── cross_encoder_dataloader.py
│ │ ├── loader_utils.py
│ │ └── rlm_dataloader.py
│ ├── logger_config.py
│ ├── metrics.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── biencoder_model.py
│ │ ├── cross_encoder_model.py
│ │ └── rlm.py
│ ├── train_biencoder.py
│ ├── train_cross_encoder.py
│ ├── train_rlm.py
│ ├── trainers/
│ │ ├── __init__.py
│ │ ├── biencoder_trainer.py
│ │ ├── reranker_trainer.py
│ │ └── rlm_trainer.py
│ └── utils.py
├── speechlm/
│ ├── README.md
│ ├── SpeechLM.py
│ ├── dataset/
│ │ ├── CommonVoice/
│ │ │ └── v4/
│ │ │ └── en/
│ │ │ └── en-de/
│ │ │ ├── config_base_ende.yaml
│ │ │ ├── config_large_ende.yaml
│ │ │ ├── dev-sample100_st_en_de_local.tsv
│ │ │ ├── spm_char_st_en_de.model
│ │ │ ├── spm_char_st_en_de.txt
│ │ │ └── spm_char_st_en_de.vocab
│ │ ├── LibriLM/
│ │ │ ├── hidden_unit/
│ │ │ │ └── bin-idx/
│ │ │ │ ├── config.yaml
│ │ │ │ └── dict.km.txt
│ │ │ └── phone_unit/
│ │ │ └── bin-idx/
│ │ │ ├── config.yaml
│ │ │ ├── dict.ltr.txt
│ │ │ └── dict.phn.txt
│ │ └── LibriSpeech/
│ │ ├── asr/
│ │ │ ├── train_sample100.ltr
│ │ │ └── train_sample100.tsv
│ │ ├── fast_phone2unit/
│ │ │ ├── config.yaml
│ │ │ ├── config_generate.yaml
│ │ │ ├── dict.km.txt
│ │ │ ├── genset_examples.tsv
│ │ │ └── train_exmples.tsv
│ │ ├── hidden_unit/
│ │ │ ├── dict.km.txt
│ │ │ ├── train_sample100.km
│ │ │ └── train_sample100.tsv
│ │ └── phone_unit/
│ │ ├── dict.phn.txt
│ │ ├── train_sample100.phn
│ │ └── train_sample100.tsv
│ ├── modules.py
│ ├── speechlm/
│ │ ├── __init__.py
│ │ ├── config/
│ │ │ ├── decode/
│ │ │ │ ├── infer_fsqlm.yaml
│ │ │ │ ├── infer_kenlm.yaml
│ │ │ │ └── infer_viterbi.yaml
│ │ │ ├── finetune/
│ │ │ │ ├── speechlm_base_100h.yaml
│ │ │ │ └── speechlm_large_960h.yaml
│ │ │ └── pretrain/
│ │ │ ├── speechlm_base_librispeech.yaml
│ │ │ ├── speechlm_large_librilight.yaml
│ │ │ └── speechlmp_base_cfg.pt
│ │ ├── criterions/
│ │ │ ├── __init__.py
│ │ │ ├── fasttext2unit_loss.py
│ │ │ └── speechlm_criterion.py
│ │ ├── data/
│ │ │ ├── concat_dataset.py
│ │ │ ├── hubert_dataset.py
│ │ │ ├── language_trible_dataset.py
│ │ │ ├── load_langpair_dataset.py
│ │ │ ├── multimodal_corpus_dataset.py
│ │ │ └── text_to_unit_dataset.py
│ │ ├── data_process/
│ │ │ ├── covost2/
│ │ │ │ ├── mp3_to_wav.py
│ │ │ │ └── prepare_covost_data.py
│ │ │ ├── filter_paireddata_by_len.py
│ │ │ ├── get_t2u_manifest.py
│ │ │ ├── get_t2u_manifest_textonly.py
│ │ │ ├── phoneize_with_sil.py
│ │ │ ├── phoneme_tokenizer/
│ │ │ │ ├── ltr2kaldi_phn_sil025.py
│ │ │ │ ├── mean5_and_std25_sil14_spn32.dict
│ │ │ │ └── repeat_withou_insert_sil_less_4375.py
│ │ │ ├── prepare_covost2_enxx.sh
│ │ │ ├── prepare_phn2ltr_librilm.sh
│ │ │ ├── txt2idx.sh
│ │ │ └── wrd2ltr.py
│ │ ├── generate_unit.py
│ │ ├── infer.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── fasttext2unit.py
│ │ │ ├── speechlm.py
│ │ │ ├── speechlm_ctcasr.py
│ │ │ └── speechlm_st.py
│ │ ├── modules/
│ │ │ ├── __init__.py
│ │ │ ├── learned_positional_embedding.py
│ │ │ ├── multihead_attention.py
│ │ │ ├── relative_pos_enc.py
│ │ │ ├── transformer_decoder.py
│ │ │ ├── transformer_encoder.py
│ │ │ ├── transformer_layer.py
│ │ │ └── w2v_encoder.py
│ │ ├── scripts/
│ │ │ ├── pretrain_speechlm/
│ │ │ │ ├── base_speechlmh.sh
│ │ │ │ ├── base_speechlmp.sh
│ │ │ │ └── large_speechlmp.sh
│ │ │ ├── tokenizer_fastT2U/
│ │ │ │ ├── generate.sh
│ │ │ │ ├── infer.sh
│ │ │ │ └── train_s_5e-4.sh
│ │ │ ├── tune_speechlm_asr/
│ │ │ │ ├── finetune_base_ctc.sh
│ │ │ │ ├── finetune_large_ctc.sh
│ │ │ │ ├── inference_ctc.sh
│ │ │ │ ├── inference_ctc_kenlm.sh
│ │ │ │ ├── inference_ctc_large.sh
│ │ │ │ └── inference_ctc_large_fsqlm.sh
│ │ │ └── tune_speechlm_st/
│ │ │ ├── ft_base_covost_enxx.sh
│ │ │ ├── ft_large_covost_enxx.sh
│ │ │ ├── inference_base.sh
│ │ │ └── inference_large.sh
│ │ ├── tasks/
│ │ │ ├── fast_text_to_unit.py
│ │ │ └── joint_sc2t_pretrain.py
│ │ └── unit_generator.py
│ └── speechlm_README.md
├── speecht5/
│ ├── README.md
│ ├── scripts/
│ │ └── generate_speech.py
│ └── speecht5/
│ ├── __init__.py
│ ├── criterions/
│ │ ├── __init__.py
│ │ ├── speech_pretrain_criterion.py
│ │ ├── speech_to_text_loss.py
│ │ ├── speecht5_criterion.py
│ │ ├── text_pretrain_criterion.py
│ │ └── text_to_speech_loss.py
│ ├── data/
│ │ ├── __init__.py
│ │ ├── multitask_dataset.py
│ │ ├── speech_dataset.py
│ │ ├── speech_to_speech_dataset.py
│ │ ├── speech_to_text_dataset.py
│ │ ├── text_dataset.py
│ │ └── text_to_speech_dataset.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── modules/
│ │ │ ├── __init__.py
│ │ │ ├── decoder.py
│ │ │ ├── encoder.py
│ │ │ ├── multihead_attention.py
│ │ │ ├── speaker_decoder_postnet.py
│ │ │ ├── speech_decoder_postnet.py
│ │ │ ├── speech_decoder_prenet.py
│ │ │ ├── speech_encoder_postnet.py
│ │ │ ├── speech_encoder_prenet.py
│ │ │ ├── text_decoder_postnet.py
│ │ │ ├── text_decoder_prenet.py
│ │ │ ├── text_encoder_prenet.py
│ │ │ └── transformer_layer.py
│ │ ├── speecht5.py
│ │ └── t5_transformer_lm.py
│ ├── sequence_generator.py
│ └── tasks/
│ ├── __init__.py
│ └── speecht5.py
├── storage/
│ ├── unilm-base-cased-config.json
│ ├── unilm-base-cased-vocab.txt
│ ├── unilm-large-cased-config.json
│ └── unilm-large-cased-vocab.txt
├── textdiffuser/
│ ├── README.md
│ ├── assets/
│ │ ├── examples/
│ │ │ ├── text-inpainting/
│ │ │ │ ├── case1.txt
│ │ │ │ ├── case2.txt
│ │ │ │ ├── case3.txt
│ │ │ │ └── case4.txt
│ │ │ ├── text-to-image/
│ │ │ │ ├── case1.txt
│ │ │ │ ├── case2.txt
│ │ │ │ └── case3.txt
│ │ │ └── text-to-image-with-template/
│ │ │ ├── case1.txt
│ │ │ ├── case2.txt
│ │ │ └── case3.txt
│ │ ├── files/
│ │ │ ├── modeling_utils.py
│ │ │ ├── scheduling_ddpm.py
│ │ │ └── unet_2d_condition.py
│ │ └── font/
│ │ └── .gitkeep
│ ├── data/
│ │ ├── mario-laion-example/
│ │ │ ├── 06269/
│ │ │ │ ├── 062690093/
│ │ │ │ │ ├── caption.txt
│ │ │ │ │ ├── charseg.npy
│ │ │ │ │ ├── info.json
│ │ │ │ │ └── ocr.txt
│ │ │ │ ├── 062692210/
│ │ │ │ │ ├── caption.txt
│ │ │ │ │ ├── charseg.npy
│ │ │ │ │ ├── info.json
│ │ │ │ │ └── ocr.txt
│ │ │ │ └── 062692530/
│ │ │ │ ├── caption.txt
│ │ │ │ ├── charseg.npy
│ │ │ │ ├── info.json
│ │ │ │ └── ocr.txt
│ │ │ └── 27197/
│ │ │ ├── 271975131/
│ │ │ │ ├── caption.txt
│ │ │ │ ├── charseg.npy
│ │ │ │ ├── info.json
│ │ │ │ └── ocr.txt
│ │ │ ├── 271975251/
│ │ │ │ ├── caption.txt
│ │ │ │ ├── charseg.npy
│ │ │ │ ├── info.json
│ │ │ │ └── ocr.txt
│ │ │ └── 271978467/
│ │ │ ├── caption.txt
│ │ │ ├── charseg.npy
│ │ │ ├── info.json
│ │ │ └── ocr.txt
│ │ ├── mario-laion-unzip.py
│ │ └── visualize_charseg.ipynb
│ ├── eval/
│ │ ├── MARIOEval_evaluate.py
│ │ ├── MARIOEval_generate.py
│ │ ├── README.md
│ │ ├── clipscore.py
│ │ ├── evaluate.sh
│ │ ├── fid_score.py
│ │ ├── generate.sh
│ │ ├── inception.py
│ │ ├── ocr_eval.py
│ │ └── requirements.txt
│ ├── evaluate.py
│ ├── gradio_app.py
│ ├── inference.py
│ ├── model/
│ │ ├── layout_generator.py
│ │ ├── layout_transformer.py
│ │ └── text_segmenter/
│ │ ├── unet.py
│ │ └── unet_parts.py
│ ├── requirements.txt
│ ├── text-inpainting.sh
│ ├── text-to-image-with-template.sh
│ ├── text-to-image.sh
│ ├── textdiffuser-ckpt/
│ │ └── .gitkeep
│ ├── train.py
│ ├── train.sh
│ └── util.py
├── textdiffuser-2/
│ ├── README.md
│ ├── assets/
│ │ ├── attention_processor.py
│ │ └── reference_requirements.txt
│ ├── cog.yaml
│ ├── data/
│ │ ├── check_layout_planner_data.py
│ │ └── layout_planner_data_5k.json
│ ├── demo_textdiffuser2_inpainting_full.py
│ ├── demo_textdiffuser2_t2i_full.py
│ ├── extensions/
│ │ ├── angle_template_file.txt
│ │ ├── inference_textdiffuser2_t2i_full_angle.py
│ │ ├── inference_textdiffuser2_t2i_full_angle.sh
│ │ ├── inference_textdiffuser2_t2i_full_quadrilateral.py
│ │ ├── inference_textdiffuser2_t2i_full_quadrilateral.sh
│ │ ├── quadrilateral_template_file.txt
│ │ ├── train_textdiffuser2_t2i_full_angle.py
│ │ ├── train_textdiffuser2_t2i_full_angle.sh
│ │ ├── train_textdiffuser2_t2i_full_quadrilateral.py
│ │ └── train_textdiffuser2_t2i_full_quadrilateral.sh
│ ├── inference_textdiffuser2_t2i_full.py
│ ├── inference_textdiffuser2_t2i_full.sh
│ ├── inference_textdiffuser2_t2i_lora.py
│ ├── inference_textdiffuser2_t2i_lora.sh
│ ├── predict.py
│ ├── requirements.txt
│ ├── train_layout_planner.sh
│ ├── train_textdiffuser2_inpainting_full.py
│ ├── train_textdiffuser2_inpainting_full.sh
│ ├── train_textdiffuser2_t2i_full.py
│ ├── train_textdiffuser2_t2i_full.sh
│ ├── train_textdiffuser2_t2i_lora.py
│ └── train_textdiffuser2_t2i_lora.sh
├── trocr/
│ ├── README.md
│ ├── __init__.py
│ ├── augmentation/
│ │ ├── __init__.py
│ │ ├── blur.py
│ │ ├── camera.py
│ │ ├── geometry.py
│ │ ├── noise.py
│ │ ├── ops.py
│ │ ├── pattern.py
│ │ ├── process.py
│ │ ├── test.py
│ │ ├── warp.py
│ │ └── weather.py
│ ├── bpe.py
│ ├── convert_to_SROIE_format.py
│ ├── data.py
│ ├── data_aug.py
│ ├── deit.py
│ ├── generator.py
│ ├── pic_inference.ipynb
│ ├── pic_inference.py
│ ├── requirements.txt
│ ├── scoring.py
│ ├── task.py
│ ├── trocr_models.py
│ ├── unilm3-cased.model
│ ├── unilm_models.py
│ └── vit_models.py
├── unilm/
│ └── README.md
├── unilm-v1/
│ ├── README.md
│ └── src/
│ ├── biunilm/
│ │ ├── __init__.py
│ │ ├── decode_seq2seq.py
│ │ ├── gen_seq_from_trace.py
│ │ ├── loader_utils.py
│ │ ├── run_seq2seq.py
│ │ └── seq2seq_loader.py
│ ├── cnndm/
│ │ ├── __init__.py
│ │ ├── bs_pyrouge.py
│ │ └── eval.py
│ ├── gigaword/
│ │ ├── __init__.py
│ │ ├── bs_pyrouge.py
│ │ └── eval.py
│ ├── nn/
│ │ ├── __init__.py
│ │ └── data_parallel.py
│ ├── pytorch_pretrained_bert/
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── file_utils.py
│ │ ├── loss.py
│ │ ├── modeling.py
│ │ ├── optimization.py
│ │ ├── optimization_fp16.py
│ │ └── tokenization.py
│ ├── qg/
│ │ ├── eval.py
│ │ └── eval_on_unilm_tokenized_ref.py
│ └── setup.py
├── unimim/
│ └── README.md
├── valle/
│ └── README.md
├── vl-beit/
│ └── README.md
├── vlmo/
│ ├── DATA.md
│ ├── README.md
│ ├── requirements.txt
│ ├── run.py
│ ├── setup.py
│ └── vlmo/
│ ├── __init__.py
│ ├── config.py
│ ├── datamodules/
│ │ ├── __init__.py
│ │ ├── coco_caption_karpathy_datamodule.py
│ │ ├── conceptual_caption_datamodule.py
│ │ ├── datamodule_base.py
│ │ ├── f30k_caption_karpathy_datamodule.py
│ │ ├── multitask_datamodule.py
│ │ ├── nlvr2_datamodule.py
│ │ ├── sbu_datamodule.py
│ │ ├── vg_caption_datamodule.py
│ │ ├── vqav2_datamodule.py
│ │ └── wikibk_datamodule.py
│ ├── datasets/
│ │ ├── __init__.py
│ │ ├── base_dataset.py
│ │ ├── coco_caption_karpathy_dataset.py
│ │ ├── conceptual_caption_dataset.py
│ │ ├── f30k_caption_karpathy_dataset.py
│ │ ├── nlvr2_dataset.py
│ │ ├── sbu_caption_dataset.py
│ │ ├── vg_caption_dataset.py
│ │ ├── vqav2_dataset.py
│ │ └── wikibk_dataset.py
│ ├── gadgets/
│ │ ├── __init__.py
│ │ └── my_metrics.py
│ ├── modules/
│ │ ├── __init__.py
│ │ ├── dist_utils.py
│ │ ├── heads.py
│ │ ├── multiway_transformer.py
│ │ ├── objectives.py
│ │ ├── vlmo_module.py
│ │ └── vlmo_utils.py
│ ├── transforms/
│ │ ├── __init__.py
│ │ ├── pixelbert.py
│ │ ├── randaug.py
│ │ ├── randaugment.py
│ │ ├── square_transform.py
│ │ └── utils.py
│ └── utils/
│ ├── glossary.py
│ ├── write_coco_karpathy.py
│ ├── write_conceptual_caption.py
│ ├── write_f30k_karpathy.py
│ ├── write_nlvr2.py
│ ├── write_sbu.py
│ ├── write_vg.py
│ ├── write_vqa.py
│ └── write_wikibk.py
├── wavlm/
│ ├── README.md
│ ├── WavLM.py
│ └── modules.py
├── xdoc/
│ ├── README.md
│ └── fine_tuning/
│ ├── README.md
│ ├── funsd/
│ │ ├── layoutlmft/
│ │ │ ├── __init__.py
│ │ │ ├── data/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── data_args.py
│ │ │ │ ├── data_collator.py
│ │ │ │ ├── datasets/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── funsd.py
│ │ │ │ │ └── xfun.py
│ │ │ │ └── utils.py
│ │ │ ├── evaluation.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── layoutlm/
│ │ │ │ │ └── __init__.py
│ │ │ │ ├── layoutlmv2/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── configuration_layoutlmv2.py
│ │ │ │ │ ├── detectron2_config.py
│ │ │ │ │ ├── modeling_layoutlmv2.py
│ │ │ │ │ ├── tokenization_layoutlmv2.py
│ │ │ │ │ └── tokenization_layoutlmv2_fast.py
│ │ │ │ ├── layoutxlm/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── configuration_layoutxlm.py
│ │ │ │ │ ├── modeling_layoutxlm.py
│ │ │ │ │ ├── tokenization_layoutxlm.py
│ │ │ │ │ └── tokenization_layoutxlm_fast.py
│ │ │ │ └── model_args.py
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ └── decoders/
│ │ │ │ ├── __init__.py
│ │ │ │ └── re.py
│ │ │ ├── trainers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── funsd_trainer.py
│ │ │ │ └── xfun_trainer.py
│ │ │ └── utils.py
│ │ ├── model.py
│ │ ├── requirements.txt
│ │ ├── run_funsd.py
│ │ └── run_funsd.sh
│ ├── squad/
│ │ ├── requirements.txt
│ │ ├── run_squad.py
│ │ ├── run_squad_v1.sh
│ │ ├── run_squad_v2.sh
│ │ ├── trainer_qa.py
│ │ ├── trainer_seq2seq_qa.py
│ │ └── utils_qa.py
│ └── websrc/
│ ├── args.py
│ ├── cache/
│ │ └── .gitkeep
│ ├── model.py
│ ├── requirements.txt
│ ├── run_websrc.py
│ ├── trainer.py
│ ├── util.py
│ ├── web_tag_utils.py
│ └── websrc.py
├── xlmt/
│ └── README.md
├── xmoe/
│ └── README.md
└── xtune/
├── README.md
├── scripts/
│ ├── cross-lingual-transfer/
│ │ ├── train_mlqa.sh
│ │ ├── train_panx.sh
│ │ ├── train_pawsx.sh
│ │ ├── train_tydiqa.sh
│ │ ├── train_udpos.sh
│ │ ├── train_xnli.sh
│ │ └── train_xquad.sh
│ ├── download_data.sh
│ ├── download_model.sh
│ ├── preprocess_panx.sh
│ ├── preprocess_udpos.sh
│ ├── train.sh
│ └── translate-train-all/
│ ├── train_mlqa.sh
│ ├── train_panx.sh
│ ├── train_pawsx.sh
│ ├── train_tydiqa.sh
│ ├── train_udpos.sh
│ ├── train_xnli.sh
│ └── train_xquad.sh
├── setup.py
├── src/
│ ├── pequod/
│ │ ├── __init__.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── dataloader.py
│ │ │ ├── sampler.py
│ │ │ ├── utils_squad.py
│ │ │ ├── utils_squad_evaluate.py
│ │ │ ├── wili.py
│ │ │ ├── xdoc.py
│ │ │ ├── xqa.py
│ │ │ └── xretrieval.py
│ │ ├── eval/
│ │ │ ├── __init__.py
│ │ │ ├── bretrieval.py
│ │ │ ├── evaluator.py
│ │ │ ├── utils_retrieve.py
│ │ │ └── xretrieval.py
│ │ ├── io.py
│ │ ├── model/
│ │ │ ├── __init__.py
│ │ │ └── roberta.py
│ │ ├── optim/
│ │ │ ├── __init__.py
│ │ │ ├── la.py
│ │ │ └── la0.py
│ │ ├── text/
│ │ │ ├── __init__.py
│ │ │ └── tokenization_sentencepiece.py
│ │ ├── tools/
│ │ │ ├── __init__.py
│ │ │ └── convert.py
│ │ └── training/
│ │ ├── __init__.py
│ │ ├── trainer.py
│ │ └── xtrainer.py
│ ├── run_cls.py
│ ├── run_qa.py
│ ├── run_tag.py
│ ├── tools/
│ │ ├── __init__.py
│ │ ├── check_many2many_alignment.py
│ │ ├── dump_hf_state_dict.py
│ │ ├── get_eval_results.py
│ │ ├── sample_xnli.py
│ │ └── xnli_sampling_statistics.py
│ ├── transformers/
│ │ ├── __init__.py
│ │ ├── activations.py
│ │ ├── commands/
│ │ │ ├── __init__.py
│ │ │ ├── convert.py
│ │ │ ├── download.py
│ │ │ ├── env.py
│ │ │ ├── run.py
│ │ │ ├── serving.py
│ │ │ ├── train.py
│ │ │ └── user.py
│ │ ├── configuration_albert.py
│ │ ├── configuration_auto.py
│ │ ├── configuration_bart.py
│ │ ├── configuration_bert.py
│ │ ├── configuration_camembert.py
│ │ ├── configuration_ctrl.py
│ │ ├── configuration_distilbert.py
│ │ ├── configuration_flaubert.py
│ │ ├── configuration_gpt2.py
│ │ ├── configuration_mmbt.py
│ │ ├── configuration_openai.py
│ │ ├── configuration_roberta.py
│ │ ├── configuration_t5.py
│ │ ├── configuration_transfo_xl.py
│ │ ├── configuration_utils.py
│ │ ├── configuration_xlm.py
│ │ ├── configuration_xlm_roberta.py
│ │ ├── configuration_xlnet.py
│ │ ├── convert_albert_original_tf_checkpoint_to_pytorch.py
│ │ ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py
│ │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py
│ │ ├── convert_bert_pytorch_checkpoint_to_original_tf.py
│ │ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py
│ │ ├── convert_openai_original_tf_checkpoint_to_pytorch.py
│ │ ├── convert_pytorch_checkpoint_to_tf2.py
│ │ ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py
│ │ ├── convert_t5_original_tf_checkpoint_to_pytorch.py
│ │ ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
│ │ ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py
│ │ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── metrics/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── evaluate_mlqa.py
│ │ │ │ ├── evaluate_squad.py
│ │ │ │ ├── mlqa_evaluation_v1.py
│ │ │ │ └── squad_metrics.py
│ │ │ └── processors/
│ │ │ ├── __init__.py
│ │ │ ├── glue.py
│ │ │ ├── squad.py
│ │ │ ├── utils.py
│ │ │ ├── xglue.py
│ │ │ ├── xnli.py
│ │ │ └── xtreme.py
│ │ ├── file_utils.py
│ │ ├── hf_api.py
│ │ ├── modelcard.py
│ │ ├── modeling_albert.py
│ │ ├── modeling_auto.py
│ │ ├── modeling_bart.py
│ │ ├── modeling_bert.py
│ │ ├── modeling_camembert.py
│ │ ├── modeling_ctrl.py
│ │ ├── modeling_distilbert.py
│ │ ├── modeling_encoder_decoder.py
│ │ ├── modeling_flaubert.py
│ │ ├── modeling_gpt2.py
│ │ ├── modeling_mmbt.py
│ │ ├── modeling_openai.py
│ │ ├── modeling_roberta.py
│ │ ├── modeling_t5.py
│ │ ├── modeling_tf_albert.py
│ │ ├── modeling_tf_auto.py
│ │ ├── modeling_tf_bert.py
│ │ ├── modeling_tf_camembert.py
│ │ ├── modeling_tf_ctrl.py
│ │ ├── modeling_tf_distilbert.py
│ │ ├── modeling_tf_gpt2.py
│ │ ├── modeling_tf_openai.py
│ │ ├── modeling_tf_pytorch_utils.py
│ │ ├── modeling_tf_roberta.py
│ │ ├── modeling_tf_t5.py
│ │ ├── modeling_tf_transfo_xl.py
│ │ ├── modeling_tf_transfo_xl_utilities.py
│ │ ├── modeling_tf_utils.py
│ │ ├── modeling_tf_xlm.py
│ │ ├── modeling_tf_xlm_roberta.py
│ │ ├── modeling_tf_xlnet.py
│ │ ├── modeling_transfo_xl.py
│ │ ├── modeling_transfo_xl_utilities.py
│ │ ├── modeling_utils.py
│ │ ├── modeling_xlm.py
│ │ ├── modeling_xlm_roberta.py
│ │ ├── modeling_xlnet.py
│ │ ├── optimization.py
│ │ ├── optimization_tf.py
│ │ ├── pipelines.py
│ │ ├── tokenization_albert.py
│ │ ├── tokenization_auto.py
│ │ ├── tokenization_bart.py
│ │ ├── tokenization_bert.py
│ │ ├── tokenization_bert_japanese.py
│ │ ├── tokenization_camembert.py
│ │ ├── tokenization_ctrl.py
│ │ ├── tokenization_distilbert.py
│ │ ├── tokenization_flaubert.py
│ │ ├── tokenization_gpt2.py
│ │ ├── tokenization_openai.py
│ │ ├── tokenization_roberta.py
│ │ ├── tokenization_t5.py
│ │ ├── tokenization_transfo_xl.py
│ │ ├── tokenization_utils.py
│ │ ├── tokenization_xlm.py
│ │ ├── tokenization_xlm_roberta.py
│ │ ├── tokenization_xlnet.py
│ │ └── utils_encoder_decoder.py
│ ├── ud-conversion-tools/
│ │ └── conllu_to_conll.py
│ └── utils_tag.py
├── transformers-cli
└── utils_preprocess.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''
---
**Describe the bug**
Model I am using (UniLM, MiniLM, LayoutLM ...):
The problem arises when using:
* [ ] the official example scripts: (give details below)
* [ ] my own modified scripts: (give details below)
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior:
1.
2.
3.
**Expected behavior**
A clear and concise description of what you expected to happen.
- Platform:
- Python version:
- PyTorch version (GPU?):
================================================
FILE: .github/ISSUE_TEMPLATE/custom.md
================================================
---
name: Custom issue template
about: Describe this issue template's purpose here.
title: ''
labels: ''
assignees: ''
---
**Describe**
Model I am using (UniLM, MiniLM, LayoutLM ...):
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
speechlm/dataset/LibriSpeech/fast_phone2unit/dict.PHN.txt
speechlm/dataset/LibriSpeech/fast_phone2unit/dict.phn.txt
================================================
FILE: .gitmodules
================================================
[submodule "deltalm/fairseq"]
path = deltalm/fairseq
url = https://github.com/pytorch/fairseq.git
[submodule "speechlm/fairseq"]
path = speechlm/fairseq
url = https://github.com/facebookresearch/fairseq.git
[submodule "speecht5/fairseq"]
path = speecht5/fairseq
url = https://github.com/facebookresearch/fairseq.git
================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Microsoft Open Source Code of Conduct
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
Resources:
- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
================================================
FILE: CONTRIBUTING.md
================================================
# Contributing
This project welcomes contributions and suggestions. Most contributions require you to
agree to a Contributor License Agreement (CLA) declaring that you have the right to,
and actually do, grant us the rights to use your contribution. For details, visit
https://cla.microsoft.com.
When you submit a pull request, a CLA-bot will automatically determine whether you need
to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the
instructions provided by the bot. You will only need to do this once across all repositories using our CLA.
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
================================================
FILE: Diff-Transformer/Diff-Transformer-V2/README.md
================================================
# Differential Transformer V2 (DIFF V2)
[Read the blog post here](https://spiky-homegrown-4cb.notion.site/Differential-Transformer-V2-2e7baa052def80ecaa93d4d67d125417)
The implementation is provided in `multihead_flashdiffv2.py`.
## TL;DR
We introduce **Differential Transformer V2** (DIFF V2), an improved version of [Differential Transformer](https://arxiv.org/abs/2410.05258) (DIFF V1). This revision focuses on inference efficiency, training stability for production-level LLMs, and architectural elegance.
### Key Improvements
1. **Faster Inference & No Need for Custom Attention Kernels**
Instead of forcing the attention parameter count to match the baseline Transformer (as in DIFF V1), we introduce additional parameters for $Q_2$. This design allows DIFF V2 to match the baseline Transformer’s decoding speed and directly use [FlashAttention](https://github.com/Dao-AILab/flash-attention) without custom kernels.
2. **Improved Training Stability**
We remove the per-head RMSNorm after differential attention. We find that the per-head RMSNorm can lead to instability in the later stages of large-scale LLM pretraining.
3. **Simpler Parameterization & Initialization**
We replace the globally shared $\lambda$ with a token-specific, head-wise projected $\lambda$. This eliminates the exponential re-parameterization and initialization complexity of $\lambda$ in V1.
## Implementation Details
### Pseudocode
In the script, `h` represents the number of query heads, `h_kv` represents the number of key-value heads, and `d` is the head dimension. The $\lambda$ in DIFF V2 is projected from $X$ for each token and each head.
(For simplicity, we omit the batch dimension and assume that both the input and output of the following `flash_attn_func` are three-dimensional tensors `(tokens, heads, head dimension)`. Heads belonging to the same GQA group are arranged contiguously in the output.)
```python
def DiffAttnV2(
q, k, v, lam
):
"""
q: (N, 2h, d)
k: (N, h_kv, d)
v: (N, h_kv, d)
lam: (N, h, 1)
"""
attn = flash_attn_func(q, k, v)
attn1, attn2 = (attn[:, 0::2],
attn[:, 1::2])
lam_val = sigmoid(lam)
attn = attn1 - lam_val * attn2
return attn
```
### Note
DIFF V2 subtracts two heads that are **in the same GQA group, which means they share the same key and value**.
```python
# Subtraction of two heads that are **not** in the same GQA group
# ❌ Wrong Implementation of DIFF V2!
...
attn = flash_attn_func(q, k, v)
nh = attn.size(1)
attn1, attn2 = (attn[:, :nh//2],
attn[:, nh//2:])
# similarly, also wrong implementation:
# attn1, attn2 = attn.chunk(2, dim=1)
...
```
```python
# DIFF V2: Subtraction of two heads that are **in** the same GQA group
# ✅ Correct Implementation of DIFF V2
...
attn = flash_attn_func(q, k, v)
attn1, attn2 = (attn[:, 0::2],
attn[:, 1::2])
...
```
================================================
FILE: Diff-Transformer/Diff-Transformer-V2/multihead_flashdiffv2.py
================================================
import torch
from torch import nn
from typing import Optional, Tuple
from ..kernel.rotary import apply_rotary_emb
from flash_attn import flash_attn_func
@torch.compile
def diff_func(attn1: torch.Tensor, attn2: torch.Tensor, lambda_val: torch.Tensor) -> torch.Tensor:
    """
    Combine two attention outputs into a differential attention output.

    Args:
        attn1: First attention output.
        attn2: Second attention output, same shape as ``attn1``.
        lambda_val: Raw (pre-sigmoid) lambda logits with one fewer trailing
            dimension than ``attn1``/``attn2``.
    Returns:
        ``attn1 - sigmoid(lambda_val)[..., None] * attn2``
    """
    # Squash the logits into (0, 1) and append a trailing axis so the gate
    # broadcasts over the last dimension of the attention outputs.
    gate = lambda_val.sigmoid().unsqueeze(-1)
    suppressed = gate * attn2
    return attn1 - suppressed
class MultiheadFlashDiffV2(nn.Module):
    """
    Differential Attention Version 2 (DiffAttnV2) implementation using Flash Attention.

    With ``use_diff_v2=True`` the module projects ``2 * num_heads`` query heads,
    runs a single causal ``flash_attn_func`` call, and subtracts each odd-indexed
    output head (scaled by a per-token, per-head sigmoid gate) from the preceding
    even-indexed one. With ``use_diff_v2=False`` it is a plain causal attention
    layer with ``num_heads`` query heads and no lambda projection.
    """

    def __init__(
        self,
        use_diff_v2: bool,  # If False, acts as a baseline Transformer attention
        d_model: int,  # Model dimension
        num_heads: int,  # Number of output heads
        num_kv_heads: Optional[int],  # Number of KV heads for GQA
        head_dim: int,  # Dimension per head
    ):
        """
        Args:
            use_diff_v2: Enable differential attention; otherwise baseline attention.
            d_model: Input/output feature dimension.
            num_heads: Number of output heads (query heads are doubled in DIFF V2).
            num_kv_heads: Number of key/value heads; ``None`` means ``num_heads``.
            head_dim: Dimension of each head.
        Raises:
            ValueError: If ``num_kv_heads`` is not positive or does not divide
                ``num_heads``.
        """
        super().__init__()
        self.use_diff_v2 = use_diff_v2
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads if num_kv_heads is not None else num_heads
        self.head_dim = head_dim

        # The subtracted pair (2i, 2i + 1) must share the same key/value head.
        # That only holds for every pair when each GQA group contains an even
        # number of query heads, i.e. when num_heads is a multiple of
        # num_kv_heads. Otherwise (e.g. 3 heads, 2 KV heads) a pair would
        # silently straddle two groups.
        if self.num_kv_heads <= 0 or self.num_heads % self.num_kv_heads != 0:
            raise ValueError(
                f"num_heads ({self.num_heads}) must be a positive multiple of "
                f"num_kv_heads ({self.num_kv_heads})"
            )

        # DIFF V2 spends extra parameters on a second set of query heads.
        self.num_q_heads = 2 * self.num_heads if self.use_diff_v2 else self.num_heads

        self.q_proj = nn.Linear(self.d_model, self.num_q_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.d_model, self.num_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.d_model, self.num_kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.d_model, bias=False)
        # One lambda logit per token and per output head; unused in baseline mode.
        self.lambda_proj = nn.Linear(self.d_model, self.num_heads, bias=False) if self.use_diff_v2 else None

    def forward(
        self,
        x: torch.Tensor,  # Input tensor [bsz, seq_len, d_model]
        rel_pos: Tuple[torch.Tensor, torch.Tensor],  # Rotary embedding (cos, sin)
    ) -> torch.Tensor:
        """
        Forward pass for MultiheadFlashDiffV2.

        Args:
            x: Input hidden states of shape [batch, length, d_model]
            rel_pos: Tuple of (cos, sin) tensors for rotary positional embeddings
        Returns:
            Output tensor of shape [batch, length, d_model]
        """
        bsz, tgt_len, _ = x.size()
        # Self-attention: keys/values come from the same sequence as the queries.
        src_len = tgt_len

        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        q = q.view(bsz, tgt_len, self.num_q_heads, self.head_dim)
        k = k.view(bsz, src_len, self.num_kv_heads, self.head_dim)
        v = v.view(bsz, src_len, self.num_kv_heads, self.head_dim)

        q = apply_rotary_emb(q, *rel_pos, interleaved=True)
        k = apply_rotary_emb(k, *rel_pos, interleaved=True)

        attn = flash_attn_func(q, k, v, causal=True)

        if self.use_diff_v2:
            lambda_val = self.lambda_proj(x)
            # Adjacent heads (even, odd) form one differential pair; slicing by
            # halves instead would pair heads from different GQA groups.
            attn1, attn2 = attn[:, :, 0::2], attn[:, :, 1::2]
            attn = diff_func(attn1, attn2, lambda_val)

        attn = attn.reshape(bsz, tgt_len, self.num_heads * self.head_dim)
        output = self.o_proj(attn)
        return output
================================================
FILE: Diff-Transformer/README.md
================================================
# Differential Transformer
## Approach
## Contents
`multihead_diffattn.py` contains a naive implementation of multi-head differential attention.
`multihead_flashdiff_1.py` contains multi-head differential attention implemented with FlashAttention, for packages that support different qk/v dimensions (e.g., our [customized-flash-attention](https://aka.ms/flash-diff) and [xformers](https://github.com/facebookresearch/xformers)). **(Recommended for faster training and inference)**
`multihead_flashdiff_2.py` contains multi-head differential attention implemented with FlashAttention, for packages that **do not** support different qk/v dimensions (e.g., [flash-attention](https://github.com/Dao-AILab/flash-attention)).
`multihead_attention.py` contains an implementation of conventional multi-head attention.
`example.py` instantiates differential attention and conventional attention in pairs, so that they can be compared against each other.
Also refer to [PR](https://github.com/microsoft/unilm/pull/1633) for another implementation.
We recommend using models with a sufficiently large number of heads to minimize the impact of halving heads. For instance, using Diff Transformer with more than 8 heads (the minimum used in the paper, with the same number of parameters as Transformer with 16 heads) is advisable.
## Core Code
================================================
FILE: Diff-Transformer/example.py
================================================
from multihead_diffattn import MultiheadDiffAttn
from multihead_attention import MultiheadAttention
if __name__ == "__main__":
    def _num_params(module):
        """Return the total element count over all parameters of ``module``."""
        return sum(p.numel() for p in module.parameters())

    # --- MHA pairing ---------------------------------------------------------
    # Differential attention: 1024 embed_dim, 8 heads, 8 kv_heads ...
    diff_attn_mha = MultiheadDiffAttn(embed_dim=1024, depth=0, num_heads=8, num_kv_heads=None)
    # ... versus the baseline it is compared against: 1024 embed_dim, 16 heads, 16 kv_heads.
    attn_mha = MultiheadAttention(embed_dim=1024, depth=0, num_heads=16, num_kv_heads=None)
    # Report the parameter count of each single layer.
    print("Number of parameters in 1 layer diff_attn_mha:", _num_params(diff_attn_mha))
    print("Number of parameters in 1 layer attn_mha:", _num_params(attn_mha))

    # --- GQA pairing ---------------------------------------------------------
    # Differential attention: 1024 embed_dim, 8 heads, 4 kv_heads ...
    diff_attn_gqa = MultiheadDiffAttn(embed_dim=1024, depth=0, num_heads=8, num_kv_heads=4)
    # ... versus the baseline it is compared against: 1024 embed_dim, 16 heads, 8 kv_heads.
    attn_gqa = MultiheadAttention(embed_dim=1024, depth=0, num_heads=16, num_kv_heads=8)
    print("Number of parameters in 1 layer diff_attn_gqa:", _num_params(diff_attn_gqa))
    print("Number of parameters in 1 layer attn_gqa:", _num_params(attn_gqa))
================================================
FILE: Diff-Transformer/kernel/rotary.py
================================================
# Copyright (c) 2023, Tri Dao.
from typing import Optional, Union
import torch
import triton
import triton.language as tl
# @triton.autotune(
# configs=[
# triton.Config({"BLOCK_M": 2}),
# triton.Config({"BLOCK_M": 4}),
# triton.Config({"BLOCK_M": 8}),
# triton.Config({"BLOCK_M": 16}),
# ],
# key=["CACHE_KEY_SEQLEN", "BLOCK_K", "INTERLEAVED"],
# )
@triton.jit
def rotary_kernel(
OUT, # Pointers to matrices
X,
COS,
SIN,
CU_SEQLENS,
SEQLEN_OFFSETS, # this could be int or a pointer
# Matrix dimensions
seqlen,
nheads,
rotary_dim,
seqlen_ro,
CACHE_KEY_SEQLEN,
# strides
stride_out_batch,
stride_out_seqlen,
stride_out_nheads,
stride_out_headdim,
stride_x_batch,
stride_x_seqlen,
stride_x_nheads,
stride_x_headdim,
# Meta-parameters
BLOCK_K: tl.constexpr,
IS_SEQLEN_OFFSETS_TENSOR: tl.constexpr,
IS_VARLEN: tl.constexpr,
INTERLEAVED: tl.constexpr,
CONJUGATE: tl.constexpr,
BLOCK_M: tl.constexpr,
):
# Triton kernel that applies a rotary embedding to X and stores the result in OUT.
#
# Work decomposition: program axis 0 selects a tile of BLOCK_M consecutive
# sequence positions, axis 1 selects the batch entry and axis 2 selects the head.
#
# Notes on the arguments, as used in this body:
#   - COS / SIN are indexed with a row stride of rotary_dim // 2, and rows at or
#     beyond seqlen_ro are masked out of the loads.
#   - CU_SEQLENS is only read when IS_VARLEN is set: entry pid_batch gives the
#     first row of the sequence and the next entry gives its end.
#   - SEQLEN_OFFSETS shifts the row used to index COS / SIN; it is added directly
#     when IS_SEQLEN_OFFSETS_TENSOR is false and loaded per batch entry otherwise.
#   - nheads and CACHE_KEY_SEQLEN are not read inside the kernel body
#     (CACHE_KEY_SEQLEN only appears as a key in the commented-out autotune
#     decorator above this kernel).
#   - All loads are converted to float32 before the arithmetic.
pid_m = tl.program_id(axis=0)
pid_batch = tl.program_id(axis=1)
pid_head = tl.program_id(axis=2)
rotary_dim_half = rotary_dim // 2
# Advance X / OUT to the first row of this (batch, head) slice.
if not IS_VARLEN:
X = X + pid_batch * stride_x_batch + pid_head * stride_x_nheads
OUT = OUT + pid_batch * stride_out_batch + pid_head * stride_out_nheads
else:
# Variable-length layout: the per-sequence start and length come from CU_SEQLENS,
# and the local seqlen overrides the argument for the bounds checks below.
start_idx = tl.load(CU_SEQLENS + pid_batch)
seqlen = tl.load(CU_SEQLENS + pid_batch + 1) - start_idx
X = X + start_idx * stride_x_seqlen + pid_head * stride_x_nheads
OUT = OUT + start_idx * stride_out_seqlen + pid_head * stride_out_nheads
# Nothing to do if this whole tile lies past the end of the sequence.
if pid_m * BLOCK_M >= seqlen:
return
# rm: row indices into X / OUT; rm_cs: the corresponding row indices into COS / SIN.
rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
if not IS_SEQLEN_OFFSETS_TENSOR:
rm_cs = rm + SEQLEN_OFFSETS
else:
rm_cs = rm + tl.load(SEQLEN_OFFSETS + pid_batch)
rk = tl.arange(0, BLOCK_K)
rk_half = tl.arange(0, BLOCK_K // 2)
if not INTERLEAVED:
# Load the 1st and 2nd halves of X, do calculation, then store to 1st and 2nd halves of OUT
X = X + (rm[:, None] * stride_x_seqlen + rk_half[None, :] * stride_x_headdim)
COS = COS + (rm_cs[:, None] * rotary_dim_half + rk_half[None, :])
SIN = SIN + (rm_cs[:, None] * rotary_dim_half + rk_half[None, :])
# Masked-out cos/sin entries default to cos=1, sin=0, which leaves x0/x1 unchanged.
cos = tl.load(
COS, mask=(rm_cs[:, None] < seqlen_ro) & (rk_half[None, :] < rotary_dim_half), other=1.0
).to(tl.float32)
sin = tl.load(
SIN, mask=(rm_cs[:, None] < seqlen_ro) & (rk_half[None, :] < rotary_dim_half), other=0.0
).to(tl.float32)
x0 = tl.load(
X, mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half), other=0.0
).to(tl.float32)
x1 = tl.load(
X + rotary_dim_half * stride_x_headdim,
mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half),
other=0.0,
).to(tl.float32)
# Negating sin applies the rotation in the opposite direction.
if CONJUGATE:
sin = -sin
o0 = x0 * cos - x1 * sin
o1 = x0 * sin + x1 * cos
# write back result
OUT = OUT + (rm[:, None] * stride_out_seqlen + rk_half[None, :] * stride_out_headdim)
tl.store(OUT, o0, mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half))
tl.store(
OUT + rotary_dim_half * stride_out_headdim,
o1,
mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half),
)
else:
# We don't want to load X[0, 2, 4, ...] and X[1, 3, 5, ...] separately since both are slow.
# Instead, we load x0 = X[0, 1, 2, 3, ...] and x1 = X[1, 0, 3, 2, ...].
# Loading x0 will be fast but x1 will be slow.
# Then we load cos = COS[0, 0, 1, 1, ...] and sin = SIN[0, 0, 1, 1, ...].
# Then we do the calculation and use tl.where to pick out the right outputs for the even
# and for the odd indices.
rk_swap = rk + ((rk + 1) % 2) * 2 - 1 # 1, 0, 3, 2, 5, 4, ...
rk_repeat = tl.arange(0, BLOCK_K) // 2
X0 = X + (rm[:, None] * stride_x_seqlen + rk[None, :] * stride_x_headdim)
X1 = X + (rm[:, None] * stride_x_seqlen + rk_swap[None, :] * stride_x_headdim)
COS = COS + (rm_cs[:, None] * rotary_dim_half + rk_repeat[None, :])
SIN = SIN + (rm_cs[:, None] * rotary_dim_half + rk_repeat[None, :])
# As in the non-interleaved branch, masked-out entries default to cos=1, sin=0.
cos = tl.load(
COS,
mask=(rm_cs[:, None] < seqlen_ro) & (rk_repeat[None, :] < rotary_dim_half),
other=1.0,
).to(tl.float32)
sin = tl.load(
SIN,
mask=(rm_cs[:, None] < seqlen_ro) & (rk_repeat[None, :] < rotary_dim_half),
other=0.0,
).to(tl.float32)
x0 = tl.load(X0, mask=(rm[:, None] < seqlen) & (rk[None, :] < rotary_dim), other=0.0).to(
tl.float32
)
x1 = tl.load(
X1, mask=(rm[:, None] < seqlen) & (rk_swap[None, :] < rotary_dim), other=0.0
).to(tl.float32)
# Negating sin applies the rotation in the opposite direction.
if CONJUGATE:
sin = -sin
x0_cos = x0 * cos
x1_sin = x1 * sin
# Even columns take x0*cos - x1*sin, odd columns take x0*cos + x1*sin,
# where x1 is the partner element of the same (even, odd) pair.
out = tl.where(rk[None, :] % 2 == 0, x0_cos - x1_sin, x0_cos + x1_sin)
OUT = OUT + (rm[:, None] * stride_out_seqlen + rk[None, :] * stride_out_headdim)
tl.store(OUT, out, mask=(rm[:, None] < seqlen) & (rk[None, :] < rotary_dim))
def apply_rotary(
    x: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    seqlen_offsets: Union[int, torch.Tensor] = 0,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[int] = None,
    interleaved=False,
    inplace=False,
    conjugate=False,
) -> torch.Tensor:
    """
    Validate the inputs, pick block sizes and launch ``rotary_kernel`` on ``x``.

    Arguments:
        x: (batch, seqlen, nheads, headdim) if cu_seqlens is None
            else (total_seqlen, nheads, headdim).
        cos: (seqlen_ro, rotary_dim / 2)
        sin: (seqlen_ro, rotary_dim / 2)
        seqlen_offsets: integer or integer tensor of size (batch,)
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int, required whenever cu_seqlens is given
        interleaved: forwarded to the kernel as its interleaved flag
        inplace: if True the result is written into x, otherwise into a new tensor
        conjugate: forwarded to the kernel, which negates sin when it is set
    Returns:
        y: tensor with the same shape as x (x itself when inplace is True)
    """
    varlen = cu_seqlens is not None
    if varlen:
        assert max_seqlen is not None, "If cu_seqlens is passed in, then max_seqlen must be passed"
        total_seqlen, nheads, headdim = x.shape
        batch = cu_seqlens.shape[0] - 1
        seqlen = max_seqlen
    else:
        batch, seqlen, nheads, headdim = x.shape

    # cos/sin store one value per rotated pair, so the rotary width is twice their last dim.
    seqlen_ro, half_rotary_dim = cos.shape
    assert sin.shape == cos.shape
    rotary_dim = half_rotary_dim * 2
    assert rotary_dim <= headdim, "rotary_dim must be <= headdim"
    assert headdim <= 256, "Only support headdim <= 256"
    assert seqlen_ro >= seqlen, "seqlen_ro must be >= seqlen"
    assert (
        cos.dtype == sin.dtype
    ), f"cos and sin must have the same dtype, got {cos.dtype} and {sin.dtype}"
    assert (
        x.dtype == cos.dtype
    ), f"Input and cos/sin must have the same dtype, got {x.dtype} and {cos.dtype}"

    cos = cos.contiguous()
    sin = sin.contiguous()
    offsets_are_tensor = isinstance(seqlen_offsets, torch.Tensor)
    if offsets_are_tensor:
        assert seqlen_offsets.shape == (batch,)
        assert seqlen_offsets.dtype in [torch.int32, torch.int64]
        seqlen_offsets = seqlen_offsets.contiguous()
    else:
        assert seqlen_offsets + seqlen <= seqlen_ro

    if inplace:
        output = x
    else:
        output = torch.empty_like(x)
        # The kernel only writes the first rotary_dim channels; carry the rest over unchanged.
        if rotary_dim < headdim:
            output[..., rotary_dim:].copy_(x[..., rotary_dim:])

    if rotary_dim <= 32:
        BLOCK_K = 32
    elif rotary_dim <= 64:
        BLOCK_K = 64
    elif rotary_dim <= 128:
        BLOCK_K = 128
    else:
        BLOCK_K = 256
    BLOCK_M = 8 if (not interleaved and rotary_dim <= 64) else 4

    def grid(META):
        return (triton.cdiv(seqlen, META["BLOCK_M"]), batch, nheads)

    # Need this, otherwise Triton tries to launch from cuda:0 and we get
    # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
    with torch.cuda.device(x.device.index):
        rotary_kernel[grid](
            output,  # data ptrs
            x,
            cos,
            sin,
            cu_seqlens,
            seqlen_offsets,
            seqlen,  # shapes
            nheads,
            rotary_dim,
            seqlen_ro,
            seqlen // 128,  # key for triton cache (limit number of compilations)
            0 if varlen else output.stride(0),  # batch_strides if not varlen else 0
            output.stride(-3),  # seqlen_stride or total_seqlen_stride
            output.stride(-2),  # nheads_stride
            output.stride(-1),  # headdim_stride
            0 if varlen else x.stride(0),  # batch_strides if not varlen else 0
            x.stride(-3),  # seqlen stride or total_seqlen_stride
            x.stride(-2),  # nheads stride
            x.stride(-1),  # headdim stride
            BLOCK_K,
            offsets_are_tensor,
            varlen,
            interleaved,
            conjugate,
            BLOCK_M,
        )
    return output
class ApplyRotaryEmb(torch.autograd.Function):
    """Autograd wrapper around ``apply_rotary``.

    The forward pass applies the rotation; the backward pass calls the same
    routine on the incoming gradient with ``conjugate=True``.
    """

    @staticmethod
    def forward(
        ctx,
        x,
        cos,
        sin,
        interleaved=False,
        inplace=False,
        seqlen_offsets: Union[int, torch.Tensor] = 0,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
    ):
        rotated = apply_rotary(
            x,
            cos,
            sin,
            seqlen_offsets=seqlen_offsets,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            interleaved=interleaved,
            inplace=inplace,
        )
        offsets_is_int = isinstance(seqlen_offsets, int)
        if offsets_is_int:
            # Can't save int with save_for_backward, so keep it as a plain ctx attribute.
            ctx.save_for_backward(cos, sin, cu_seqlens)
        else:
            ctx.save_for_backward(cos, sin, cu_seqlens, seqlen_offsets)
        # None marks "the offsets live in saved_tensors" for backward().
        ctx.seqlen_offsets = seqlen_offsets if offsets_is_int else None
        ctx.interleaved = interleaved
        ctx.inplace = inplace
        ctx.max_seqlen = max_seqlen
        return x if inplace else rotated

    @staticmethod
    def backward(ctx, do):
        if ctx.seqlen_offsets is None:
            cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
        else:
            cos, sin, cu_seqlens = ctx.saved_tensors
            seqlen_offsets = ctx.seqlen_offsets
        # TD [2023-09-02]: For some reason Triton (2.0.0.post1) errors with
        # "[CUDA]: invalid device context", and cloning makes it work. Idk why. Triton 2.1.0 works.
        if not (ctx.interleaved or ctx.inplace):
            do = do.clone()
        dx = apply_rotary(
            do,
            cos,
            sin,
            seqlen_offsets=seqlen_offsets,
            cu_seqlens=cu_seqlens,
            max_seqlen=ctx.max_seqlen,
            interleaved=ctx.interleaved,
            inplace=ctx.inplace,
            conjugate=True,
        )
        # One gradient slot per forward() input; only x receives a gradient.
        return (dx,) + (None,) * 7
def apply_rotary_emb(
    x,
    cos,
    sin,
    interleaved=False,
    inplace=False,
    seqlen_offsets: Union[int, torch.Tensor] = 0,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[int] = None,
):
    """
    Differentiable entry point: apply rotary embedding to the first rotary_dim
    channels of x by dispatching to ``ApplyRotaryEmb``.

    Arguments:
        x: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
            else (total_seqlen, nheads, headdim)
        cos, sin: (seqlen_rotary, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        inplace: if True, apply rotary embedding in-place.
        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Return:
        out: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
            else (total_seqlen, nheads, headdim)
    rotary_dim must be <= headdim
    """
    # autograd.Function.apply takes positional arguments only, in forward()'s order.
    rotated = ApplyRotaryEmb.apply(
        x,
        cos,
        sin,
        interleaved,
        inplace,
        seqlen_offsets,
        cu_seqlens,
        max_seqlen,
    )
    return rotated
================================================
FILE: Diff-Transformer/multihead_attention.py
================================================
import math
import torch
import torch.nn.functional as F
from torch import nn
from kernel.rotary import apply_rotary_emb
from flash_attn import flash_attn_func
try:
from apex.normalization import FusedRMSNorm as RMSNorm
except ModuleNotFoundError:
print("No fused RMSNorm")
from rms_norm import RMSNorm
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat every KV head n_rep times; same result as torch.repeat_interleave(x, dim=1, repeats=n_rep)."""
    batch, kv_heads, seq_len, dim = x.shape
    if n_rep != 1:
        # Insert a repeat axis after the head axis, broadcast it, then fold it into the heads.
        widened = x.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, dim)
        return widened.reshape(batch, kv_heads * n_rep, seq_len, dim)
    return x
class MultiheadAttention(nn.Module):
    """Baseline causal multi-head self-attention with rotary embeddings.

    Keys and values may use fewer heads than queries (``num_kv_heads``); they are
    repeated ``n_rep`` times before the attention product.
    """

    def __init__(
        self,
        embed_dim,
        depth,
        num_heads,
        num_kv_heads=None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
        self.n_rep = self.num_heads // self.num_kv_heads
        self.head_dim = embed_dim // num_heads
        self.scaling = self.head_dim ** -0.5

        # K/V projections shrink by n_rep because each KV head is shared by n_rep query heads.
        kv_dim = embed_dim // self.n_rep
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.k_proj = nn.Linear(embed_dim, kv_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, kv_dim, bias=False)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)

    def forward(
        self,
        x,
        rel_pos,
        attn_mask=None,
    ):
        """Attend over x of shape (bsz, tgt_len, embed_dim).

        rel_pos is unpacked as the positional arguments that follow the tensor in
        apply_rotary_emb. When attn_mask is None a causal mask is built.
        """
        bsz, tgt_len, embed_dim = x.size()
        src_len = tgt_len

        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        q = q.view(bsz, tgt_len, self.num_heads, self.head_dim)
        k = k.view(bsz, src_len, self.num_kv_heads, self.head_dim)
        v = v.view(bsz, src_len, self.num_kv_heads, self.head_dim)

        q = apply_rotary_emb(q, *rel_pos, interleaved=True)
        k = apply_rotary_emb(k, *rel_pos, interleaved=True)

        offset = src_len - tgt_len
        # Move heads in front of the sequence axis: (bsz, heads, len, head_dim).
        q = q.transpose(1, 2)
        k = repeat_kv(k.transpose(1, 2), self.n_rep)
        v = repeat_kv(v.transpose(1, 2), self.n_rep)

        q *= self.scaling
        scores = torch.matmul(q, k.transpose(-1, -2))
        if attn_mask is None:
            neg_inf = (
                torch.zeros([tgt_len, src_len])
                .float()
                .fill_(float("-inf"))
                .type_as(scores)
            )
            # Keep -inf strictly above the (offset) diagonal; everything else becomes 0.
            attn_mask = torch.triu(neg_inf, 1 + offset)
        scores = torch.nan_to_num(scores)
        scores += attn_mask
        # Softmax is computed in float32 and cast back to the score dtype.
        probs = F.softmax(scores, dim=-1, dtype=torch.float32).type_as(scores)

        attn = torch.matmul(probs, v)
        attn = attn.transpose(1, 2).reshape(bsz, tgt_len, self.num_heads * self.head_dim)
        return self.out_proj(attn)
================================================
FILE: Diff-Transformer/multihead_diffattn.py
================================================
import math
import torch
import torch.nn.functional as F
from torch import nn
from kernel.rotary import apply_rotary_emb
from flash_attn import flash_attn_func
try:
from apex.normalization import FusedRMSNorm as RMSNorm
except ModuleNotFoundError:
print("No fused RMSNorm")
from rms_norm import RMSNorm
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep) for a 4-D tensor."""
    bs, n_kv_heads, slen, head_dim = x.shape
    if n_rep == 1:
        return x
    # Broadcast a new axis of size n_rep next to the head axis, then merge the two.
    out_shape = (bs, n_kv_heads * n_rep, slen, head_dim)
    repeated = x[:, :, None].expand(bs, n_kv_heads, n_rep, slen, head_dim)
    return repeated.reshape(out_shape)
def lambda_init_fn(depth):
    """Return the depth-dependent initial lambda, 0.8 - 0.6 * exp(-0.3 * depth)."""
    decay = math.exp(-0.3 * depth)
    return 0.8 - 0.6 * decay
class MultiheadDiffAttn(nn.Module):
    """Differential attention computed with explicit attention matrices.

    Two softmax attention maps are produced per head and combined as
    ``map_0 - lambda_full * map_1`` before being applied to the values.
    """

    def __init__(
        self,
        embed_dim,
        depth,  # current layer index
        num_heads,
        num_kv_heads=None,
    ):
        super().__init__()
        self.embed_dim = embed_dim

        # arg num_heads set to half of baseline Transformer's num_heads
        # for e.g., to compare with a baseline Transformer with 16 heads, pass in num_heads=8 for DIFF Transformer
        self.num_heads = num_heads

        # arg num_kv_heads set to half of baseline Transformer's num_kv_heads if use GQA
        # for e.g., to compare with a baseline Transformer with 16 heads and 8 kv_heads,
        # pass in num_heads=8, num_kv_heads=4 for DIFF Transformer
        # if use MHA, pass in num_kv_heads=None
        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
        self.n_rep = self.num_heads // self.num_kv_heads

        # Every head is split into two query/key sub-heads, hence the extra division by 2.
        self.head_dim = embed_dim // num_heads // 2
        self.scaling = self.head_dim ** -0.5

        kv_dim = embed_dim // self.n_rep
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.k_proj = nn.Linear(embed_dim, kv_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, kv_dim, bias=False)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)

        # depth means current layer index
        self.lambda_init = lambda_init_fn(depth)
        # Created in this fixed order so parameter registration and RNG consumption stay stable.
        for param_name in ("lambda_q1", "lambda_k1", "lambda_q2", "lambda_k2"):
            init = torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, std=0.1)
            setattr(self, param_name, nn.Parameter(init))

        self.subln = RMSNorm(2 * self.head_dim, eps=1e-5, elementwise_affine=True)

    def forward(
        self,
        x,
        rel_pos,
        attn_mask=None,
    ):
        """Attend over x of shape (bsz, tgt_len, embed_dim) and return a tensor of the same shape.

        rel_pos is unpacked as the positional arguments that follow the tensor in
        apply_rotary_emb. When attn_mask is None a causal mask is built.
        """
        bsz, tgt_len, embed_dim = x.size()
        src_len = tgt_len

        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        # Queries/keys: two sub-heads of size head_dim per head; values: one head of size 2 * head_dim.
        q = q.view(bsz, tgt_len, 2 * self.num_heads, self.head_dim)
        k = k.view(bsz, src_len, 2 * self.num_kv_heads, self.head_dim)
        v = v.view(bsz, src_len, self.num_kv_heads, 2 * self.head_dim)

        q = apply_rotary_emb(q, *rel_pos, interleaved=True)
        k = apply_rotary_emb(k, *rel_pos, interleaved=True)

        offset = src_len - tgt_len
        q = q.transpose(1, 2)
        k = repeat_kv(k.transpose(1, 2), self.n_rep)
        v = repeat_kv(v.transpose(1, 2), self.n_rep)

        q *= self.scaling
        scores = torch.matmul(q, k.transpose(-1, -2))
        if attn_mask is None:
            neg_inf = (
                torch.zeros([tgt_len, src_len])
                .float()
                .fill_(float("-inf"))
                .type_as(scores)
            )
            attn_mask = torch.triu(neg_inf, 1 + offset)
        scores = torch.nan_to_num(scores)
        scores += attn_mask
        probs = F.softmax(scores, dim=-1, dtype=torch.float32).type_as(scores)

        # lambda_full = exp(<q1, k1>) - exp(<q2, k2>) + lambda_init, a scalar shared by all heads.
        dot_1 = torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1).float()
        dot_2 = torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1).float()
        lambda_1 = torch.exp(dot_1).type_as(q)
        lambda_2 = torch.exp(dot_2).type_as(q)
        lambda_full = lambda_1 - lambda_2 + self.lambda_init

        # Pair up the two maps of each head and take their weighted difference.
        probs = probs.view(bsz, self.num_heads, 2, tgt_len, src_len)
        diff_weights = probs[:, :, 0] - lambda_full * probs[:, :, 1]

        attn = torch.matmul(diff_weights, v)
        attn = self.subln(attn)
        attn = attn * (1 - self.lambda_init)
        attn = attn.transpose(1, 2).reshape(bsz, tgt_len, self.num_heads * 2 * self.head_dim)
        return self.out_proj(attn)
================================================
FILE: Diff-Transformer/multihead_flashdiff_1.py
================================================
import math
import torch
import torch.nn.functional as F
from torch import nn
from kernel.rotary import apply_rotary_emb
from flex_head_fa import flash_attn_func
try:
from apex.normalization import FusedRMSNorm as RMSNorm
except ModuleNotFoundError:
print("No fused RMSNorm")
from rms_norm import RMSNorm
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Duplicate each head along dim 1 n_rep times, like torch.repeat_interleave(x, dim=1, repeats=n_rep)."""
    bs, n_kv_heads, slen, head_dim = x.shape
    if n_rep == 1:
        # Nothing to repeat: hand back the very same tensor object.
        return x
    tiled = x.unsqueeze(2).expand(bs, n_kv_heads, n_rep, slen, head_dim)
    merged_heads = n_kv_heads * n_rep
    return tiled.reshape(bs, merged_heads, slen, head_dim)
def lambda_init_fn(depth):
    """Initial lambda for the layer at index ``depth``: 0.8 - 0.6 * exp(-0.3 * depth)."""
    exponent = -0.3 * depth
    return 0.8 - 0.6 * math.exp(exponent)
class MultiheadFlashDiff1(nn.Module):
    """
    (Recommended)
    DiffAttn implemented with FlashAttention, for packages that support different qk/v dimensions
    e.g., our customized flex_head_fa (https://aka.ms/flash-diff) and xformers (https://github.com/facebookresearch/xformers)
    """

    def __init__(
        self,
        embed_dim,
        depth,  # current layer index
        num_heads,
        num_kv_heads=None,
    ):
        super().__init__()
        self.embed_dim = embed_dim

        # arg num_heads set to half of baseline Transformer's num_heads
        # for e.g., to compare with a baseline Transformer with 16 heads, pass in num_heads=8 for DIFF Transformer
        self.num_heads = num_heads

        # arg num_kv_heads set to half of baseline Transformer's num_kv_heads if use GQA
        # for e.g., to compare with a baseline Transformer with 16 heads and 8 kv_heads,
        # pass in num_heads=8, num_kv_heads=4 for DIFF Transformer
        # if use MHA, pass in num_kv_heads=None
        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
        self.n_rep = self.num_heads // self.num_kv_heads

        self.head_dim = embed_dim // num_heads // 2
        self.scaling = self.head_dim ** -0.5

        kv_dim = embed_dim // self.n_rep
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.k_proj = nn.Linear(embed_dim, kv_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, kv_dim, bias=False)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)

        # depth means current layer index
        self.lambda_init = lambda_init_fn(depth)
        # Created in this fixed order so parameter registration and RNG consumption stay stable.
        for param_name in ("lambda_q1", "lambda_k1", "lambda_q2", "lambda_k2"):
            init = torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, std=0.1)
            setattr(self, param_name, nn.Parameter(init))

        self.subln = RMSNorm(2 * self.head_dim, eps=1e-5, elementwise_affine=True)

    def forward(
        self,
        x,
        rel_pos,
        attn_mask=None,
    ):
        """Attend over x of shape (bsz, tgt_len, embed_dim) and return a tensor of the same shape.

        Both attention calls are made with causal=True; attn_mask is accepted but
        not consulted by this implementation.
        """
        bsz, tgt_len, embed_dim = x.size()
        src_len = tgt_len

        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        q = q.view(bsz, tgt_len, 2 * self.num_heads, self.head_dim)
        k = k.view(bsz, src_len, 2 * self.num_kv_heads, self.head_dim)
        v = v.view(bsz, src_len, self.num_kv_heads, 2 * self.head_dim)

        q = apply_rotary_emb(q, *rel_pos, interleaved=True)
        k = apply_rotary_emb(k, *rel_pos, interleaved=True)

        offset = src_len - tgt_len  # not used further in this variant

        # Regroup so that axis 3 indexes the two sub-heads of every head, then split them.
        q = q.reshape(bsz, tgt_len, self.num_heads, 2, self.head_dim)
        k = k.reshape(bsz, src_len, self.num_kv_heads, 2, self.head_dim)
        q1, q2 = q.unbind(dim=3)
        k1, k2 = k.unbind(dim=3)

        attn1 = flash_attn_func(q1, k1, v, causal=True)
        attn2 = flash_attn_func(q2, k2, v, causal=True)

        # lambda_full = exp(<q1, k1>) - exp(<q2, k2>) + lambda_init
        dot_1 = torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1).float()
        dot_2 = torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1).float()
        lambda_1 = torch.exp(dot_1).type_as(q)
        lambda_2 = torch.exp(dot_2).type_as(q)
        lambda_full = lambda_1 - lambda_2 + self.lambda_init

        attn = attn1 - lambda_full * attn2
        attn = self.subln(attn)
        attn = attn * (1 - self.lambda_init)
        attn = attn.reshape(bsz, tgt_len, self.num_heads * 2 * self.head_dim)
        return self.out_proj(attn)
================================================
FILE: Diff-Transformer/multihead_flashdiff_2.py
================================================
import math
import torch
import torch.nn.functional as F
from torch import nn
from kernel.rotary import apply_rotary_emb
from flash_attn import flash_attn_func
try:
from apex.normalization import FusedRMSNorm as RMSNorm
except ModuleNotFoundError:
print("No fused RMSNorm")
from rms_norm import RMSNorm
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Head-wise repeat matching torch.repeat_interleave(x, dim=1, repeats=n_rep)."""
    bs, n_kv_heads, slen, head_dim = x.shape
    needs_repeat = n_rep != 1
    if not needs_repeat:
        return x
    # A broadcast view over a new axis avoids copying until the final reshape.
    view_5d = x[:, :, None, :, :].expand(bs, n_kv_heads, n_rep, slen, head_dim)
    return view_5d.reshape(bs, n_kv_heads * n_rep, slen, head_dim)
def lambda_init_fn(depth):
    """Map a layer index to its initial lambda: 0.8 - 0.6 * exp(-0.3 * depth)."""
    damping = math.exp(-0.3 * depth)
    lambda_init = 0.8 - 0.6 * damping
    return lambda_init
class MultiheadFlashDiff2(nn.Module):
    """
    DiffAttn implemented with FlashAttention, for packages that does not support different qk/v dimensions
    e.g., flash-attention (https://github.com/Dao-AILab/flash-attention)
    """

    def __init__(
        self,
        embed_dim,
        depth,  # current layer index
        num_heads,
        num_kv_heads=None,
    ):
        super().__init__()
        self.embed_dim = embed_dim

        # arg num_heads set to half of baseline Transformer's num_heads
        # for e.g., to compare with a baseline Transformer with 16 heads, pass in num_heads=8 for DIFF Transformer
        self.num_heads = num_heads

        # arg num_kv_heads set to half of baseline Transformer's num_kv_heads if use GQA
        # for e.g., to compare with a baseline Transformer with 16 heads and 8 kv_heads,
        # pass in num_heads=8, num_kv_heads=4 for DIFF Transformer
        # if use MHA, pass in num_kv_heads=None
        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
        self.n_rep = self.num_heads // self.num_kv_heads

        self.head_dim = embed_dim // num_heads // 2
        self.scaling = self.head_dim ** -0.5

        kv_dim = embed_dim // self.n_rep
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.k_proj = nn.Linear(embed_dim, kv_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, kv_dim, bias=False)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)

        # depth means current layer index
        self.lambda_init = lambda_init_fn(depth)
        # Created in this fixed order so parameter registration and RNG consumption stay stable.
        for param_name in ("lambda_q1", "lambda_k1", "lambda_q2", "lambda_k2"):
            init = torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, std=0.1)
            setattr(self, param_name, nn.Parameter(init))

        self.subln = RMSNorm(2 * self.head_dim, eps=1e-5, elementwise_affine=True)

    def forward(
        self,
        x,
        rel_pos,
        attn_mask=None,
    ):
        """Attend over x of shape (bsz, tgt_len, embed_dim) and return a tensor of the same shape.

        The value tensor is split into two halves of size head_dim so that every
        attention call sees equal q/k/v head sizes; the halves are concatenated
        afterwards. attn_mask is accepted but not consulted (causal=True is used).
        """
        bsz, tgt_len, embed_dim = x.size()
        src_len = tgt_len

        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        q = q.view(bsz, tgt_len, 2 * self.num_heads, self.head_dim)
        k = k.view(bsz, src_len, 2 * self.num_kv_heads, self.head_dim)
        v = v.view(bsz, src_len, self.num_kv_heads, 2, self.head_dim)

        q = apply_rotary_emb(q, *rel_pos, interleaved=True)
        k = apply_rotary_emb(k, *rel_pos, interleaved=True)

        offset = src_len - tgt_len  # not used further in this variant

        # Regroup so that axis 3 indexes the two sub-heads of every head, then split them.
        q = q.reshape(bsz, tgt_len, self.num_heads, 2, self.head_dim)
        k = k.reshape(bsz, src_len, self.num_kv_heads, 2, self.head_dim)
        q1, q2 = q.unbind(dim=3)
        k1, k2 = k.unbind(dim=3)
        v1, v2 = v.unbind(dim=3)

        # First attention map applied to both value halves.
        attn11 = flash_attn_func(q1, k1, v1, causal=True)
        attn12 = flash_attn_func(q1, k1, v2, causal=True)
        attn1 = torch.cat([attn11, attn12], dim=-1)

        # Second attention map applied to both value halves.
        attn21 = flash_attn_func(q2, k2, v1, causal=True)
        attn22 = flash_attn_func(q2, k2, v2, causal=True)
        attn2 = torch.cat([attn21, attn22], dim=-1)

        # lambda_full = exp(<q1, k1>) - exp(<q2, k2>) + lambda_init
        dot_1 = torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1).float()
        dot_2 = torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1).float()
        lambda_1 = torch.exp(dot_1).type_as(q)
        lambda_2 = torch.exp(dot_2).type_as(q)
        lambda_full = lambda_1 - lambda_2 + self.lambda_init

        attn = attn1 - lambda_full * attn2
        attn = self.subln(attn)
        attn = attn * (1 - self.lambda_init)
        attn = attn.reshape(bsz, tgt_len, self.num_heads * 2 * self.head_dim)
        return self.out_proj(attn)
================================================
FILE: Diff-Transformer/rms_norm.py
================================================
import torch
import torch.nn as nn
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with an optional learnable scale."""

    def __init__(self, dim: int, eps: float = 1e-6, elementwise_affine=True, memory_efficient=False):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        # memory_efficient is accepted for signature compatibility; nothing here reads it.
        if not self.elementwise_affine:
            self.register_parameter('weight', None)
        else:
            self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        # x / sqrt(mean(x^2) + eps), reduced over the last axis.
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        # Normalise in float32, then cast back to the input dtype before scaling.
        normed = self._norm(x.float()).type_as(x)
        if self.weight is None:
            return normed
        return normed * self.weight

    def extra_repr(self) -> str:
        return f'dim={self.dim}, eps={self.eps}, elementwise_affine={self.elementwise_affine}'
================================================
FILE: LICENSE
================================================
The MIT License (MIT)
Copyright (c) Microsoft Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: LatentLM/README.md
================================================
# [Multimodal Latent Language Modeling with Next-Token Diffusion]
Official PyTorch implementation and pretrained models of LatentLM.
---
## Setup & Usage
Coming soon!
## License
This project is licensed under the license found in the LICENSE file in the root directory of this source tree.
[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
### Contact Information
For help or issues using LatentLM models, please submit a GitHub issue.
For other communications, please contact [Li Dong](https://dong.li/) (`lidong1@microsoft.com`), [Furu Wei](http://gitnlp.org/) (`fuwei@microsoft.com`).
================================================
FILE: LatentLM/evaluate_fid.py
================================================
import argparse
import json
import os
import sys
import math
import numpy as np
from tqdm import tqdm
import torch
import torch.distributed as dist
from accelerate.utils import set_seed
from safetensors.torch import load_file
from tokenizer_models import AutoencoderKL, load_vae
from schedule.dpm_solver import DPMSolverMultistepScheduler
from models import All_models
from utils import safe_blob_dump
from metrics import compute_fid_without_store, compute_inception_score_from_tensor
def parse_args():
    """Build the command-line parser for the FID / Inception Score evaluation and parse sys.argv.

    Returns:
        argparse.Namespace holding every option defined below. Flags spelled with
        hyphens (``--num-classes``, ``--cfg-scale``) are exposed as ``num_classes``
        and ``cfg_scale``.
    """
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="A seed to use for the random number generator. Can be negative to not set a seed.",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="Transformer-L",
        help="The config of the UNet model to train, leave as None to use standard DDPM configuration.",
    )
    parser.add_argument(
        "--vae",
        type=str,
        default=None,
    )
    parser.add_argument(
        "--train_data_dir",
        type=str,
        default="/tmp/ILSVRC/Data/CLS-LOC/train",
        help=(
            "A folder containing the training data. Folder contents must follow the structure described in"
            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
        ),
    )
    parser.add_argument(
        "--ref_stat_path",
        type=str,
        default="/mnt/unilm/hangbo/beit3/t2i/assets/fid_stats/imagenet_256_val.npz",
    )
    parser.add_argument(
        "--image_size",
        type=int,
        default=256,
        help=(
            "The image_size for input images, all the images in the train/validation dataset will be resized to this"
            " image_size"
        ),
    )
    parser.add_argument("--num-classes", type=int, default=1000)
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default="no",
        choices=["no", "fp16", "bf16"],
        # Adjacent literals are concatenated verbatim, so each fragment carries its own
        # leading space; the original rendered as "Choosebetween ... 1.10.and an ...".
        help=(
            "Whether to use mixed precision. Choose"
            " between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10"
            " and an Nvidia Ampere GPU."
        ),
    )
    parser.add_argument(
        "--batch_size", type=int, default=32, help="Batch size (per device) for the training dataloader."
    )
    parser.add_argument(
        "--steps_per_class", type=int, default=50, help="Number of steps per class."
    )
    parser.add_argument("--force_diffusion", action="store_true", help="Whether to force the use of diffusion models.")
    parser.add_argument("--use_ema", action="store_true", help="Whether to use Exponential Moving Average for the final model weights.")
    parser.add_argument("--ddpm_num_steps", type=int, default=1000)
    parser.add_argument("--ddpm_num_inference_steps", type=int, default=250)
    parser.add_argument("--ddpm_beta_schedule", type=str, default="cosine", help="The beta schedule to use for DDPM.")
    parser.add_argument("--prediction_type", type=str, default="epsilon", help="Whether the model should predict the 'epsilon'/noise error or directly the reconstructed image 'x0'.")
    parser.add_argument("--cfg-scale", type=float, default=4.0)
    parser.add_argument(
        "--checkpoint",
        type=str,
        default=None,
        # The previous text described resuming training and a "latest" shortcut,
        # neither of which this evaluation script implements.
        help=(
            "Path to the checkpoint directory to evaluate. It must contain `other_state.pth` and the model"
            " weights (`model.safetensors` or a `pytorch_model` folder); cached latents and results are"
            " written to the same directory."
        ),
    )
    args = parser.parse_args()
    return args
def suppress_output(rank):
    """Suppress output for all processes except the one with rank 0."""
    if rank == 0:
        return
    # Every other rank gets its stdout rebound to the null device.
    sys.stdout = open(os.devnull, 'w')
@torch.no_grad()
def main(args):
    """Sample latents for every class, decode them and report FID / Inception Score.

    Steps:
      1. Initialise the ``gloo`` process group and silence stdout on non-zero ranks.
      2. Load the VAE and ``other_state.pth`` (scaling/bias factors, optional EMA weights)
         from ``args.checkpoint``.
      3. Reuse ``latent_<exp_name>.pth`` from the checkpoint directory when it exists and
         ``--force_diffusion`` is not set; otherwise build the model, load its weights,
         sample this rank's share of the classes and all-gather the latents.
      4. On rank 0 only: decode the latents, compute FID and Inception Score, and write
         ``result_<exp_name>.json`` and ``images_<exp_name>.npz`` next to the checkpoint.

    Arguments:
        args: namespace produced by ``parse_args``.
    Raises:
        ValueError: if ``args.checkpoint`` is None.
        FileNotFoundError: if weights have to be loaded from disk but neither
            ``model.safetensors`` nor ``pytorch_model`` exists in the checkpoint directory.
    """
    # Everything below joins paths onto args.checkpoint; fail with a clear message instead
    # of the TypeError that os.path.join(None, ...) would raise later on.
    if args.checkpoint is None:
        raise ValueError("--checkpoint is required: pass the checkpoint directory to evaluate.")

    set_seed(args.seed)
    dist.init_process_group(backend="gloo", init_method='env://')
    rank = dist.get_rank()
    suppress_output(rank)
    print(args)
    device = f"cuda:{rank}" if torch.cuda.is_available() else "cpu"

    if args.mixed_precision == "bf16":
        dtype = torch.bfloat16
    elif args.mixed_precision == "fp16":
        dtype = torch.float16
    else:
        dtype = torch.float32

    prefix = "ema" if args.use_ema else "standard"
    exp_name = f"{prefix}_{args.steps_per_class}_{args.cfg_scale}_{args.ddpm_beta_schedule}_{args.ddpm_num_inference_steps}"
    print(f"Exp_name {exp_name}")

    vae, input_size, latent_size, flatten_input = load_vae(args.vae, args.image_size)
    vae.eval()

    other_state = torch.load(os.path.join(args.checkpoint, "other_state.pth"), map_location="cpu")
    scaling_factor = other_state["scaling_factor"]
    bias_factor = other_state["bias_factor"]
    print(f"Scaling factor: {scaling_factor}, Bias factor: {bias_factor}")

    # Potentially load in the weights and states from a previous save
    latent_path = os.path.join(args.checkpoint, f"latent_{exp_name}.pth")
    if os.path.exists(latent_path) and not args.force_diffusion:
        all_latent_gather = torch.load(latent_path)
        print("Loaded latent from file.")
    else:
        model = All_models[args.model](
            input_size=input_size,
            in_channels=latent_size,
            num_classes=args.num_classes,
            flatten_input=flatten_input,
        ).to(device).to(dtype)
        noise_scheduler = DPMSolverMultistepScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule, prediction_type=args.prediction_type)
        model.eval()

        if args.checkpoint:
            if args.use_ema and other_state["ema"] is not None:
                # EMA weights are stored as a list aligned with model.parameters().
                checkpoint = other_state["ema"]["shadow_params"]
                for model_param, ema_param in zip(model.parameters(), checkpoint):
                    model_param.data = ema_param.data.to(device).to(dtype)
                print(f"Loaded model from checkpoint {args.checkpoint}, EMA applied.")
            else:
                safetensors_path = os.path.join(args.checkpoint, "model.safetensors")
                state_dir = os.path.join(args.checkpoint, "pytorch_model")
                if os.path.exists(safetensors_path):
                    checkpoint = load_file(safetensors_path)
                elif os.path.exists(state_dir):
                    checkpoint = torch.load(os.path.join(state_dir, "mp_rank_00_model_states.pt"), map_location="cpu")["module"]
                else:
                    # Previously `checkpoint` stayed unbound here and load_state_dict
                    # failed with an UnboundLocalError that hid the real cause.
                    raise FileNotFoundError(
                        f"No model weights found in {args.checkpoint}: expected "
                        "'model.safetensors' or a 'pytorch_model' directory."
                    )
                model.load_state_dict(checkpoint)
                print(f"Loaded model from checkpoint {args.checkpoint}.")

        def p_sample(model, image):
            # Run the full reverse process with the configured number of inference steps.
            noise_scheduler.set_timesteps(args.ddpm_num_inference_steps)
            for t in noise_scheduler.timesteps:
                model_output = model(image, t.repeat(image.shape[0]).to(image))
                image = noise_scheduler.step(model_output, t, image).prev_sample
            return image

        all_latent = []
        # Each rank handles a contiguous, equally sized slice of the class ids.
        class_start, class_end = args.num_classes // dist.get_world_size() * rank, args.num_classes // dist.get_world_size() * (rank + 1)
        classes = torch.arange(class_start, class_end, device=device).repeat(args.steps_per_class)
        classes = classes.chunk(math.ceil(classes.size(0) / args.batch_size))
        for y in tqdm(classes, disable=rank != 0):
            # The extra id num_classes is appended as the second half of the batch for CFG.
            y_null = torch.full_like(y, args.num_classes, device=device)
            y = torch.cat([y, y_null], 0)
            # Sample images:
            samples = model.sample_with_cfg(y, args.cfg_scale, p_sample)
            all_latent.append(samples.float().cpu())

        all_latent = torch.cat(all_latent, 0)
        all_latent_gather = [torch.zeros_like(all_latent) for _ in range(dist.get_world_size())]
        dist.all_gather(all_latent_gather, all_latent)
        all_latent_gather = torch.cat(all_latent_gather, 0)
        if rank == 0:
            torch.save(all_latent_gather, latent_path)

    if rank == 0:
        all_images = torch.zeros((all_latent_gather.size(0), 3, 256, 256))
        # Metrics are computed at 256x256; other resolutions are resized bilinearly.
        if args.image_size != 256:
            transform = torch.nn.Upsample(size=(256, 256), mode="bilinear")
        else:
            transform = torch.nn.Identity()

        idx = 0
        for samples in tqdm(all_latent_gather.chunk(math.ceil(all_latent_gather.size(0) / args.batch_size))):
            images = vae.decode(samples.to(device).to(dtype) / scaling_factor - bias_factor)
            images = transform(images)
            # Map the decoder output from [-1, 1] to [0, 1].
            images = (torch.clamp(images.float(), -1.0, 1.0) * 0.5 + 0.5).cpu().float()
            all_images[idx:idx + images.shape[0]] = images
            idx += images.shape[0]

        print(all_images.shape)
        fid_score = compute_fid_without_store(all_images, args.ref_stat_path, batch_size=args.batch_size, device=device)
        print(fid_score)
        IS_mean, IS_std = compute_inception_score_from_tensor(
            all_images,
            batch_size=args.batch_size,
            device=device,
        )
        print(IS_mean, IS_std)

        result_path = os.path.join(args.checkpoint, f"result_{exp_name}.json")
        result = {
            "fid": fid_score.item(),
            "IS_mean": IS_mean.item(),
            "IS_std": IS_std.item(),
        }
        safe_blob_dump(result_path, result)

        image_path = os.path.join(args.checkpoint, f"images_{exp_name}.npz")
        all_images = (all_images * 255.0).clamp(0, 255).to(torch.uint8).permute(0, 2, 3, 1).numpy()
        np.savez_compressed(image_path, all_images)
if __name__ == "__main__":
    # Script entry point: parse the command-line flags, then run the evaluation.
    args = parse_args()
    main(args)
================================================
FILE: LatentLM/evaluate_fid_fidelity.py
================================================
import argparse
import json
import os
import numpy as np
import torch
from torchvision import transforms
from torchvision.datasets import ImageFolder
import torch_fidelity
from utils import center_crop_arr, safe_blob_write
def parse_args():
    """Declare this script's command-line options and parse ``sys.argv``.

    The options are kept in a table of ``(flag, keyword-arguments)`` pairs and
    registered on the parser in declaration order.

    Returns:
        argparse.Namespace: the parsed options.
    """
    cli = argparse.ArgumentParser(description="Simple example of a training script.")

    option_table = [
        ("--seed", dict(
            type=int,
            default=0,
            help="A seed to use for the random number generator. Can be negative to not set a seed.",
        )),
        ("--model", dict(
            type=str,
            default="Transformer-L",
            help="The config of the UNet model to train, leave as None to use standard DDPM configuration.",
        )),
        ("--vae", dict(type=str, default=None)),
        ("--train_data_dir", dict(
            type=str,
            default="/tmp/ILSVRC/Data/CLS-LOC/train",
            help="A folder containing the training data.",
        )),
        ("--ref_stat_path", dict(
            type=str,
            default="/mnt/unilm/hangbo/beit3/t2i/assets/fid_stats/imagenet_256_val.npz",
        )),
        ("--image_size", dict(
            type=int,
            default=256,
            help=(
                "The image_size for input images, all the images in the train/validation dataset will be resized to this"
                " image_size"
            ),
        )),
        ("--batch_size", dict(
            type=int, default=32, help="Batch size (per device) for the training dataloader."
        )),
        ("--steps_per_class", dict(
            type=int, default=50, help="Number of steps per class."
        )),
        ("--use_ema", dict(
            action="store_true",
            help="Whether to use Exponential Moving Average for the final model weights.",
        )),
        ("--ddpm_num_steps", dict(type=int, default=1000)),
        ("--ddpm_num_inference_steps", dict(type=int, default=250)),
        ("--ddpm_beta_schedule", dict(
            type=str, default="cosine", help="The beta schedule to use for DDPM."
        )),
        ("--prediction_type", dict(
            type=str,
            default="epsilon",
            help="Whether the model should predict the 'epsilon'/noise error or directly the reconstructed image 'x0'.",
        )),
        # argparse stores this one as ``cfg_scale`` (dash becomes underscore).
        ("--cfg-scale", dict(type=float, default=4.0)),
        ("--checkpoint", dict(
            type=str,
            default=None,
            help=(
                "Whether training should be resumed from a previous checkpoint. Use a path saved by"
                ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
            ),
        )),
    ]
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    return cli.parse_args()
class ImageDataset(torch.utils.data.Dataset):
    """Minimal map-style dataset that serves items of an indexable container as-is."""

    def __init__(self, images):
        # Any object supporting len() and integer indexing works here.
        self.images = images

    def __len__(self):
        """Number of stored items."""
        stored = self.images
        return len(stored)

    def __getitem__(self, idx):
        """Return the item at position ``idx`` without any transformation."""
        stored = self.images
        return stored[idx]
class RefImageDataset(torch.utils.data.Dataset):
    """Adapter that exposes only the image of each ``(image, ...)`` sample.

    Each image is converted to a NumPy array and returned as a tensor whose
    last axis has been moved to the front (``permute(2, 0, 1)``).
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        """Same length as the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Fetch sample ``idx``, drop everything but its first element, and reorder axes."""
        image = self.dataset[idx][0]
        # np.array (not asarray) so the result never aliases the source object.
        as_tensor = torch.from_numpy(np.array(image))
        return as_tensor.permute(2, 0, 1)
@torch.no_grad()
def main(args):
    """Compute Inception Score and FID with ``torch_fidelity`` for saved samples.

    The generated images are read from ``images_<exp_name>.npz`` inside
    ``args.checkpoint`` (array key ``arr_0``) and compared against the image
    folder at ``args.train_data_dir``, center-cropped to ``args.image_size``.
    The resulting metrics dictionary is printed.

    Args:
        args: namespace produced by ``parse_args``.

    Raises:
        ValueError: if ``--checkpoint`` was not provided. Without this check
            ``os.path.join(None, ...)`` fails with an unhelpful ``TypeError``.
    """
    if args.checkpoint is None:
        raise ValueError(
            "--checkpoint is required: it must be the directory that contains the images_<exp_name>.npz file"
        )

    prefix = "ema" if args.use_ema else "standard"
    exp_name = f"{prefix}_{args.steps_per_class}_{args.cfg_scale}_{args.ddpm_beta_schedule}_{args.ddpm_num_inference_steps}"
    print(f"Exp_name {exp_name}")

    image_path = os.path.join(args.checkpoint, f"images_{exp_name}.npz")
    print(f"Computing fidelity metrics from {image_path}...")
    # Close the archive as soon as the array has been materialised.
    with np.load(image_path) as archive:
        images = archive["arr_0"]
    # Move the last axis to position 1 (permute(0, 3, 1, 2)).
    images = torch.from_numpy(images).permute(0, 3, 1, 2)
    print(images.shape)

    dataset = ImageDataset(images)
    ref_dataset = ImageFolder(args.train_data_dir, transform=transforms.Lambda(lambda pil_image: center_crop_arr(pil_image, args.image_size)))
    ref_dataset = RefImageDataset(ref_dataset)

    metrics_dict = torch_fidelity.calculate_metrics(
        input1=dataset,
        input2=ref_dataset,
        batch_size=args.batch_size,
        cuda=True,
        isc=True,
        fid=True,
        kid=False,
        prc=False,
        save_cpu_ram=True,
        verbose=True,
    )
    print(metrics_dict)

    # Disabled precision/recall pass, kept for reference:
    # metrics_dict = torch_fidelity.calculate_metrics(
    #     input1=dataset,
    #     input2=ref_dataset,
    #     batch_size=args.batch_size,
    #     cuda=True,
    #     prc=True,
    #     prc_batch_size=args.batch_size,
    #     save_cpu_ram=True,
    #     verbose=True,
    # )
    # print(metrics_dict)
# Script entry point: only runs when the file is executed directly (not on import).
# Parses the command-line options and hands them to main().
if __name__ == "__main__":
    args = parse_args()
    main(args)
================================================
FILE: LatentLM/inference_speed.py
================================================
import argparse
import json
import os
import sys
import time
import torch
from tqdm import tqdm
from accelerate.utils import set_seed
from tokenizer_models import AutoencoderKL, load_vae
from schedule.dpm_solver import DPMSolverMultistepScheduler
from models import All_models
def parse_args():
    """Declare this script's command-line options and parse ``sys.argv``.

    Note that ``--num-classes`` and ``--cfg-scale`` are stored by argparse as
    ``num_classes`` and ``cfg_scale``.

    Returns:
        argparse.Namespace: the parsed options.
    """
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="A seed to use for the random number generator. Can be negative to not set a seed.",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="Transformer-L",
        help="The config of the model to train, leave as None to use standard DDPM configuration.",
    )
    parser.add_argument(
        "--num_kv_heads",
        type=int,
        default=None,
        help="The number of heads to use in the key/value attention in the model.",
    )
    parser.add_argument(
        "--vae",
        type=str,
        default=None,
    )
    parser.add_argument(
        "--train_data_dir",
        type=str,
        default="/tmp/ILSVRC/Data/CLS-LOC/train",
        help=(
            "A folder containing the training data. Folder contents must follow the structure described in"
            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
        ),
    )
    parser.add_argument(
        "--ref_stat_path",
        type=str,
        default="/mnt/unilm/hangbo/beit3/t2i/assets/fid_stats/imagenet_256_val.npz",
    )
    parser.add_argument(
        "--image_size",
        type=int,
        default=256,
        help=(
            "The image_size for input images, all the images in the train/validation dataset will be resized to this"
            " image_size"
        ),
    )
    parser.add_argument("--num-classes", type=int, default=1000)
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default="no",
        choices=["no", "fp16", "bf16"],
        # The adjacent literals previously lacked separating spaces, so the
        # rendered help read "Choosebetween ..." and "1.10.and an Nvidia ...".
        help=(
            "Whether to use mixed precision. Choose"
            " between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10"
            " and an Nvidia Ampere GPU."
        ),
    )
    parser.add_argument(
        "--batch_size", type=int, default=32, help="Batch size (per device) for the training dataloader."
    )
    parser.add_argument(
        "--steps_per_class", type=int, default=50, help="Number of steps per class."
    )
    parser.add_argument("--force_diffusion", action="store_true", help="Whether to force the use of diffusion models.")
    parser.add_argument("--use_ema", action="store_true", help="Whether to use Exponential Moving Average for the final model weights.")
    parser.add_argument("--ddpm_num_steps", type=int, default=1000)
    parser.add_argument("--ddpm_num_inference_steps", type=int, default=250)
    parser.add_argument("--ddpm_beta_schedule", type=str, default="cosine", help="The beta schedule to use for DDPM.")
    parser.add_argument("--prediction_type", type=str, default="epsilon", help="Whether the model should predict the 'epsilon'/noise error or directly the reconstructed image 'x0'.")
    parser.add_argument("--cfg-scale", type=float, default=4.0)
    parser.add_argument(
        "--checkpoint",
        type=str,
        default=None,
        help=(
            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
        ),
    )
    args = parser.parse_args()
    return args
def suppress_output(rank):
    """Silence ``sys.stdout`` on every process whose rank is not 0.

    Rank 0 is left untouched; any other rank has ``sys.stdout`` rebound to a
    writable handle on ``os.devnull``.
    """
    if rank == 0:
        return
    sys.stdout = open(os.devnull, 'w')
@torch.no_grad()
def main(args):
    """Measure sampling throughput of a randomly initialised model.

    Builds the VAE (only its size information is used) and the model selected by
    ``args.model``, then runs ``model.sample_with_cfg`` five times with random
    class labels and prints the elapsed time and images per second.

    CUDA kernels are launched asynchronously, so the device is synchronised
    right before the clock is started and again before it is stopped;
    otherwise work still queued on the GPU would fall outside the timed window.

    Args:
        args: namespace produced by ``parse_args``.
    """
    set_seed(args.seed)
    print(args)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if args.mixed_precision == "bf16":
        dtype = torch.bfloat16
    elif args.mixed_precision == "fp16":
        dtype = torch.float16
    else:
        dtype = torch.float32

    prefix = "ema" if args.use_ema else "standard"
    exp_name = f"{prefix}_{args.steps_per_class}_{args.cfg_scale}_{args.ddpm_beta_schedule}_{args.ddpm_num_inference_steps}"
    print(f"Exp_name {exp_name}")

    vae, input_size, latent_size, flatten_input = load_vae(args.vae, args.image_size)
    vae.eval()

    # No checkpoint is loaded: throughput does not depend on the weight values.
    model = All_models[args.model](
        input_size=input_size,
        in_channels=latent_size,
        num_kv_heads=args.num_kv_heads,
        num_classes=args.num_classes,
        flatten_input=flatten_input,
    ).to(device).to(dtype)
    noise_scheduler = DPMSolverMultistepScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule, prediction_type=args.prediction_type)
    model.eval()

    def p_sample(model, image):
        """Run the full scheduler loop on ``image`` and return the final sample."""
        noise_scheduler.set_timesteps(args.ddpm_num_inference_steps)
        for t in noise_scheduler.timesteps:
            # The timestep is broadcast to the batch and cast to the image's dtype/device.
            model_output = model(image, t.repeat(image.shape[0]).to(image))
            image = noise_scheduler.step(model_output, t, image).prev_sample
        return image

    def wait_for_device():
        """Block until all queued CUDA work has finished (no-op on CPU)."""
        if device == "cuda":
            torch.cuda.synchronize()

    wait_for_device()
    start = time.time()
    for _ in tqdm(range(5)):
        y = torch.randint(0, args.num_classes, (args.batch_size,)).to(device)
        # Index ``num_classes`` serves as the unconditional label for classifier-free guidance.
        y_null = torch.full_like(y, args.num_classes, device=device)
        y = torch.cat([y, y_null], 0)
        # Sample images:
        samples = model.sample_with_cfg(y, args.cfg_scale, p_sample)
    wait_for_device()
    end = time.time()

    print(args.model, args.batch_size)
    print(f"Time taken: {end - start}, FPS: {5 * args.batch_size / (end - start)}")
# Script entry point: only runs when the file is executed directly (not on import).
# Parses the command-line options and hands them to main().
if __name__ == "__main__":
    args = parse_args()
    main(args)
================================================
FILE: LatentLM/metrics/IS.py
================================================
"""Utils for Inception Score calculation.
Borrowed from:
PyTorch StudioGAN: https://github.com/POSTECH-CVLab/PyTorch-StudioGAN
The MIT License (MIT)
See license file or visit https://github.com/POSTECH-CVLab/PyTorch-StudioGAN for details
"""
from pathlib import Path
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from .fid import get_inception_model, create_dataset_from_files
def inception_softmax(inception_model, images):
    """Return the softmax (over dim 1) of ``inception_model.get_logits(images)``.

    Runs without gradient tracking.
    """
    with torch.no_grad():
        class_logits = inception_model.get_logits(images)
        return torch.nn.functional.softmax(class_logits, dim=1)
@torch.no_grad()
def calculate_kl_div(ps, splits: int):
    """Compute the Inception Score from per-sample class probabilities.

    The rows of ``ps`` are cut into ``splits`` consecutive chunks. For each
    chunk the score is ``exp(mean_i KL(p_i || mean_j p_j))``; the mean and the
    standard deviation of the per-chunk scores are returned.

    Args:
        ps: 2-D tensor of probabilities, one row per sample.
        splits: number of consecutive chunks to evaluate.

    Returns:
        ``(mean, std)`` of the per-chunk scores as NumPy values.
    """
    scores = []
    num_samples = ps.shape[0]
    for j in range(splits):
        part = ps[(j * num_samples // splits):((j + 1) * num_samples // splits), :]
        marginal = torch.unsqueeze(torch.mean(part, 0), 0)
        log_ratio = torch.log(part) - torch.log(marginal)
        # A probability of exactly 0 contributes 0 to the KL sum (0 * log 0 := 0).
        # Evaluated naively it is 0 * (-inf) = NaN, which would poison the score.
        kl = torch.where(part > 0, part * log_ratio, torch.zeros_like(part))
        kl = torch.mean(torch.sum(kl, 1))
        kl = torch.exp(kl)
        scores.append(kl.unsqueeze(0))
    scores = torch.cat(scores, 0)
    m_scores = torch.mean(scores).detach().cpu().numpy()
    m_std = torch.std(scores).detach().cpu().numpy()
    return m_scores, m_std
@torch.no_grad()
def compute_inception_score_from_dataset(dataset,
                                         splits,
                                         batch_size,
                                         device=torch.device('cuda'),
                                         inception_model=None,
                                         disable_tqdm=False):
    """Compute the Inception Score of a dataset.

    Args:
        - dataset: dataset returning **float (0~1)** images; the first element
          of every batch is used as the image tensor
        - splits: number of chunks handed to ``calculate_kl_div``
        - batch_size: loader batch size
        - device: device the images (and a freshly built model) are moved to
        - inception_model: optional pre-built model exposing ``get_logits``
        - disable_tqdm: turn the progress bar off

    Returns:
        The ``(mean, std)`` pair produced by ``calculate_kl_div``.
    """
    model = inception_model
    if model is None:
        model = get_inception_model().to(device)

    # Batches are drawn in shuffled order, so the split membership is random.
    loader = DataLoader(dataset, shuffle=True, batch_size=batch_size, num_workers=16)
    model.eval()

    batch_probs = []
    for batch in tqdm(loader, disable=disable_tqdm):
        images = batch[0].to(device)
        batch_probs.append(torch.nn.functional.softmax(model.get_logits(images), dim=-1))

    all_probs = torch.cat(batch_probs, 0)
    score_mean, score_std = calculate_kl_div(all_probs, splits=splits)
    return score_mean, score_std
def compute_inception_score_from_files(path,
                                       splits=10,
                                       batch_size=500,
                                       device=torch.device('cuda'),
                                       inception_model=None,
                                       disable_tqdm=False):
    """Inception Score for the pickled sample files found under ``path``.

    Loads the files with ``create_dataset_from_files`` and delegates to
    ``compute_inception_score_from_dataset``.
    """
    file_dataset = create_dataset_from_files(path)
    forwarded = (splits, batch_size, device, inception_model, disable_tqdm)
    return compute_inception_score_from_dataset(file_dataset, *forwarded)
def compute_inception_score_from_tensor(tensor,
                                        splits=10,
                                        batch_size=500,
                                        device=torch.device('cuda'),
                                        inception_model=None,
                                        disable_tqdm=False):
    """Inception Score for images already held in a single tensor.

    Wraps ``tensor`` in a ``TensorDataset`` and delegates to
    ``compute_inception_score_from_dataset``.
    """
    wrapped = torch.utils.data.TensorDataset(tensor)
    forwarded = (splits, batch_size, device, inception_model, disable_tqdm)
    return compute_inception_score_from_dataset(wrapped, *forwarded)
================================================
FILE: LatentLM/metrics/__init__.py
================================================
# Copyright (c) 2022-present, Kakao Brain Corp.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fid import *
from .IS import *
================================================
FILE: LatentLM/metrics/fid.py
================================================
"""Adapted from https://github.com/mseitzer/pytorch-fid/blob/master/src/pytorch_fid/fid_score.py"""
import glob
import logging
import os
from pathlib import Path
import numpy as np
import torch
import torch.nn.functional as F
from scipy import linalg
from torch.utils.data import DataLoader
from tqdm import tqdm
from .inception import InceptionV3
import pickle
class InceptionWrapper(InceptionV3):
    """InceptionV3 variant that returns flat feature vectors or classifier logits."""

    def forward(self, inp):
        """Return the first requested feature block flattened to ``(batch, -1)``."""
        features = super().forward(inp)[0]
        # Blocks other than the final 2048-d one still carry a spatial grid;
        # collapse it with global average pooling before flattening.
        already_pooled = features.size(2) == 1 and features.size(3) == 1
        if not already_pooled:
            features = F.adaptive_avg_pool2d(features, output_size=(1, 1))
        return features.reshape(features.shape[0], -1)

    def get_logits(self, inp):
        """Return only the logits from the parent's ``return_logits=True`` forward pass."""
        _features, class_logits = super().forward(inp, return_logits=True)
        return class_logits
def get_inception_model(dims=2048):
    """Build an ``InceptionWrapper`` that outputs the feature block of width ``dims``.

    ``dims`` must be a key of ``InceptionV3.BLOCK_INDEX_BY_DIM``.
    """
    return InceptionWrapper([InceptionV3.BLOCK_INDEX_BY_DIM[dims]])
def mean_covar_torch(xs):
    """Mean (kept as a 1-row matrix) and unbiased covariance of the rows of ``xs``."""
    center = torch.mean(xs, dim=0, keepdim=True)
    deviations = xs - center
    # Scatter matrix divided by (n - 1) gives the unbiased estimate.
    covariance = (deviations.T @ deviations) / (xs.shape[0] - 1)
    return center, covariance
def mean_covar_numpy(xs):
    """Column-wise mean and covariance of ``xs`` computed with NumPy.

    A torch tensor is first moved to the CPU and converted to an array.
    """
    samples = xs.cpu().numpy() if isinstance(xs, torch.Tensor) else xs
    mean = np.mean(samples, axis=0)
    covariance = np.cov(samples, rowvar=False)
    return mean, covariance
def frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
    """Frechet distance between two Gaussians, computed with NumPy/SciPy.

    For X_1 ~ N(mu_1, C_1) and X_2 ~ N(mu_2, C_2) the distance is
        d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
    Stable version by Dougal J. Sutherland.

    Params:
    -- mu1, mu2      : mean vectors of the two distributions.
    -- sigma1, sigma2: covariance matrices of the two distributions.
    -- eps           : value added to both covariance diagonals when the
                       matrix square root of their product is not finite.
    Returns:
    --               : The Frechet Distance.
    Raises:
    -- ValueError if the matrix square root has a non-negligible imaginary
       part on its diagonal.
    """
    mean_a, mean_b = np.atleast_1d(mu1), np.atleast_1d(mu2)
    cov_a, cov_b = np.atleast_2d(sigma1), np.atleast_2d(sigma2)

    assert mean_a.shape == mean_b.shape, \
        'Training and test mean vectors have different lengths'
    assert cov_a.shape == cov_b.shape, \
        'Training and test covariances have different dimensions'

    mean_gap = mean_a - mean_b

    # The covariance product can be nearly singular; retry with a small ridge.
    sqrt_product, _ = linalg.sqrtm(cov_a.dot(cov_b), disp=False)
    if not np.isfinite(sqrt_product).all():
        msg = ('fid calculation produces singular product; '
               'adding %s to diagonal of cov estimates') % eps
        logging.warning(msg)
        ridge = np.eye(cov_a.shape[0]) * eps
        sqrt_product = linalg.sqrtm((cov_a + ridge).dot(cov_b + ridge))

    # Round-off may leave a tiny imaginary part; drop it if it is negligible.
    if np.iscomplexobj(sqrt_product):
        diagonal_is_real = np.allclose(np.diagonal(sqrt_product).imag, 0, atol=1e-3)
        if not diagonal_is_real:
            worst = np.max(np.abs(sqrt_product.imag))
            raise ValueError('Imaginary component {}'.format(worst))
        sqrt_product = sqrt_product.real

    trace_sqrt = np.trace(sqrt_product)
    return (mean_gap.dot(mean_gap) + np.trace(cov_a) +
            np.trace(cov_b) - 2 * trace_sqrt)
@torch.no_grad()
def compute_statistics_dataset(dataset,
                               batch_size=64,
                               inception_model=None,
                               stage1_model=None,
                               device=torch.device('cuda'),
                               skip_original=False,
                               ):
    """Compute Inception activation statistics for a dataset and, optionally, for its reconstructions.

    Args:
        dataset: dataset whose loader yields ``(images, _)`` pairs; images are
            assumed to be in the -1 ~ 1 range (see the remap below).
        batch_size: loader batch size.
        inception_model: feature extractor; built with ``get_inception_model``
            when omitted.
        stage1_model: optional model called on one image at a time; the first
            element of its output is used as the reconstruction.
        device: device used for the models and batches.
        skip_original: when true, activations of the original images are not
            computed.

    Returns:
        ``(mu_acts, sigma_acts, mu_acts_recon, sigma_acts_recon)``. The first
        pair is ``None`` when ``skip_original`` is set; the second pair is
        ``None`` when no ``stage1_model`` is given.
    """
    # Nothing would be computed at all in this configuration.
    if skip_original and stage1_model is None:
        return None, None, None, None
    if inception_model is None:
        inception_model = get_inception_model().to(device)
    loader = DataLoader(dataset, shuffle=False, pin_memory=True, batch_size=batch_size, num_workers=16)
    inception_model.eval()
    if stage1_model:
        stage1_model.eval()
    acts = []
    acts_recon = []
    # Running pixel statistics, only used for the log message below.
    sample_size_sum = 0.0
    sample_sum = torch.tensor(0.0, device=device)
    sample_sq_sum = torch.tensor(0.0, device=device)
    sample_max = torch.tensor(float('-inf'), device=device)
    sample_min = torch.tensor(float('inf'), device=device)
    for xs, _ in tqdm(loader, desc="compute acts"):
        xs = xs.to(device, non_blocking=True)
        # we are assuming that dataset returns value in -1 ~ 1 -> remap to 0 ~ 1
        xs = torch.clamp(xs*0.5 + 0.5, 0, 1)
        sample_sum += xs.sum()
        sample_sq_sum += xs.pow(2.0).sum()
        sample_size_sum += xs.numel()
        sample_max = max(xs.max(), sample_max)
        sample_min = min(xs.min(), sample_min)
        # A None placeholder is appended when the original activations are skipped.
        act = inception_model(xs).cpu() if not skip_original else None
        acts.append(act)
        if stage1_model:
            # here we assume that stage1 model input & output values are in -1 ~ 1 range
            # this may not cover DiscreteVAE
            imgs = 2. * xs - 1.
            # The stage-1 model is applied to one image at a time.
            xs_recon = torch.cat([
                stage1_model(imgs[i:i+1])[0] for i in range(imgs.shape[0])
            ], dim=0)
            xs_recon = torch.clamp(xs_recon * 0.5 + 0.5, 0, 1)
            act_recon = inception_model(xs_recon).cpu()
            acts_recon.append(act_recon)
    sample_mean = sample_sum.item() / sample_size_sum
    # Standard deviation via E[x^2] - E[x]^2.
    sample_std = ((sample_sq_sum.item() / sample_size_sum) - (sample_mean ** 2.0)) ** 0.5
    logging.info(f'val imgs. stats :: '
                 f'max: {sample_max:.4f}, min: {sample_min:.4f}, mean: {sample_mean:.4f}, std: {sample_std:.4f}')
    acts = torch.cat(acts, dim=0) if not skip_original else None
    if skip_original:
        mu_acts, sigma_acts = None, None
    else:
        mu_acts, sigma_acts = mean_covar_numpy(acts)
    if stage1_model:
        acts_recon = torch.cat(acts_recon, dim=0)
        mu_acts_recon, sigma_acts_recon = mean_covar_numpy(acts_recon)
    else:
        mu_acts_recon, sigma_acts_recon = None, None
    return mu_acts, sigma_acts, mu_acts_recon, sigma_acts_recon
def create_dataset_from_files(path, verbose=False):
    """Load every ``samples*.pkl`` file in ``path`` into one concatenated dataset.

    Each pickle is expected to hold a NumPy array or a torch tensor; arrays are
    converted to tensors. Files are read in sorted name order so that the
    dataset order (and the logged first/last names) is deterministic.

    Args:
        path: directory containing the ``samples*.pkl`` files.
        verbose: additionally log max/min/mean/std over all loaded values.

    Returns:
        ``torch.utils.data.ConcatDataset`` with one ``TensorDataset`` per file.

    Raises:
        FileNotFoundError: if no matching file exists in ``path``.
    """
    samples = []
    # glob returns names in arbitrary order; sort for reproducibility.
    pkl_lists = sorted(glob.glob(os.path.join(path, 'samples*.pkl')))
    if not pkl_lists:
        raise FileNotFoundError(f"no 'samples*.pkl' files found in {path}")
    first_file_name = os.path.basename(pkl_lists[0])
    last_file_name = os.path.basename(pkl_lists[-1])
    logging.info(f'loading generated images from {path}: [{first_file_name}, ..., {last_file_name}]')

    for pkl in tqdm(pkl_lists, desc='loading pickles'):
        with open(pkl, 'rb') as f:
            # NOTE(security): pickle.load executes arbitrary code embedded in the
            # file; only point this function at sample files you produced yourself.
            s = pickle.load(f)
        if isinstance(s, np.ndarray):
            s = torch.from_numpy(s)
        samples.append(s)

    datasets = [torch.utils.data.TensorDataset(sample) for sample in samples]
    dataset = torch.utils.data.ConcatDataset(datasets)

    if verbose:
        # ``Tensor.size`` is a method, so the element count must come from numel().
        total_size = sum(sample.numel() for sample in samples)
        sample_mean = sum(sample.sum().item() for sample in samples) / total_size
        sample_std = (sum(((sample - sample_mean) ** 2).sum().item() for sample in samples) / total_size) ** 0.5
        sample_max = max(sample.max().item() for sample in samples)
        sample_min = min(sample.min().item() for sample in samples)
        logging.info(f'gen. imgs. stats :: '
                     f'max: {sample_max:.4f}, min: {sample_min:.4f}, mean: {sample_mean:.4f}, std: {sample_std:.4f}')
    return dataset
@torch.no_grad()
def compute_activations_from_dataset(dataset,
                                     batch_size=64,
                                     inception_model=None,
                                     device=torch.device('cuda'),
                                     normalized=False,
                                     ):
    """Run every image of ``dataset`` through the Inception model.

    Args:
        dataset: dataset whose batches carry the image tensor as first element.
        batch_size: loader batch size.
        inception_model: feature extractor; built with ``get_inception_model``
            when omitted.
        device: device used for the model and the batches.
        normalized: when true, inputs are remapped with ``0.5 * x + 0.5``
            before being fed to the model.

    Returns:
        CPU tensor with the activations of all images concatenated along dim 0.
    """
    model = inception_model
    if model is None:
        model = get_inception_model().to(device)

    batches = DataLoader(dataset, shuffle=False, pin_memory=True, batch_size=batch_size, num_workers=16)
    model.eval()

    collected = []
    for batch in tqdm(batches, desc="compute acts (gen. imgs)"):
        images = batch[0].to(device, non_blocking=True)
        if normalized:
            images = 0.5 * images + 0.5
        collected.append(model(images).cpu())

    return torch.cat(collected, dim=0)
def compute_statistics_from_files(path,
                                  batch_size=64,
                                  inception_model=None,
                                  device=torch.device('cuda'),
                                  return_acts=False,
                                  ):
    """Activation mean and covariance for the pickled samples under ``path``.

    Returns ``(mu, sigma)``, or ``(mu, sigma, acts)`` when ``return_acts`` is set.
    """
    acts = compute_activations_from_dataset(create_dataset_from_files(path),
                                            batch_size=batch_size,
                                            inception_model=inception_model,
                                            device=device)
    mu_acts, sigma_acts = mean_covar_numpy(acts)
    if not return_acts:
        return mu_acts, sigma_acts
    return mu_acts, sigma_acts, acts
def compute_statistics_from_tensor(tensor,
                                   batch_size=64,
                                   inception_model=None,
                                   device=torch.device('cuda'),
                                   return_acts=False,
                                   ):
    """Activation mean and covariance for images held in a single tensor.

    Returns ``(mu, sigma)``, or ``(mu, sigma, acts)`` when ``return_acts`` is set.
    """
    acts = compute_activations_from_dataset(torch.utils.data.TensorDataset(tensor),
                                            batch_size=batch_size,
                                            inception_model=inception_model,
                                            device=device)
    mu_acts, sigma_acts = mean_covar_numpy(acts)
    if not return_acts:
        return mu_acts, sigma_acts
    return mu_acts, sigma_acts, acts
def compute_rfid(dataset,
                 stage1_model,
                 batch_size=64,
                 device=torch.device('cuda'),
                 ):
    """Frechet distance between a dataset's images and their ``stage1_model`` reconstructions."""
    statistics = compute_statistics_dataset(dataset,
                                            stage1_model=stage1_model,
                                            batch_size=batch_size,
                                            device=device,
                                            skip_original=False,
                                            )
    # Order is (mu_orig, sigma_orig, mu_recon, sigma_recon).
    mu_orig, sigma_orig, mu_recon, sigma_recon = statistics
    return frechet_distance(mu_orig, sigma_orig, mu_recon, sigma_recon)
def compute_fid(fake_path,
                ref_stat_path,
                batch_size=64,
                device=torch.device('cuda'),
                ):
    """FID between the pickled samples in ``fake_path`` and stored reference statistics.

    Activations and their statistics are cached in ``<fake_path>/acts.npz``; an
    existing cache file is reused instead of recomputing. The reference file
    must provide the arrays ``mu`` and ``sigma``.
    """
    act_path = Path(fake_path) / 'acts.npz'

    if act_path.exists():
        logging.info(f'precomputed activations found: {act_path.as_posix()}')
    else:
        mu, sigma, acts = compute_statistics_from_files(fake_path,
                                                        batch_size=batch_size,
                                                        device=device,
                                                        return_acts=True,
                                                        )
        np.savez(act_path, acts=acts, mu=mu, sigma=sigma)
        logging.info(f'activations saved to {act_path.as_posix()}')

    # The statistics are always read back from the cache file.
    acts_fake = np.load(act_path)
    stats_ref = np.load(ref_stat_path)
    mu_ref = stats_ref['mu']
    sigma_ref = stats_ref['sigma']
    logging.info(f'reference batch stats loaded from {ref_stat_path}')

    mu_fake = acts_fake['mu']
    sigma_fake = acts_fake['sigma']
    logging.info('computing fid...')
    fid = frechet_distance(mu_ref, sigma_ref, mu_fake, sigma_fake)
    logging.info('FID: {fid:.4f}'.format(fid=fid))
    return fid
def compute_fid_without_store(tensor, ref_stat_path, batch_size=64, device=torch.device('cuda')):
    """FID between an in-memory image tensor and stored reference statistics.

    Unlike ``compute_fid`` nothing is cached on disk. The reference file must
    provide the arrays ``mu`` and ``sigma``.
    """
    print('Compute mu and sigma for fake images...')
    fake_stats = compute_statistics_from_tensor(tensor, batch_size=batch_size, device=device)
    mu_fake, sigma_fake = fake_stats

    reference = np.load(ref_stat_path)
    mu_ref = reference['mu']
    sigma_ref = reference['sigma']
    print(f'reference batch stats loaded from {ref_stat_path}')

    print('computing fid...')
    fid = frechet_distance(mu_ref, sigma_ref, mu_fake, sigma_fake)
    print('FID: {fid:.4f}'.format(fid=fid))
    return fid
================================================
FILE: LatentLM/metrics/inception.py
================================================
"""https://github.com/mseitzer/pytorch-fid/blob/master/src/pytorch_fid/inception.py
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torch.utils.model_zoo import load_url
# Inception weights ported to Pytorch from
# http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth' # noqa: E501
class InceptionV3(nn.Module):
    """Pretrained InceptionV3 network returning feature maps"""

    # Index of default block of inception to return,
    # corresponds to output of final average pooling
    DEFAULT_BLOCK_INDEX = 3

    # Maps feature dimensionality to their output blocks indices
    BLOCK_INDEX_BY_DIM = {
        64: 0,    # First max pooling features
        192: 1,   # Second max pooling features
        768: 2,   # Pre-aux classifier features
        2048: 3   # Final average pooling features
    }

    def __init__(self,
                 output_blocks=[DEFAULT_BLOCK_INDEX],
                 resize_input=True,
                 normalize_input=True,
                 requires_grad=False,
                 use_fid_inception=True):
        """Build pretrained InceptionV3

        Parameters
        ----------
        output_blocks : list of int
            Indices of blocks to return features of. Possible values are:
                - 0: corresponds to output of first max pooling
                - 1: corresponds to output of second max pooling
                - 2: corresponds to output which is fed to aux classifier
                - 3: corresponds to output of final average pooling
        resize_input : bool
            If true, bilinearly resizes input to width and height 299 before
            feeding input to model. As the network without fully connected
            layers is fully convolutional, it should be able to handle inputs
            of arbitrary size, so resizing might not be strictly needed
        normalize_input : bool
            If true, scales the input from range (0, 1) to the range the
            pretrained Inception network expects, namely (-1, 1)
        requires_grad : bool
            If true, parameters of the model require gradients. Possibly useful
            for finetuning the network
        use_fid_inception : bool
            If true, uses the pretrained Inception model used in Tensorflow's
            FID implementation. If false, uses the pretrained Inception model
            available in torchvision. The FID Inception model has different
            weights and a slightly different structure from torchvision's
            Inception model. If you want to compute FID scores, you are
            strongly advised to set this parameter to true to get comparable
            results.
        """
        super(InceptionV3, self).__init__()

        self.resize_input = resize_input
        self.normalize_input = normalize_input
        # sorted() copies, so the (shared) default list is never mutated.
        self.output_blocks = sorted(output_blocks)
        self.last_needed_block = max(output_blocks)

        assert self.last_needed_block <= 3, \
            'Last possible output block index is 3'

        self.blocks = nn.ModuleList()

        if use_fid_inception:
            inception = fid_inception_v3()
        else:
            inception = _inception_v3(pretrained=True)

        # Block 0: input to maxpool1
        block0 = [
            inception.Conv2d_1a_3x3,
            inception.Conv2d_2a_3x3,
            inception.Conv2d_2b_3x3,
            nn.MaxPool2d(kernel_size=3, stride=2)
        ]
        self.blocks.append(nn.Sequential(*block0))

        # Block 1: maxpool1 to maxpool2
        if self.last_needed_block >= 1:
            block1 = [
                inception.Conv2d_3b_1x1,
                inception.Conv2d_4a_3x3,
                nn.MaxPool2d(kernel_size=3, stride=2)
            ]
            self.blocks.append(nn.Sequential(*block1))

        # Block 2: maxpool2 to aux classifier
        if self.last_needed_block >= 2:
            block2 = [
                inception.Mixed_5b,
                inception.Mixed_5c,
                inception.Mixed_5d,
                inception.Mixed_6a,
                inception.Mixed_6b,
                inception.Mixed_6c,
                inception.Mixed_6d,
                inception.Mixed_6e,
            ]
            self.blocks.append(nn.Sequential(*block2))

        # Block 3: aux classifier to final avgpool
        if self.last_needed_block >= 3:
            block3 = [
                inception.Mixed_7a,
                inception.Mixed_7b,
                inception.Mixed_7c,
                nn.AdaptiveAvgPool2d(output_size=(1, 1))
            ]
            self.blocks.append(nn.Sequential(*block3))

        # Size the classifier head from the source network instead of
        # hard-coding 2048 -> 1008: the FID network has 1008 outputs, but the
        # torchvision network (use_fid_inception=False) has a different head
        # size, which made the weight copy below fail with a shape mismatch.
        self.fc = nn.Linear(inception.fc.in_features, inception.fc.out_features, bias=True)
        with torch.no_grad():
            self.fc.weight.copy_(inception.fc.weight)
            self.fc.bias.copy_(inception.fc.bias)

        for param in self.parameters():
            param.requires_grad = requires_grad

    def forward(self, inp, return_logits=False):
        """Get Inception feature maps

        Parameters
        ----------
        inp : torch.autograd.Variable
            Input tensor of shape Bx3xHxW. Values are expected to be in
            range (0, 1)
        return_logits : bool
            If true, the output of the last built block is additionally
            flattened and passed through the classifier head, and a
            ``(features, logits)`` pair is returned.

        Returns
        -------
        List of torch.autograd.Variable, corresponding to the selected output
        block, sorted ascending by index
        """
        outp = []
        x = inp

        if self.resize_input:
            x = F.interpolate(x,
                              size=(299, 299),
                              mode='bilinear',
                              align_corners=False)

        if self.normalize_input:
            x = 2 * x - 1  # Scale from range (0, 1) to range (-1, 1)

        # Every built block is run (no early exit) so that ``x`` ends up as the
        # output of the last block, which the logits path below relies on.
        for idx, block in enumerate(self.blocks):
            x = block(x)
            if idx in self.output_blocks:
                outp.append(x)

            # if idx == self.last_needed_block:
            #     break

        if return_logits:
            # training=False makes the dropout a no-op.
            x = F.dropout(x, training=False)
            x = torch.flatten(x, 1)
            logit = self.fc(x)
            return outp, logit
        else:
            return outp
def _inception_v3(*args, **kwargs):
    """Thin wrapper around `torchvision.models.inception_v3`.

    Always passes ``init_weights=False`` (overriding any caller-supplied value)
    so that torchvision skips its default weight initialization.
    See https://github.com/mseitzer/pytorch-fid/issues/28.
    """
    forced = dict(kwargs, init_weights=False)
    return torchvision.models.inception_v3(*args, **forced)
def fid_inception_v3():
    """Build the pretrained Inception model used for FID computation.

    The FID network has different weights and a slightly different structure
    than torchvision's Inception. A torchvision model is created first, the
    differing blocks are swapped for their patched versions, and finally the
    FID weights are downloaded and loaded.
    """
    net = _inception_v3(num_classes=1008,
                        aux_logits=False,
                        pretrained=False)

    # (attribute name, replacement factory) in the original construction order.
    patches = (
        ('Mixed_5b', lambda: FIDInceptionA(192, pool_features=32)),
        ('Mixed_5c', lambda: FIDInceptionA(256, pool_features=64)),
        ('Mixed_5d', lambda: FIDInceptionA(288, pool_features=64)),
        ('Mixed_6b', lambda: FIDInceptionC(768, channels_7x7=128)),
        ('Mixed_6c', lambda: FIDInceptionC(768, channels_7x7=160)),
        ('Mixed_6d', lambda: FIDInceptionC(768, channels_7x7=160)),
        ('Mixed_6e', lambda: FIDInceptionC(768, channels_7x7=192)),
        ('Mixed_7b', lambda: FIDInceptionE_1(1280)),
        ('Mixed_7c', lambda: FIDInceptionE_2(2048)),
    )
    for attribute, build in patches:
        setattr(net, attribute, build())

    net.load_state_dict(load_url(FID_WEIGHTS_URL, progress=True))
    return net
class FIDInceptionA(torchvision.models.inception.InceptionA):
    """InceptionA block patched for FID computation"""

    def __init__(self, in_channels, pool_features):
        super().__init__(in_channels, pool_features)

    def forward(self, x):
        """Concatenate the 1x1, 5x5, double-3x3 and pooling branches along dim 1."""
        out_1x1 = self.branch1x1(x)
        out_5x5 = self.branch5x5_2(self.branch5x5_1(x))
        out_3x3dbl = self.branch3x3dbl_3(self.branch3x3dbl_2(self.branch3x3dbl_1(x)))

        # Patch: Tensorflow's average pool does not use the padded zero's in
        # its average calculation
        pooled = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
                              count_include_pad=False)
        out_pool = self.branch_pool(pooled)

        return torch.cat([out_1x1, out_5x5, out_3x3dbl, out_pool], 1)
class FIDInceptionC(torchvision.models.inception.InceptionC):
    """InceptionC block patched for FID computation"""

    def __init__(self, in_channels, channels_7x7):
        super().__init__(in_channels, channels_7x7)

    def forward(self, x):
        """Concatenate the 1x1, 7x7, double-7x7 and pooling branches along dim 1."""
        out_1x1 = self.branch1x1(x)

        out_7x7 = x
        for stage in (self.branch7x7_1, self.branch7x7_2, self.branch7x7_3):
            out_7x7 = stage(out_7x7)

        out_7x7dbl = x
        for stage in (self.branch7x7dbl_1, self.branch7x7dbl_2, self.branch7x7dbl_3,
                      self.branch7x7dbl_4, self.branch7x7dbl_5):
            out_7x7dbl = stage(out_7x7dbl)

        # Patch: Tensorflow's average pool does not use the padded zero's in
        # its average calculation
        pooled = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
                              count_include_pad=False)
        out_pool = self.branch_pool(pooled)

        return torch.cat([out_1x1, out_7x7, out_7x7dbl, out_pool], 1)
class FIDInceptionE_1(torchvision.models.inception.InceptionE):
    """First InceptionE block patched for FID computation"""

    def __init__(self, in_channels):
        super().__init__(in_channels)

    def forward(self, x):
        """Concatenate the 1x1, 3x3, double-3x3 and average-pooling branches along dim 1."""
        out_1x1 = self.branch1x1(x)

        stem_3x3 = self.branch3x3_1(x)
        out_3x3 = torch.cat(
            [self.branch3x3_2a(stem_3x3), self.branch3x3_2b(stem_3x3)], 1)

        stem_3x3dbl = self.branch3x3dbl_2(self.branch3x3dbl_1(x))
        out_3x3dbl = torch.cat(
            [self.branch3x3dbl_3a(stem_3x3dbl), self.branch3x3dbl_3b(stem_3x3dbl)], 1)

        # Patch: Tensorflow's average pool does not use the padded zero's in
        # its average calculation
        pooled = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
                              count_include_pad=False)
        out_pool = self.branch_pool(pooled)

        return torch.cat([out_1x1, out_3x3, out_3x3dbl, out_pool], 1)
class FIDInceptionE_2(torchvision.models.inception.InceptionE):
    """Second InceptionE block patched for FID computation"""

    def __init__(self, in_channels):
        super().__init__(in_channels)

    def forward(self, x):
        """Concatenate the 1x1, 3x3, double-3x3 and max-pooling branches along dim 1."""
        out_1x1 = self.branch1x1(x)

        stem_3x3 = self.branch3x3_1(x)
        out_3x3 = torch.cat(
            [self.branch3x3_2a(stem_3x3), self.branch3x3_2b(stem_3x3)], 1)

        stem_3x3dbl = self.branch3x3dbl_2(self.branch3x3dbl_1(x))
        out_3x3dbl = torch.cat(
            [self.branch3x3dbl_3a(stem_3x3dbl), self.branch3x3dbl_3b(stem_3x3dbl)], 1)

        # Patch: The FID Inception model uses max pooling instead of average
        # pooling. This is likely an error in this specific Inception
        # implementation, as other Inception models use average pooling here
        # (which matches the description in the paper).
        pooled = F.max_pool2d(x, kernel_size=3, stride=1, padding=1)
        out_pool = self.branch_pool(pooled)

        return torch.cat([out_1x1, out_3x3, out_3x3dbl, out_pool], 1)
================================================
FILE: LatentLM/models/DiT.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
import functools
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
from timm.models.vision_transformer import PatchEmbed
from .RMSNorm import RMSNorm
def modulate(x, shift, scale):
    """Affine modulation: multiply ``x`` by ``(1 + scale)`` and then add ``shift``."""
    scaled = x * (1 + scale)
    return scaled + shift
#################################################################################
# Embedding Layers for Timesteps and Class Labels #
#################################################################################
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.

    Each timestep is first turned into a sinusoidal feature vector of size
    ``frequency_embedding_size`` and then projected to ``hidden_size`` by a
    two-layer, bias-free MLP with a SiLU activation in between.
    """
    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=False),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=False),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                          These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings. The result has the
                 dtype of ``t`` when ``t`` is floating point and is float32
                 otherwise.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        freqs = torch.exp(
            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
        ).to(t.device)
        args = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            # Odd output size: pad one zero column so the width is exactly `dim`.
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        # Follow the dtype of `t` only when it is a floating type. Casting to an
        # integer dtype would truncate every sin/cos value to -1, 0 or 1 and
        # hand an integer tensor to the MLP.
        if t.is_floating_point():
            embedding = embedding.to(t.dtype)
        return embedding

    def forward(self, t):
        """Return the MLP projection of the sinusoidal embedding of ``t``."""
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
        t_emb = self.mlp(t_freq)
        return t_emb
class LabelEmbedder(nn.Module):
    """
    Looks up an embedding vector for each class label and optionally replaces
    labels by a "dropped" index to support classifier-free guidance.

    When ``dropout_prob`` is positive the embedding table gets one extra row at
    index ``num_classes``; dropped labels are mapped to that row.
    """
    def __init__(self, num_classes, hidden_size, dropout_prob):
        super().__init__()
        # Reserve the extra "dropped label" row only when dropout is enabled.
        table_rows = num_classes + (1 if dropout_prob > 0 else 0)
        self.embedding_table = nn.Embedding(table_rows, hidden_size)
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob

    def token_drop(self, labels, force_drop_ids=None):
        """
        Replace selected labels by ``num_classes``.

        Entries are selected where ``force_drop_ids == 1`` when that argument
        is given, otherwise independently at random with probability
        ``dropout_prob``.
        """
        if force_drop_ids is not None:
            mask = force_drop_ids == 1
        else:
            mask = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
        return torch.where(mask, self.num_classes, labels)

    def forward(self, labels, train, force_drop_ids=None):
        """
        Embed ``labels``; label dropping happens first when it is forced via
        ``force_drop_ids`` or when ``train`` is true and dropout is enabled.
        """
        forced = force_drop_ids is not None
        if forced or (train and self.dropout_prob > 0):
            labels = self.token_drop(labels, force_drop_ids)
        return self.embedding_table(labels)
class SwiGLU(nn.Module):
    """
    Gated feed-forward layer computing ``fc2(silu(fc1(x)) * gate(x))``.

    All three linear layers are bias-free. The same dropout module is applied
    once to the gated hidden activation and once to the output of ``fc2``.
    """
    def __init__(self, embed_dim, ffn_dim, drop=0.):
        super().__init__()
        self.embed_dim = embed_dim
        self.fc1 = nn.Linear(embed_dim, ffn_dim, bias=False)
        self.gate = nn.Linear(embed_dim, ffn_dim, bias=False)
        self.fc2 = nn.Linear(ffn_dim, embed_dim, bias=False)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        # Collapse every leading dimension so the linear layers see a 2-D
        # input, then restore the caller's shape at the end.
        original_shape = x.shape
        flat = x.reshape(-1, original_shape[-1])
        hidden = self.drop(F.silu(self.fc1(flat)) * self.gate(flat))
        projected = self.drop(self.fc2(hidden))
        return projected.view(original_shape)
#################################################################################
# Core DiT Model #
#################################################################################
class Attention(nn.Module):
    """
    Multi-head self-attention with optional grouped-query attention.

    A single linear layer produces ``num_heads`` query heads and
    ``num_kv_heads`` key/value heads of width ``dim // num_heads``. When
    ``num_kv_heads`` is smaller than ``num_heads`` every key/value head is
    shared by ``num_heads // num_kv_heads`` consecutive query heads.

    Args:
        dim: model width; also the width of the output projection.
        num_heads: number of query heads.
        num_kv_heads: number of key/value heads.
        qkv_bias: whether the fused q/k/v projection has a bias.
        attn_drop: dropout probability applied inside attention while training.
        proj_drop: dropout probability applied after the output projection.
    """
    def __init__(self, dim, num_heads=8, num_kv_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.dim = dim
        self.head_dim = dim // num_heads
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        # How many query heads share one key/value head.
        self.n_rep = num_heads // num_kv_heads
        self.scale = self.head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim + 2 * self.num_kv_heads * self.head_dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=False)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (B, N, C) and return a tensor of the same shape."""
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, self.num_heads + 2 * self.num_kv_heads, self.head_dim)
        q, k, v = torch.split(qkv, [self.num_heads, self.num_kv_heads, self.num_kv_heads], dim=2)
        # (B, N, heads, head_dim) -> (B, heads, N, head_dim)
        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
        if self.n_rep > 1:
            # scaled_dot_product_attention needs matching head counts, so
            # expand each key/value head to cover its group of query heads.
            k = k.repeat_interleave(self.n_rep, dim=1)
            v = v.repeat_interleave(self.n_rep, dim=1)
        x = F.scaled_dot_product_attention(
            q, k, v,
            # Only the probability of the Dropout module is used; dropout is
            # performed inside the fused attention call and only in training.
            dropout_p=self.attn_drop.p if self.training else 0.,
        )
        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class DiTBlock(nn.Module):
    """
    Transformer block whose attention and MLP sub-layers are conditioned with
    adaptive layer norm zero (adaLN-Zero): the conditioning vector yields a
    shift, scale and gate for each of the two sub-layers.
    """
    def __init__(self, hidden_size, num_heads, num_kv_heads, mlp_ratio=4.0, proj_drop=0., attn_drop=0., **block_kwargs):
        super().__init__()
        self.norm1 = RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = Attention(
            hidden_size,
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            qkv_bias=False,
            proj_drop=proj_drop,
            attn_drop=attn_drop,
            **block_kwargs,
        )
        self.norm2 = RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        # SwiGLU width: two thirds of hidden_size * mlp_ratio, rounded down to
        # a multiple of 64.
        ffn_width = int(hidden_size * mlp_ratio * 2 / 3 / 64) * 64
        self.mlp = SwiGLU(hidden_size, ffn_width, drop=proj_drop)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=False),
        )

    def forward(self, x, c):
        # One projection of the conditioning yields all six modulation terms.
        modulation = self.adaLN_modulation(c)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = modulation.chunk(6, dim=-1)

        attn_out = self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
        x = x + gate_msa * attn_out

        mlp_out = self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
        return x + gate_mlp * mlp_out
class FinalLayer(nn.Module):
    """
    Output head of DiT: a conditioned normalisation (shift and scale derived
    from the conditioning vector) followed by a bias-free linear projection to
    ``output_size``.
    """
    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.norm_final = RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, output_size, bias=False)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=False),
        )

    def forward(self, x, c):
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
        normed = modulate(self.norm_final(x), shift, scale)
        return self.linear(normed)
class DiT(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    The noisy input is embedded either by a ``PatchEmbed`` (image-like input)
    or, when ``flatten_input`` is true, by a bias-free linear layer. A fixed
    sin-cos positional embedding is added, the tokens pass through ``depth``
    ``DiTBlock`` layers conditioned on the sum of the timestep and label
    embeddings, and ``FinalLayer`` projects back to the input channels.
    """
    def __init__(
        self,
        input_size=32,
        patch_size=1,
        flatten_input=False,
        in_channels=4,
        hidden_size=1152,
        depth=28,
        num_heads=16,
        num_kv_heads=None,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        num_classes=1000,
        drop=0.0,
        norm_layer=None
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = in_channels
        self.input_size = input_size
        # With flattened input there is no patching, so the effective patch size is 1.
        self.patch_size = patch_size if not flatten_input else 1
        self.num_heads = num_heads
        # Default to standard multi-head attention (one key/value head per query head).
        self.num_kv_heads = num_kv_heads if num_kv_heads is not None else num_heads
        # Number of tokens: input_size^2 divided by patch_size^2.
        self.flatten_input_size = input_size * input_size // self.patch_size // self.patch_size
        self.flatten_input = flatten_input

        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, strict_img_size=False, norm_layer=norm_layer) if not flatten_input else nn.Linear(in_channels, hidden_size, bias=False)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
        # Will use fixed sin-cos embedding:
        self.pos_embed = nn.Parameter(torch.zeros(1, self.flatten_input_size, hidden_size), requires_grad=False)

        self.blocks = nn.ModuleList([
            DiTBlock(hidden_size, self.num_heads, self.num_kv_heads, mlp_ratio=mlp_ratio, proj_drop=drop, attn_drop=drop) for _ in range(depth)
        ])
        self.final_layer = FinalLayer(hidden_size, self.patch_size * self.patch_size * self.out_channels)
        self.initialize_weights()

    @property
    def device(self):
        """Device of the first parameter of the module."""
        return next(self.parameters()).device

    @property
    def dtype(self):
        """Dtype of the first parameter of the module."""
        return next(self.parameters()).dtype

    def initialize_weights(self):
        """Apply the custom initialisation on top of the PyTorch defaults."""
        # Initialize transformer layers:
        # (only biases are touched here; linear weights keep their default init)
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

        # Initialize (and freeze) pos_embed by sin-cos embedding:
        # 2-D grid embedding for patched input, 1-D sequence embedding for flattened input.
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5)) if not self.flatten_input \
            else get_1d_sincos_pos_embed(self.pos_embed.shape[-1], self.flatten_input_size)
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        if not self.flatten_input:
            nn.init.constant_(self.x_embedder.proj.bias, 0)

        # Initialize label embedding table:
        nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)

    def unpatchify(self, x):
        """
        x: (N, T, patch_size**2 * C)
        imgs: (N, C, H, W) with H = W = sqrt(T) * patch_size

        Only square token grids are supported (asserted below).
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        h = w = int(x.shape[1] ** 0.5)
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        # Move channels first and interleave each grid axis with its patch axis.
        x = torch.einsum('nhwpqc->nchpwq', x)
        imgs = x.reshape(shape=(x.shape[0], c, h * p, h * p))
        return imgs

    def forward(self, x_noise, t, y, **kwargs):
        """
        Forward pass of DiT.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        t: (N,) tensor of diffusion timesteps
        y: (N,) tensor of class labels

        Extra keyword arguments are accepted and ignored.
        """
        x = self.x_embedder(x_noise) + self.pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        t = self.t_embedder(t)                   # (N, D)
        y = self.y_embedder(y, self.training)    # (N, D)
        c = (t + y).unsqueeze(1)                 # (N, 1, D), broadcast over the T tokens
        for block in self.blocks:
            x = block(x, c)                      # (N, T, D)
        x = self.final_layer(x, c)                # (N, T, patch_size ** 2 * out_channels)
        if not self.flatten_input:
            x = self.unpatchify(x)                   # (N, out_channels, H, W)
        return x

    def sample_with_cfg(self, y, cfg_scale, sample_func):
        """
        Draw samples with classifier-free guidance.

        Gaussian noise with one entry per element of ``y`` is passed to
        ``sample_func`` together with ``forward_with_cfg`` bound to ``y`` and
        ``cfg_scale``; only the first half of the returned batch is kept.
        NOTE(review): this presumably expects ``y`` to hold the conditional
        labels in its first half and the dropped-label index in its second
        half -- confirm against the caller.
        """
        bsz = y.shape[0]
        z = torch.randn(bsz, self.in_channels, self.input_size, self.input_size, device=self.device, dtype=self.dtype) if not self.flatten_input else torch.randn(bsz, self.flatten_input_size, self.in_channels, device=self.device, dtype=self.dtype)
        samples = sample_func(functools.partial(self.forward_with_cfg, y=y, cfg_scale=cfg_scale), z)
        samples, _ = samples.chunk(2, dim=0)
        return samples

    def forward_with_cfg(self, x, t, y, cfg_scale):
        """
        Forward pass of DiT, but also batches the unconditional forward pass for classifier-free guidance.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        # Both halves of the batch are fed the same (first-half) input.
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        eps = self.forward(combined, t, y)
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        # Guided prediction: move from the unconditional towards the conditional output.
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        # Duplicate so the output keeps the full batch size.
        eps = torch.cat([half_eps, half_eps], dim=0)
        return eps
#################################################################################
# Sine/Cosine Positional Embedding Functions #
#################################################################################
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
    """
    Build a sin-cos positional embedding for a square ``grid_size`` x ``grid_size`` grid.

    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim]; when ``cls_token`` is set and
    ``extra_tokens`` is positive, ``extra_tokens`` all-zero rows are prepended.
    """
    axis = np.arange(grid_size, dtype=np.float32)
    # Height and width share the same coordinates; meshgrid gets the width axis first.
    mesh = np.stack(np.meshgrid(axis, axis), axis=0)
    mesh = mesh.reshape([2, 1, grid_size, grid_size])

    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, mesh)
    if not (cls_token and extra_tokens > 0):
        return pos_embed
    padding = np.zeros([extra_tokens, embed_dim])
    return np.concatenate([padding, pos_embed], axis=0)
def get_1d_sincos_pos_embed(embed_dim, seq_len, cls_token=False, extra_tokens=0):
    """
    Build a sin-cos positional embedding for positions ``0 .. seq_len - 1``.

    seq_len: int of the sequence length
    return:
    pos_embed: [seq_len, embed_dim]; when ``cls_token`` is set and
    ``extra_tokens`` is positive, ``extra_tokens`` all-zero rows are prepended.
    """
    positions = np.arange(seq_len, dtype=np.float32)
    pos_embed = get_1d_sincos_pos_embed_from_grid(embed_dim, positions)
    if not (cls_token and extra_tokens > 0):
        return pos_embed
    padding = np.zeros([extra_tokens, embed_dim])
    return np.concatenate([padding, pos_embed], axis=0)
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """
    Encode a two-plane coordinate grid: each plane gets ``embed_dim // 2``
    sin-cos features and the two halves are concatenated, giving (H*W, D).
    """
    assert embed_dim % 2 == 0

    # Half of the dimensions go to grid[0], the other half to grid[1].
    per_axis = embed_dim // 2
    halves = [get_1d_sincos_pos_embed_from_grid(per_axis, grid[axis]) for axis in (0, 1)]  # each (H*W, D/2)
    return np.concatenate(halves, axis=1)  # (H*W, D)
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    Sin-cos encoding of a set of positions.

    embed_dim: output dimension for each position (must be even)
    pos: array of positions to be encoded; it is flattened to size (M,)
    out: (M, D) -- the first D/2 columns are sines, the last D/2 are cosines
    """
    assert embed_dim % 2 == 0
    half_dim = embed_dim // 2

    # Frequencies 1 / 10000^(i / (D/2)) for i = 0 .. D/2 - 1, in float64.
    exponents = np.arange(half_dim, dtype=np.float64) / (embed_dim / 2.)
    frequencies = 1. / 10000**exponents  # (D/2,)

    flat_pos = pos.reshape(-1)  # (M,)
    angles = np.einsum('m,d->md', flat_pos, frequencies)  # (M, D/2), outer product

    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
#################################################################################
# DiT Configs #
#################################################################################
def DiT_13B(**kwargs):
    """DiT with depth 40, hidden size 5120 and 40 attention heads."""
    return DiT(
        depth=40,
        hidden_size=5120,
        num_heads=40,
        **kwargs,
    )


def DiT_7B(**kwargs):
    """DiT with depth 32, hidden size 4096 and 32 attention heads."""
    return DiT(
        depth=32,
        hidden_size=4096,
        num_heads=32,
        **kwargs,
    )


def DiT_3B(**kwargs):
    """DiT with depth 32, hidden size 2560 and 20 attention heads."""
    return DiT(
        depth=32,
        hidden_size=2560,
        num_heads=20,
        **kwargs,
    )


def DiT_XL(**kwargs):
    """DiT with depth 24, hidden size 2048 and 16 attention heads."""
    return DiT(
        depth=24,
        hidden_size=2048,
        num_heads=16,
        **kwargs,
    )


def DiT_Large(**kwargs):
    """DiT with depth 24, hidden size 1536 and 12 attention heads."""
    return DiT(
        depth=24,
        hidden_size=1536,
        num_heads=12,
        **kwargs,
    )


def DiT_Medium(**kwargs):
    """DiT with depth 24, hidden size 1024 and 16 attention heads."""
    return DiT(
        depth=24,
        hidden_size=1024,
        num_heads=16,
        **kwargs,
    )


def DiT_Base(**kwargs):
    """DiT with depth 12, hidden size 768 and 12 attention heads."""
    return DiT(
        depth=12,
        hidden_size=768,
        num_heads=12,
        **kwargs,
    )


# Registry mapping a model name to its factory function.
DiT_models = {
    'DiT-13B': DiT_13B,
    'DiT-7B': DiT_7B,
    'DiT-3B': DiT_3B,
    'DiT-XL': DiT_XL,
    'DiT-Large': DiT_Large,
    'DiT-Medium': DiT_Medium,
    'DiT-Base': DiT_Base,
}
================================================
FILE: LatentLM/models/EMA.py
================================================
import copy
from typing import Any, Dict, Iterable, Optional, Union
import torch
# Adapted from torch-ema https://github.com/fadel/pytorch_ema/blob/master/torch_ema/ema.py#L14
class EMAModel:
    """
    Exponential Moving Average of models weights
    """

    def __init__(
        self,
        parameters: Iterable[torch.nn.Parameter],
        decay: float = 0.9999,
        min_decay: float = 0.0,
        update_after_step: int = 0,
        use_ema_warmup: bool = False,
        inv_gamma: Union[float, int] = 1.0,
        power: Union[float, int] = 2 / 3,
        model_cls: Optional[Any] = None,
        model_config: Dict[str, Any] = None,
        **kwargs,
    ):
        """
        Args:
            parameters (Iterable[torch.nn.Parameter]): The parameters to track.
            decay (float): The decay factor for the exponential moving average.
            min_decay (float): The minimum decay factor for the exponential moving average.
            update_after_step (int): The number of steps to wait before starting to update the EMA weights.
            use_ema_warmup (bool): Whether to use EMA warmup.
            inv_gamma (float):
                Inverse multiplicative factor of EMA warmup. Default: 1. Only used if `use_ema_warmup` is True.
            power (float): Exponential factor of EMA warmup. Default: 2/3. Only used if `use_ema_warmup` is True.
            model_cls (Optional[Any]): Stored on the instance as-is; not used by this class.
            model_config (Dict[str, Any]): Stored on the instance as-is; not used by this class.
            **kwargs: Accepted and ignored.

        The shadow weights are detached clones of `parameters`, so they start on the same device and with the
        same dtype as the tracked parameters.

        @crowsonkb's notes on EMA Warmup:
            If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
            to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
            gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
            at 215.4k steps).
        """
        parameters = list(parameters)
        self.shadow_params = [p.clone().detach() for p in parameters]

        self.temp_stored_params = None

        self.decay = decay
        self.min_decay = min_decay
        self.update_after_step = update_after_step
        self.use_ema_warmup = use_ema_warmup
        self.inv_gamma = inv_gamma
        self.power = power
        self.optimization_step = 0
        self.cur_decay_value = None  # set in `step()`

        self.model_cls = model_cls
        self.model_config = model_config

    def get_decay(self, optimization_step: int) -> float:
        """
        Compute the decay factor for the exponential moving average.

        Returns 0.0 until `optimization_step` exceeds `update_after_step + 1`; afterwards the schedule value
        (warmup or `(1 + step) / (10 + step)`) clamped to the range `[min_decay, decay]`.
        """
        step = max(0, optimization_step - self.update_after_step - 1)

        if step <= 0:
            return 0.0

        if self.use_ema_warmup:
            cur_decay_value = 1 - (1 + step / self.inv_gamma) ** -self.power
        else:
            cur_decay_value = (1 + step) / (10 + step)

        cur_decay_value = min(cur_decay_value, self.decay)
        # make sure decay is not smaller than min_decay
        cur_decay_value = max(cur_decay_value, self.min_decay)
        return cur_decay_value

    @torch.no_grad()
    def step(self, parameters: Iterable[torch.nn.Parameter]):
        """
        Advance the step counter and update the shadow weights from `parameters`.

        Shadow weights are matched to `parameters` by position. Parameters that do not require gradients are
        copied verbatim instead of being averaged.
        """
        parameters = list(parameters)

        self.optimization_step += 1

        # Compute the decay factor for the exponential moving average.
        decay = self.get_decay(self.optimization_step)
        self.cur_decay_value = decay
        one_minus_decay = 1 - decay

        for s_param, param in zip(self.shadow_params, parameters):
            if param.requires_grad:
                # Equivalent to s = decay * s + (1 - decay) * param, done in place.
                s_param.sub_(one_minus_decay * (s_param - param))
            else:
                s_param.copy_(param)

    def to(self, device=None, dtype=None) -> None:
        r"""Move internal buffers of the ExponentialMovingAverage to `device`.

        Args:
            device: like `device` argument to `torch.Tensor.to`
            dtype: like `dtype` argument to `torch.Tensor.to`; only applied to floating-point shadow weights
        """
        # .to() on the tensors handles None correctly
        self.shadow_params = [
            p.to(device=device, dtype=dtype) if p.is_floating_point() else p.to(device=device)
            for p in self.shadow_params
        ]

    def state_dict(self) -> dict:
        r"""
        Returns the state of the ExponentialMovingAverage as a dict. This method is used by accelerate during
        checkpointing to save the ema state dict.

        The `shadow_params` entry is the live list of shadow tensors, not a copy.
        """
        return {
            "decay": self.decay,
            "min_decay": self.min_decay,
            "optimization_step": self.optimization_step,
            "update_after_step": self.update_after_step,
            "use_ema_warmup": self.use_ema_warmup,
            "inv_gamma": self.inv_gamma,
            "power": self.power,
            "shadow_params": self.shadow_params,
        }

    def load_state_dict(self, state_dict: dict) -> None:
        r"""
        Loads the ExponentialMovingAverage state. This method is used by accelerate during checkpointing to
        restore the ema state dict.

        Every key is optional: a missing key leaves the corresponding attribute unchanged.

        Args:
            state_dict (dict): EMA state. Should be an object returned
                from a call to :meth:`state_dict`.

        Raises:
            ValueError: if a loaded value is out of range or has an unexpected type.
        """
        # deepcopy, to be consistent with module API
        state_dict = copy.deepcopy(state_dict)

        self.decay = state_dict.get("decay", self.decay)
        if self.decay < 0.0 or self.decay > 1.0:
            raise ValueError("Decay must be between 0 and 1")

        self.min_decay = state_dict.get("min_decay", self.min_decay)
        if not isinstance(self.min_decay, float):
            raise ValueError("Invalid min_decay")

        self.optimization_step = state_dict.get("optimization_step", self.optimization_step)
        if not isinstance(self.optimization_step, int):
            raise ValueError("Invalid optimization_step")

        self.update_after_step = state_dict.get("update_after_step", self.update_after_step)
        if not isinstance(self.update_after_step, int):
            raise ValueError("Invalid update_after_step")

        self.use_ema_warmup = state_dict.get("use_ema_warmup", self.use_ema_warmup)
        if not isinstance(self.use_ema_warmup, bool):
            raise ValueError("Invalid use_ema_warmup")

        self.inv_gamma = state_dict.get("inv_gamma", self.inv_gamma)
        if not isinstance(self.inv_gamma, (float, int)):
            raise ValueError("Invalid inv_gamma")

        self.power = state_dict.get("power", self.power)
        if not isinstance(self.power, (float, int)):
            raise ValueError("Invalid power")

        shadow_params = state_dict.get("shadow_params", None)
        # Like every other key, `shadow_params` may be absent; keep the current
        # shadow weights in that case instead of failing inside zip().
        if shadow_params is not None:
            for model_param, ema_param in zip(self.shadow_params, shadow_params):
                # Keep the device and dtype of the existing shadow tensor.
                model_param.data = ema_param.data.to(model_param)
================================================
FILE: LatentLM/models/RMSNorm.py
================================================
import torch
import torch.nn as nn
class RMSNorm(nn.Module):
    """
    Root-mean-square normalisation over the last dimension.

    The normalisation itself is computed in float32 and cast back to the input
    dtype; a learnable per-feature scale is applied afterwards when
    ``elementwise_affine`` is true.
    """
    def __init__(self, dim: int, eps: float = 1e-6, elementwise_affine=True):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if not self.elementwise_affine:
            # Register the name so `self.weight` exists and is None.
            self.register_parameter('weight', None)
        else:
            self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        """Divide ``x`` by the root of its mean square (plus ``eps``) over the last axis."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        normed = self._norm(x.float()).type_as(x)
        if self.weight is None:
            return normed
        return normed * self.weight

    def extra_repr(self) -> str:
        return 'dim={}, eps={}, elementwise_affine={}'.format(
            self.dim, self.eps, self.elementwise_affine
        )
================================================
FILE: LatentLM/models/Transformer.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
import functools
import torch
import torch.nn as nn
import torch.nn.functional as F
from timm.models.vision_transformer import PatchEmbed
try:
    from flash_attn import flash_attn_func
    # The flash-attention path is only enabled on devices whose compute
    # capability major version is at least 8. Device properties can only be
    # queried when a CUDA device is usable, so check availability first
    # instead of letting the module import fail on CPU-only hosts.
    has_flash_attn2 = torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8
except ImportError:
    has_flash_attn2 = False
    print("flash_attn2 not found")
from .DiT import LabelEmbedder, TimestepEmbedder, FinalLayer, SwiGLU, modulate
from .kernel.rotary import apply_rotary_pos_emb as apply_rotary_emb
from .RMSNorm import RMSNorm
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every head of a (batch, heads, seq, head_dim) tensor ``n_rep`` times
    along dim 1, i.e. torch.repeat_interleave(x, dim=1, repeats=n_rep).
    The input itself is returned when ``n_rep`` is 1.
    """
    batch, kv_heads, seq_len, width = x.shape
    if n_rep != 1:
        # Insert a repeat axis after the head axis, broadcast it, then fold it
        # into the head axis.
        tiled = x.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, width)
        return tiled.reshape(batch, kv_heads * n_rep, seq_len, width)
    return x
class Attention(nn.Module):
    """
    Causal self-attention with rotary position embedding, grouped key/value
    heads and an optional key/value cache for incremental decoding.
    """
    def __init__(self, dim, num_heads=8, num_kv_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.dim = dim
        self.head_dim = dim // num_heads
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        # Number of query heads sharing one key/value head.
        self.n_rep = num_heads // num_kv_heads
        self.scale = self.head_dim ** -0.5

        qkv_width = dim + 2 * self.num_kv_heads * self.head_dim
        self.qkv = nn.Linear(dim, qkv_width, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=False)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, start_pos, rel_pos, incremental_state=None):
        B, N, C = x.shape
        total_heads = self.num_heads + 2 * self.num_kv_heads
        qkv = self.qkv(x).reshape(B, N, total_heads, self.head_dim)
        q, k, v = torch.split(qkv, [self.num_heads, self.num_kv_heads, self.num_kv_heads], dim=2)

        # Rotary embedding is applied to queries and keys only.
        q = apply_rotary_emb(q, *rel_pos, interleaved=True)
        k = apply_rotary_emb(k, *rel_pos, interleaved=True)

        if incremental_state is not None:
            # Write the new keys/values into the cache, then attend over
            # everything cached up to and including this chunk.
            end_pos = start_pos + N
            incremental_state["key"][:B, start_pos:end_pos] = k
            incremental_state["value"][:B, start_pos:end_pos] = v
            k = incremental_state["key"][:B, :end_pos]
            v = incremental_state["value"][:B, :end_pos]

        drop_p = self.attn_drop.p if self.training else 0.
        low_precision = x.dtype in (torch.float16, torch.bfloat16)
        if has_flash_attn2 and low_precision:
            x = flash_attn_func(q, k, v, causal=True, dropout_p=drop_p)
        else:
            # Fallback: PyTorch attention wants (B, heads, N, head_dim) and
            # equal head counts for q, k and v.
            q = q.transpose(1, 2)
            k = repeat_kv(k.transpose(1, 2), self.n_rep)
            v = repeat_kv(v.transpose(1, 2), self.n_rep)
            x = F.scaled_dot_product_attention(
                q, k, v,
                is_causal=incremental_state is None,
                dropout_p=drop_p,
            )
            x = x.transpose(1, 2)

        x = self.proj(x.reshape(B, N, C))
        return self.proj_drop(x)
class Block(nn.Module):
    """
    Pre-norm Transformer block: attention and SwiGLU MLP sub-layers, each
    preceded by a weight-free RMSNorm and wrapped in a residual connection.
    """
    def __init__(self, hidden_size, num_heads, num_kv_heads, mlp_ratio=4.0, proj_drop=0., attn_drop=0., **block_kwargs):
        super().__init__()
        self.norm1 = RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = Attention(
            hidden_size,
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            qkv_bias=False,
            proj_drop=proj_drop,
            attn_drop=attn_drop,
            **block_kwargs,
        )
        self.norm2 = RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        # SwiGLU width: two thirds of hidden_size * mlp_ratio, rounded down to
        # a multiple of 64.
        ffn_width = int(hidden_size * mlp_ratio * 2 / 3 / 64) * 64
        self.mlp = SwiGLU(hidden_size, ffn_width, drop=proj_drop)

    def forward(self, x, start_pos, rel_pos, incremental_state=None):
        attn_out = self.attn(self.norm1(x), start_pos, rel_pos, incremental_state)
        x = x + attn_out
        return x + self.mlp(self.norm2(x))
class MLPBlock(nn.Module):
    """
    Residual SwiGLU MLP block whose shift, scale and gate are produced from a
    conditioning vector (adaLN-style modulation).
    """
    def __init__(self, hidden_size, mlp_ratio=4.0, drop=0.0, **block_kwargs):
        super().__init__()
        self.norm = RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        # SwiGLU width: two thirds of hidden_size * mlp_ratio, rounded down to
        # a multiple of 64.
        ffn_width = int(hidden_size * mlp_ratio * 2 / 3 / 64) * 64
        self.mlp = SwiGLU(hidden_size, ffn_width, drop=drop)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 3 * hidden_size, bias=False),
        )

    def forward(self, x, c):
        shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(3, dim=-1)
        mlp_out = self.mlp(modulate(self.norm(x), shift_mlp, scale_mlp))
        return x + gate_mlp * mlp_out
class ConditionLayer(nn.Module):
    """Weight-free RMSNorm followed by a bias-free square linear projection."""
    def __init__(self, hidden_size):
        super().__init__()
        self.norm_final = RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, hidden_size, bias=False)

    def forward(self, x):
        return self.linear(self.norm_final(x))
class Transformer(nn.Module):
def __init__(
self,
input_size=32,
patch_size=1,
flatten_input=False,
in_channels=4,
hidden_size=1152,
depth=28,
diffusion_depth=3,
num_heads=16,
num_kv_heads=None,
mlp_ratio=4.0,
class_dropout_prob=0.1,
num_classes=1000,
posi_scale=1,
drop=0.0,
norm_layer=None
):
super().__init__()
self.in_channels = in_channels
self.out_channels = in_channels
self.input_size = input_size
self.patch_size = patch_size
self.num_heads = num_heads
self.num_kv_heads = num_kv_heads if num_kv_heads is not None else num_heads
self.head_dim = hidden_size // num_heads
self.hidden_size = hidden_size
self.flatten_input_size = input_size * input_size
self.flatten_input = flatten_input
self.posi_scale = posi_scale
self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, strict_img_size=False, norm_layer=norm_layer) if not flatten_input else nn.Linear(in_channels, hidden_size, bias=False)
self.noisy_x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, strict_img_size=False, norm_layer=norm_layer) if not flatten_input else nn.Linear(in_channels, hidden_size, bias=False)
self.t_embedder = TimestepEmbedder(hidden_size)
self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
self._precomputed_freqs_cis = None
self.blocks = nn.ModuleList([
Block(hidden_size, self.num_heads, self.num_kv_heads, mlp_ratio=mlp_ratio, proj_drop=drop, attn_drop=drop) for _ in range(depth)
])
self.diffusion_blocks = nn.ModuleList([
MLPBlock(hidden_size, mlp_ratio=mlp_ratio) for _ in range(diffusion_depth)
])
self.condition_layer = ConditionLayer(hidden_size)
self.final_layer = FinalLayer(hidden_size, patch_size * patch_size * self.out_channels if not flatten_input else self.out_channels)
self.initialize_weights()
def initialize_weights(self):
# Initialize transformer layers:
def _basic_init(module):
if isinstance(module, nn.Linear):
if module.bias is not None:
nn.init.constant_(module.bias, 0)
self.apply(_basic_init)
# Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
if not self.flatten_input:
nn.init.constant_(self.x_embedder.proj.bias, 0)
# Initialize label embedding table, timestep embedding MLP, and CLS:
nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)
nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
# Zero-out adaLN modulation layers in DiT blocks:
for block in self.diffusion_blocks:
nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
# Zero-out output layers:
nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
nn.init.constant_(self.final_layer.linear.weight, 0)
@property
def device(self):
return next(self.parameters()).device
@property
def dtype(self):
return next(self.parameters()).dtype
def unpatchify(self, x):
"""
x: (N, T, patch_size**2 * C)
imgs: (N, H, W, C)
"""
c = self.out_channels
p = self.x_embedder.patch_size[0]
h = w = int(x.shape[1] ** 0.5)
assert h * w == x.shape[1]
x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
x = torch.einsum('nhwpqc->nchpwq', x)
imgs = x.reshape(shape=(x.shape[0], c, h * p, h * p))
return imgs
def build_rel_pos(self, x, start_pos = 0):
if self._precomputed_freqs_cis is None:
angle = 1.0 / ((10000 * self.posi_scale) ** torch.linspace(0, 1, self.head_dim // 2, dtype=torch.float, device=x.device))
index = torch.arange(self.flatten_input_size).to(angle)
self._precomputed_freqs_cis = index[:, None] * angle
cos = torch.cos(self._precomputed_freqs_cis[start_pos:start_pos+x.size(1)])
sin = torch.sin(self._precomputed_freqs_cis[start_pos:start_pos+x.size(1)])
rel_pos = (cos.to(x.dtype), sin.to(x.dtype))
return rel_pos
def forward(self, x_noise, t, x_start, y, batch_mul=1):
"""
Forward pass of ransformer.
x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
t: (N,) tensor of diffusion timesteps
y: (N,) tensor of class labels
"""
condition = self.forward_parallel(x_start, y)
condition = condition.repeat_interleave(batch_mul, dim=0)
x = self.forward_diffusion(x_noise, t, condition)
return x
def forward_parallel(self, x, y):
x = self.x_embedder(x)
y = self.y_embedder(y, self.training)
x = torch.cat((y.unsqueeze(1), x[:, :-1]), dim=1)
rel_pos = self.build_rel_pos(x)
for block in self.blocks:
x = block(x, 0, rel_pos)
x = self.condition_layer(x)
return x
def forward_recurrent(self, x, start_pos = 0, incremental_state = None):
start_pos = start_pos if start_pos != 0 else 0
x = self.y_embedder(x, self.training).unsqueeze(1) if start_pos == 0 else self.x_embedder(x)
rel_pos = self.build_rel_pos(x, start_pos)
for idx, block in enumerate(self.blocks):
if incremental_state is not None and idx not in incremental_state:
incremental_state[idx] = {
"key": torch.empty(x.shape[0], self.flatten_input_size, self.num_kv_heads, self.head_dim, device=x.device, dtype=x.dtype),
"value": torch.empty(x.shape[0], self.flatten_input_size, self.num_kv_heads, self.head_dim, device=x.device, dtype=x.dtype),
}
x = block(x, start_pos, rel_pos, incremental_state[idx])
x = self.condition_layer(x[:, -1:])
return x
def forward_diffusion(self, x, t, condition):
bsz, seq_len = t.shape if t.dim() > 1 else (t.shape[0], 1)
t = self.t_embedder(t.view(-1)).view(bsz, seq_len, -1)
c = condition + t
x = self.noisy_x_embedder(x)
for block in self.diffusion_blocks:
x = block(x, c)
x = self.final_layer(x, c)
if not self.flatten_input:
x = self.unpatchify(x) # (N, out_channels, H, W)
return x
def sample_with_cfg(self, prev_token, cfg_scale, sample_func):
bsz, half_bsz = prev_token.shape[0], prev_token.shape[0] // 2
incremental_state = {}
samples = []
for i in range(self.flatten_input_size):
if self.flatten_input:
z = torch.randn(bsz, 1, self.in_channels, device=self.device, dtype=self.dtype)
else:
p = self.noisy_x_embedder.patch_size[0]
h = w = self.input_size // p
z = torch.randn(bsz, self.in_channels, p, p, device=self.device, dtype=self.dtype)
recurrent_input = torch.cat([prev_token, prev_token], dim=0) if i != 0 else prev_token
condition = self.forward_recurrent(recurrent_input, start_pos = i, incremental_state=incremental_state)
prev_token = sample_func(functools.partial(self.forward_with_cfg, condition=condition, cfg_scale=cfg_scale), z)
prev_token, _ = prev_token.chunk(2, dim=0) # Remove null class samples
samples.append(prev_token)
if self.flatten_input:
samples = torch.cat(samples, 1)
else:
samples = torch.stack(samples, 2).view(half_bsz, self.in_channels, h, w, p, p).permute(0, 1, 2, 4, 3, 5).reshape(half_bsz, self.in_channels, h * p, w * p)
return samples
def forward_with_cfg(self, x, t, condition, cfg_scale):
    """
    Diffusion forward pass with classifier-free guidance.

    Only the first half of `x` is used; it is duplicated so the conditional
    and unconditional predictions are computed on identical noisy inputs in a
    single batched call to `forward_diffusion`. The guided prediction is
    returned twice (concatenated along dim 0) so the output batch size matches
    the input batch size.
    """
    # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
    n_half = len(x) // 2
    shared = x[:n_half]
    eps = self.forward_diffusion(torch.cat([shared, shared], dim=0), t, condition)
    cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
    guided = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
    return torch.cat([guided, guided], dim=0)
#################################################################################
# Transformer Configs #
#################################################################################
def Transformer_13B(**kwargs):
    """Build a Transformer with depth 40, hidden size 5120, 40 heads, MLP ratio 6."""
    return Transformer(
        depth=40,
        hidden_size=5120,
        num_heads=40,
        mlp_ratio=6,
        **kwargs,
    )


def Transformer_7B(**kwargs):
    """Build a Transformer with depth 32, hidden size 4096, 32 heads, MLP ratio 6."""
    return Transformer(
        depth=32,
        hidden_size=4096,
        num_heads=32,
        mlp_ratio=6,
        **kwargs,
    )


def Transformer_3B(**kwargs):
    """Build a Transformer with depth 32, hidden size 2560, 20 heads, MLP ratio 6."""
    return Transformer(
        depth=32,
        hidden_size=2560,
        num_heads=20,
        mlp_ratio=6,
        **kwargs,
    )


def Transformer_XL(**kwargs):
    """Build a Transformer with depth 24, hidden size 2048, 16 heads, MLP ratio 6."""
    return Transformer(
        depth=24,
        hidden_size=2048,
        num_heads=16,
        mlp_ratio=6,
        **kwargs,
    )


def Transformer_Large(**kwargs):
    """Build a Transformer with depth 24, hidden size 1536, 12 heads, MLP ratio 6."""
    return Transformer(
        depth=24,
        hidden_size=1536,
        num_heads=12,
        mlp_ratio=6,
        **kwargs,
    )


def Transformer_Medium(**kwargs):
    """Build a Transformer with depth 24, hidden size 1024, 16 heads, MLP ratio 6."""
    return Transformer(
        depth=24,
        hidden_size=1024,
        num_heads=16,
        mlp_ratio=6,
        **kwargs,
    )


def Transformer_Base(**kwargs):
    """Build a Transformer with depth 12, hidden size 768, 12 heads, MLP ratio 6."""
    return Transformer(
        depth=12,
        hidden_size=768,
        num_heads=12,
        mlp_ratio=6,
        **kwargs,
    )


def Transformer_H(**kwargs):
    """Build a Transformer with depth 40, hidden size 1280, 20 heads, MLP ratio 4, diffusion depth 12."""
    return Transformer(
        depth=40,
        hidden_size=1280,
        num_heads=20,
        mlp_ratio=4,
        diffusion_depth=12,
        **kwargs,
    )


def Transformer_L(**kwargs):
    """Build a Transformer with depth 32, hidden size 1024, 16 heads, MLP ratio 4, diffusion depth 8."""
    return Transformer(
        depth=32,
        hidden_size=1024,
        num_heads=16,
        mlp_ratio=4,
        diffusion_depth=8,
        **kwargs,
    )


def Transformer_B(**kwargs):
    """Build a Transformer with depth 24, hidden size 768, 12 heads, MLP ratio 4, diffusion depth 6."""
    return Transformer(
        depth=24,
        hidden_size=768,
        num_heads=12,
        mlp_ratio=4,
        diffusion_depth=6,
        **kwargs,
    )


# Registry mapping config names to the factory functions above.
Transformer_models = {
    'Transformer-13B': Transformer_13B,
    'Transformer-7B': Transformer_7B,
    'Transformer-3B': Transformer_3B,
    'Transformer-XL': Transformer_XL,
    'Transformer-Large': Transformer_Large,
    'Transformer-Medium': Transformer_Medium,
    'Transformer-Base': Transformer_Base,
    'Transformer-H': Transformer_H,
    'Transformer-L': Transformer_L,
    'Transformer-B': Transformer_B,
}
================================================
FILE: LatentLM/models/__init__.py
================================================
from .DiT import DiT_models, DiT
from .Transformer import Transformer_models, Transformer
from .EMA import EMAModel
# Combined name -> factory registry. Transformer_models is unpacked last, so
# its entries take precedence if a key also exists in DiT_models.
All_models = {**DiT_models, **Transformer_models}
================================================
FILE: LatentLM/models/kernel/rotary.py
================================================
# Copyright (c) 2023, Tri Dao.
from typing import Optional, Union
import torch
import triton
import triton.language as tl
# @triton.autotune(
# configs=[
# triton.Config({"BLOCK_M": 2}),
# triton.Config({"BLOCK_M": 4}),
# triton.Config({"BLOCK_M": 8}),
# triton.Config({"BLOCK_M": 16}),
# ],
# key=["CACHE_KEY_SEQLEN", "BLOCK_K", "INTERLEAVED"],
# )
@triton.jit
def rotary_kernel(
    OUT,  # Pointers to matrices
    X,
    COS,
    SIN,
    CU_SEQLENS,
    SEQLEN_OFFSETS,  # this could be int or a pointer
    # Matrix dimensions
    seqlen,
    nheads,
    rotary_dim,
    seqlen_ro,
    CACHE_KEY_SEQLEN,
    # strides
    stride_out_batch,
    stride_out_seqlen,
    stride_out_nheads,
    stride_out_headdim,
    stride_x_batch,
    stride_x_seqlen,
    stride_x_nheads,
    stride_x_headdim,
    # Meta-parameters
    BLOCK_K: tl.constexpr,
    IS_SEQLEN_OFFSETS_TENSOR: tl.constexpr,
    IS_VARLEN: tl.constexpr,
    INTERLEAVED: tl.constexpr,
    CONJUGATE: tl.constexpr,
    BLOCK_M: tl.constexpr,
):
    # Applies the rotary embedding to the first `rotary_dim` features of X and
    # writes the result to OUT. Each program handles BLOCK_M sequence rows of
    # one (batch, head) pair: grid axis 0 indexes the row block, axis 1 the
    # batch entry and axis 2 the head.
    pid_m = tl.program_id(axis=0)
    pid_batch = tl.program_id(axis=1)
    pid_head = tl.program_id(axis=2)
    rotary_dim_half = rotary_dim // 2

    if not IS_VARLEN:
        # Fixed-length layout: advance to this program's batch entry and head.
        X = X + pid_batch * stride_x_batch + pid_head * stride_x_nheads
        OUT = OUT + pid_batch * stride_out_batch + pid_head * stride_out_nheads
    else:
        # Variable-length layout: CU_SEQLENS holds cumulative sequence starts, so
        # this sequence begins at row start_idx and its length is the difference
        # to the next entry (overriding the `seqlen` argument).
        start_idx = tl.load(CU_SEQLENS + pid_batch)
        seqlen = tl.load(CU_SEQLENS + pid_batch + 1) - start_idx
        X = X + start_idx * stride_x_seqlen + pid_head * stride_x_nheads
        OUT = OUT + start_idx * stride_out_seqlen + pid_head * stride_out_nheads

    # Nothing to do if this row block lies entirely past the end of the sequence.
    if pid_m * BLOCK_M >= seqlen:
        return
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    # rm_cs are the row indices into COS/SIN: the sequence rows shifted by the
    # (scalar or per-batch) offset.
    if not IS_SEQLEN_OFFSETS_TENSOR:
        rm_cs = rm + SEQLEN_OFFSETS
    else:
        rm_cs = rm + tl.load(SEQLEN_OFFSETS + pid_batch)
    rk = tl.arange(0, BLOCK_K)
    rk_half = tl.arange(0, BLOCK_K // 2)

    if not INTERLEAVED:
        # Load the 1st and 2nd halves of X, do calculation, then store to 1st and 2nd halves of OUT
        X = X + (rm[:, None] * stride_x_seqlen + rk_half[None, :] * stride_x_headdim)
        # COS/SIN are addressed with a row stride of rotary_dim_half, i.e. as
        # contiguous (seqlen_ro, rotary_dim / 2) tables.
        COS = COS + (rm_cs[:, None] * rotary_dim_half + rk_half[None, :])
        SIN = SIN + (rm_cs[:, None] * rotary_dim_half + rk_half[None, :])
        # Masked-out positions get cos=1 / sin=0, i.e. the identity rotation.
        cos = tl.load(
            COS, mask=(rm_cs[:, None] < seqlen_ro) & (rk_half[None, :] < rotary_dim_half), other=1.0
        ).to(tl.float32)
        sin = tl.load(
            SIN, mask=(rm_cs[:, None] < seqlen_ro) & (rk_half[None, :] < rotary_dim_half), other=0.0
        ).to(tl.float32)
        x0 = tl.load(
            X, mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half), other=0.0
        ).to(tl.float32)
        x1 = tl.load(
            X + rotary_dim_half * stride_x_headdim,
            mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half),
            other=0.0,
        ).to(tl.float32)
        # Negating sin applies the rotation by the opposite angle.
        if CONJUGATE:
            sin = -sin
        o0 = x0 * cos - x1 * sin
        o1 = x0 * sin + x1 * cos
        # write back result
        OUT = OUT + (rm[:, None] * stride_out_seqlen + rk_half[None, :] * stride_out_headdim)
        tl.store(OUT, o0, mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half))
        tl.store(
            OUT + rotary_dim_half * stride_out_headdim,
            o1,
            mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half),
        )
    else:
        # We don't want to load X[0, 2, 4, ...] and X[1, 3, 5, ...] separately since both are slow.
        # Instead, we load x0 = X[0, 1, 2, 3, ...] and x1 = X[1, 0, 3, 2, ...].
        # Loading x0 will be fast but x1 will be slow.
        # Then we load cos = COS[0, 0, 1, 1, ...] and sin = SIN[0, 0, 1, 1, ...].
        # Then we do the calculation and use tl.where to pick out the right outputs for the even
        # and for the odd indices.
        rk_swap = rk + ((rk + 1) % 2) * 2 - 1  # 1, 0, 3, 2, 5, 4, ...
        rk_repeat = tl.arange(0, BLOCK_K) // 2
        X0 = X + (rm[:, None] * stride_x_seqlen + rk[None, :] * stride_x_headdim)
        X1 = X + (rm[:, None] * stride_x_seqlen + rk_swap[None, :] * stride_x_headdim)
        COS = COS + (rm_cs[:, None] * rotary_dim_half + rk_repeat[None, :])
        SIN = SIN + (rm_cs[:, None] * rotary_dim_half + rk_repeat[None, :])
        # Same identity defaults (cos=1, sin=0) for masked-out positions as above.
        cos = tl.load(
            COS,
            mask=(rm_cs[:, None] < seqlen_ro) & (rk_repeat[None, :] < rotary_dim_half),
            other=1.0,
        ).to(tl.float32)
        sin = tl.load(
            SIN,
            mask=(rm_cs[:, None] < seqlen_ro) & (rk_repeat[None, :] < rotary_dim_half),
            other=0.0,
        ).to(tl.float32)
        x0 = tl.load(X0, mask=(rm[:, None] < seqlen) & (rk[None, :] < rotary_dim), other=0.0).to(
            tl.float32
        )
        x1 = tl.load(
            X1, mask=(rm[:, None] < seqlen) & (rk_swap[None, :] < rotary_dim), other=0.0
        ).to(tl.float32)
        if CONJUGATE:
            sin = -sin
        x0_cos = x0 * cos
        x1_sin = x1 * sin
        # Even feature indices take x0*cos - x1*sin, odd ones x0*cos + x1*sin.
        out = tl.where(rk[None, :] % 2 == 0, x0_cos - x1_sin, x0_cos + x1_sin)
        OUT = OUT + (rm[:, None] * stride_out_seqlen + rk[None, :] * stride_out_headdim)
        tl.store(OUT, out, mask=(rm[:, None] < seqlen) & (rk[None, :] < rotary_dim))
def apply_rotary(
    x: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    seqlen_offsets: Union[int, torch.Tensor] = 0,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[int] = None,
    interleaved=False,
    inplace=False,
    conjugate=False,
) -> torch.Tensor:
    """
    Validate the inputs and launch `rotary_kernel` on them.

    Arguments:
        x: (batch, seqlen, nheads, headdim) if cu_seqlens is None
            else (total_seqlen, nheads, headdim).
        cos: (seqlen_ro, rotary_dim / 2)
        sin: (seqlen_ro, rotary_dim / 2)
        seqlen_offsets: integer or integer tensor of size (batch,)
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int, required when cu_seqlens is given
        interleaved: forwarded to the kernel's INTERLEAVED switch
        inplace: if True, the result is written into x and x is returned
        conjugate: forwarded to the kernel's CONJUGATE switch (negates sin)
    Returns:
        y: tensor with the same shape as x. Features beyond rotary_dim are
            copied from x unchanged.
    """
    is_varlen = cu_seqlens is not None
    if is_varlen:
        assert max_seqlen is not None, "If cu_seqlens is passed in, then max_seqlen must be passed"
        _, nheads, headdim = x.shape
        batch = cu_seqlens.shape[0] - 1
        seqlen = max_seqlen
    else:
        batch, seqlen, nheads, headdim = x.shape

    seqlen_ro, half_rotary_dim = cos.shape
    assert sin.shape == cos.shape
    rotary_dim = half_rotary_dim * 2
    assert rotary_dim <= headdim, "rotary_dim must be <= headdim"
    assert headdim <= 256, "Only support headdim <= 256"
    assert seqlen_ro >= seqlen, "seqlen_ro must be >= seqlen"
    assert (
        cos.dtype == sin.dtype
    ), f"cos and sin must have the same dtype, got {cos.dtype} and {sin.dtype}"
    assert (
        x.dtype == cos.dtype
    ), f"Input and cos/sin must have the same dtype, got {x.dtype} and {cos.dtype}"

    # The kernel indexes cos/sin as dense row-major tables.
    cos = cos.contiguous()
    sin = sin.contiguous()
    offsets_is_tensor = isinstance(seqlen_offsets, torch.Tensor)
    if offsets_is_tensor:
        assert seqlen_offsets.shape == (batch,)
        assert seqlen_offsets.dtype in [torch.int32, torch.int64]
        seqlen_offsets = seqlen_offsets.contiguous()
    else:
        assert seqlen_offsets + seqlen <= seqlen_ro

    output = x if inplace else torch.empty_like(x)
    # The kernel only writes the rotated features; carry over the rest.
    if rotary_dim < headdim and not inplace:
        output[..., rotary_dim:].copy_(x[..., rotary_dim:])

    # Smallest supported block width that covers rotary_dim.
    if rotary_dim <= 32:
        BLOCK_K = 32
    elif rotary_dim <= 64:
        BLOCK_K = 64
    elif rotary_dim <= 128:
        BLOCK_K = 128
    else:
        BLOCK_K = 256

    def grid(META):
        return (triton.cdiv(seqlen, META["BLOCK_M"]), batch, nheads)

    BLOCK_M = 8 if (not interleaved and rotary_dim <= 64) else 4

    # batch strides are 0 in the varlen layout (there is no batch dimension)
    out_batch_stride = 0 if is_varlen else output.stride(0)
    x_batch_stride = 0 if is_varlen else x.stride(0)

    # Need this, otherwise Triton tries to launch from cuda:0 and we get
    # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
    with torch.cuda.device(x.device.index):
        rotary_kernel[grid](
            # data ptrs
            output,
            x,
            cos,
            sin,
            cu_seqlens,
            seqlen_offsets,
            # shapes
            seqlen,
            nheads,
            rotary_dim,
            seqlen_ro,
            seqlen // 128,  # key for triton cache (limit number of compilations)
            # output strides: batch, seqlen (or total_seqlen), nheads, headdim
            out_batch_stride,
            output.stride(-3),
            output.stride(-2),
            output.stride(-1),
            # input strides: batch, seqlen (or total_seqlen), nheads, headdim
            x_batch_stride,
            x.stride(-3),
            x.stride(-2),
            x.stride(-1),
            # meta-parameters
            BLOCK_K,
            offsets_is_tensor,
            is_varlen,
            interleaved,
            conjugate,
            BLOCK_M,
        )
    return output
class ApplyRotaryEmb(torch.autograd.Function):
    """Autograd wrapper around `apply_rotary`; the backward pass applies the conjugate rotation."""

    @staticmethod
    def forward(
        ctx,
        x,
        cos,
        sin,
        interleaved=False,
        inplace=False,
        seqlen_offsets: Union[int, torch.Tensor] = 0,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
    ):
        out = apply_rotary(
            x,
            cos,
            sin,
            seqlen_offsets=seqlen_offsets,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            interleaved=interleaved,
            inplace=inplace,
        )
        # Can't save int with save_for_backward: an int offset is stashed on
        # ctx directly, a tensor offset goes through save_for_backward and
        # ctx.seqlen_offsets is left as None to signal that.
        offsets_is_int = isinstance(seqlen_offsets, int)
        if offsets_is_int:
            to_save = (cos, sin, cu_seqlens)
        else:
            to_save = (cos, sin, cu_seqlens, seqlen_offsets)
        ctx.save_for_backward(*to_save)
        ctx.seqlen_offsets = seqlen_offsets if offsets_is_int else None
        ctx.interleaved = interleaved
        ctx.inplace = inplace
        ctx.max_seqlen = max_seqlen
        return x if inplace else out

    @staticmethod
    def backward(ctx, do):
        if ctx.seqlen_offsets is None:
            cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
        else:
            cos, sin, cu_seqlens = ctx.saved_tensors
            seqlen_offsets = ctx.seqlen_offsets
        # TD [2023-09-02]: For some reason Triton (2.0.0.post1) errors with
        # "[CUDA]: invalid device context", and cloning makes it work. Idk why. Triton 2.1.0 works.
        if not (ctx.interleaved or ctx.inplace):
            do = do.clone()
        dx = apply_rotary(
            do,
            cos,
            sin,
            seqlen_offsets=seqlen_offsets,
            cu_seqlens=cu_seqlens,
            max_seqlen=ctx.max_seqlen,
            interleaved=ctx.interleaved,
            inplace=ctx.inplace,
            conjugate=True,
        )
        # One gradient slot per forward input; only x receives a gradient.
        return (dx,) + (None,) * 7
def apply_rotary_emb(
    x,
    cos,
    sin,
    interleaved=False,
    inplace=False,
    seqlen_offsets: Union[int, torch.Tensor] = 0,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[int] = None,
):
    """
    Differentiable rotary embedding applied to the first rotary_dim features of x.

    Arguments:
        x: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
            else (total_seqlen, nheads, headdim)
        cos, sin: (seqlen_rotary, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        inplace: if True, apply rotary embedding in-place.
        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Return:
        out: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
            else (total_seqlen, nheads, headdim)
    rotary_dim must be <= headdim
    """
    # autograd.Function.apply is called positionally, in forward()'s order.
    forward_args = (x, cos, sin, interleaved, inplace, seqlen_offsets, cu_seqlens, max_seqlen)
    return ApplyRotaryEmb.apply(*forward_args)
def rotate_every_two(x):
    """
    Map each adjacent feature pair (a, b) along the last axis of a 4-D tensor
    to (-b, a), keeping the input shape.
    """
    even = x[:, :, :, ::2]
    odd = x[:, :, :, 1::2]
    paired = torch.stack((-odd, even), dim=-1)
    return paired.flatten(-2)
def apply_rotary_pos_emb(x, cos, sin, interleaved=False):
    """
    Pure-PyTorch rotary embedding: x * cos + rotate_every_two(x) * sin.

    Each entry of `cos` / `sin` is repeated twice along the last axis and a
    singleton axis is inserted at dim 1 before the multiplication.
    `interleaved` is accepted but not used by this implementation.
    """
    def _expand(table):
        return torch.repeat_interleave(table, 2, dim=-1).unsqueeze(1)

    cos_full = _expand(cos)
    sin_full = _expand(sin)
    return (x * cos_full) + (rotate_every_two(x) * sin_full)
================================================
FILE: LatentLM/models/kernel/swiglu.py
================================================
import torch
# CUDA source for the jiterator kernels. The jiterator requires a template
# function, so the `<typename T>` parameter list must be present; without it
# the code string cannot be parsed when the jitted function is created.

# Forward: x * sigmoid(x) * y, computed in float.
swiglu_fwd_codestring = """
template <typename T> T swiglu_fwd(T x, T y) {
    return float(x) * float(y) / (1.0f + ::exp(-float(x)));
}
"""

# Backward: given the upstream gradient g, writes the gradients w.r.t. x and y
# into the by-reference outputs dx and dy.
swiglu_bwd_codestring = """
template <typename T> T swiglu_bwd(T x, T y, T g, T& dx, T& dy) {
    float x_sigmoid = 1.0f / (1.0f + ::exp(-float(x)));
    dx = x_sigmoid * (1 + float(x) * (1.0f - x_sigmoid)) * float(g) * float(y);
    dy = float(x) * x_sigmoid * float(g);
}
"""

# Elementwise jitted functions; compilation happens lazily on first call.
swiglu_fwd = torch.cuda.jiterator._create_jit_fn(swiglu_fwd_codestring)
swiglu_bwd = torch.cuda.jiterator._create_multi_output_jit_fn(swiglu_bwd_codestring, num_outputs=2)
class SwiGLUFunction(torch.autograd.Function):
    """Autograd function pairing the jitted SwiGLU forward and backward kernels."""

    @staticmethod
    def forward(ctx, x, y):
        # Both inputs are needed again to compute the gradients.
        ctx.save_for_backward(x, y)
        result = swiglu_fwd(x, y)
        return result

    @staticmethod
    def backward(ctx, dout):
        saved_x, saved_y = ctx.saved_tensors
        # swiglu_bwd yields the two gradients (for x and for y).
        grads = swiglu_bwd(saved_x, saved_y, dout)
        return grads


# Functional entry point: swiglu(x, y).
swiglu = SwiGLUFunction.apply
================================================
FILE: LatentLM/sample_hf.py
================================================
import argparse
import os
from tqdm import tqdm
import torch
import torch.nn.functional as F
from torchvision.utils import save_image
from accelerate.utils import set_seed
from safetensors.torch import load_file
from tokenizer_models import AutoencoderKL, load_vae
from schedule.dpm_solver import DPMSolverMultistepScheduler
from models import All_models
def parse_args():
    """
    Build the command-line parser for the sampling script and parse `sys.argv`.

    Returns:
        argparse.Namespace with the parsed options.
    """
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="A seed to use for the random number generator. Can be negative to not set a seed.",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="Transformer-L",
        help="The config of the UNet model to train, leave as None to use standard DDPM configuration.",
    )
    parser.add_argument(
        "--vae",
        type=str,
        default=None,
    )
    parser.add_argument(
        "--train_data_dir",
        type=str,
        default="/tmp/ILSVRC/Data/CLS-LOC/train",
        help=(
            "A folder containing the training data. Folder contents must follow the structure described in"
            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
        ),
    )
    parser.add_argument(
        "--image_size",
        type=int,
        default=256,
        help=(
            "The image_size for input images, all the images in the train/validation dataset will be resized to this"
            " image_size"
        ),
    )
    parser.add_argument("--num-classes", type=int, default=1000)
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default="no",
        choices=["no", "fp16", "bf16"],
        # Adjacent string literals are concatenated verbatim, so each
        # continuation line must carry its own leading space.
        help=(
            "Whether to use mixed precision. Choose"
            " between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10"
            " and an Nvidia Ampere GPU."
        ),
    )
    parser.add_argument(
        "--prediction_type",
        type=str,
        default="epsilon",
        help="Whether the model should predict the 'epsilon'/noise error or directly the reconstructed image 'x0'.",
    )
    parser.add_argument("--use_ema", action="store_true", help="Whether to use Exponential Moving Average for the final model weights.")
    parser.add_argument("--ddpm_num_steps", type=int, default=1000)
    parser.add_argument("--ddpm_num_inference_steps", type=int, default=250)
    parser.add_argument("--ddpm_beta_schedule", type=str, default="cosine", help="The beta schedule to use for DDPM.")
    parser.add_argument("--cfg-scale", type=float, default=4.0)
    parser.add_argument(
        "--checkpoint",
        type=str,
        default=None,
        help=(
            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
        ),
    )
    parser.add_argument("--image_name", type=str, default="sample.png")
    args = parser.parse_args()
    return args
@torch.no_grad()
def main(args):
    """
    Load a trained model and VAE, sample one image per entry of a fixed list
    of class labels with classifier-free guidance, and save them as a grid
    under `visuals/`.

    Args:
        args: namespace produced by `parse_args`.

    Raises:
        ValueError: if `--checkpoint` is missing, or no model weights are
            found inside the checkpoint directory.
    """
    # Fail fast: the latent scaling/bias factors needed for decoding are only
    # available from the checkpoint, so without one the run would otherwise
    # crash with a NameError after the (expensive) sampling loop.
    if not args.checkpoint:
        raise ValueError(
            "--checkpoint is required: the scaling and bias factors used to decode samples "
            "are read from <checkpoint>/other_state.pth."
        )

    set_seed(args.seed)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if args.mixed_precision == "bf16":
        dtype = torch.bfloat16
    elif args.mixed_precision == "fp16":
        dtype = torch.float16
    else:
        dtype = torch.float32

    # Create model:
    vae, input_size, latent_size, flatten_input = load_vae(args.vae, args.image_size)
    model = All_models[args.model](
        input_size=input_size,
        in_channels=latent_size,
        num_classes=args.num_classes,
        flatten_input=flatten_input,
    ).to(device).to(dtype)

    # Initialize the scheduler
    noise_scheduler = DPMSolverMultistepScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule, prediction_type=args.prediction_type)
    model.eval()
    vae.eval()

    # Load the weights and states from the checkpoint
    other_state = torch.load(os.path.join(args.checkpoint, "other_state.pth"))
    scaling_factor = other_state["scaling_factor"]
    bias_factor = other_state["bias_factor"]
    print(f"Scaling factor: {scaling_factor}, Bias factor: {bias_factor}")
    if args.use_ema and other_state["ema"] is not None:
        # EMA shadow parameters are stored as a flat list in the same order
        # as model.parameters().
        checkpoint = other_state["ema"]["shadow_params"]
        for model_param, ema_param in zip(model.parameters(), checkpoint):
            model_param.data = ema_param.data.to(device).to(dtype)
        print(f"Loaded model from checkpoint {args.checkpoint}, EMA applied.")
    else:
        if os.path.exists(os.path.join(args.checkpoint, "model.safetensors")):
            checkpoint = load_file(os.path.join(args.checkpoint, "model.safetensors"))
        elif os.path.exists(os.path.join(args.checkpoint, "pytorch_model")):
            checkpoint = torch.load(os.path.join(args.checkpoint, "pytorch_model", "mp_rank_00_model_states.pt"))["module"]
        else:
            raise ValueError(f"Could not find model checkpoint in {args.checkpoint}.")
        model.load_state_dict(checkpoint)
        print(f"Loaded model from checkpoint {args.checkpoint}.")

    # Labels to condition the model with (feel free to change):
    class_labels = [281, 282, 283, 284, 285, 4, 7, 963]
    # class_labels = [207, 360, 387, 974, 88, 979, 417, 279]

    def p_sample(model, image):
        # Full reverse-diffusion loop for one token, driven by the scheduler.
        noise_scheduler.set_timesteps(args.ddpm_num_inference_steps)
        for t in noise_scheduler.timesteps:
            model_output = model(image, t.repeat(image.shape[0]).to(image))
            image = noise_scheduler.step(model_output, t, image).prev_sample
        return image

    # Create sampling noise:
    n = len(class_labels)
    y = torch.tensor(class_labels, device=device)
    # Setup classifier-free guidance:
    # NOTE(review): the null-class id is hard-coded to 1000, which presumably
    # should track --num-classes; confirm against the label embedder.
    y_null = torch.tensor([1000] * n, device=device)
    y = torch.cat([y, y_null], 0)

    # Sample images:
    samples = model.sample_with_cfg(y, args.cfg_scale, p_sample)
    images = vae.decode(samples / scaling_factor - bias_factor)

    # Save and display images:
    output_path = f"visuals/{args.image_name}"
    # save_image does not create missing directories; make sure the target exists.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    save_image(images, output_path, nrow=4, normalize=True, value_range=(-1, 1))
    print(f"Saved image to visuals/{args.image_name}")
# Script entry point: parse the command-line options, then run sampling.
if __name__ == "__main__":
    args = parse_args()
    main(args)
================================================
FILE: LatentLM/sample_many.py
================================================
import argparse
import os
from tqdm import tqdm
import random
import torch
import torch.nn.functional as F
from torchvision.utils import save_image
from accelerate.utils import set_seed
from timm.models import create_model
from safetensors.torch import load_file
from tokenizer_models import AutoencoderKL, load_vae
from schedule.dpm_solver import DPMSolverMultistepScheduler
from models import All_models
# Hand-picked list of class indices (presumably ImageNet-1k label ids; verify).
# NOTE(review): not referenced by the code visible in this script.
imagenet_indices = [
    1, 10, 84, 94, 97, 98, 100, 104, 107, 117, 151, 157, 161, 178, 182, 183,
    268, 322, 337, 354, 366, 380, 973, 975, 978, 980, 981, 983, 985, 986, 991,
    995, 996, 998, 999, 409, 453, 483, 497, 555, 648, 651, 690, 700, 701, 714,
    759, 762, 765, 780, 859, 861, 928, 929, 963
]
def parse_args():
    """
    Build the command-line parser for the batch sampling script and parse
    `sys.argv`.

    Returns:
        argparse.Namespace with the parsed options.
    """
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="A seed to use for the random number generator. Can be negative to not set a seed.",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="Transformer-L",
        help="The config of the UNet model to train, leave as None to use standard DDPM configuration.",
    )
    parser.add_argument(
        "--vae",
        type=str,
        default=None,
    )
    parser.add_argument(
        "--train_data_dir",
        type=str,
        default="/tmp/ILSVRC/Data/CLS-LOC/train",
        help=(
            "A folder containing the training data. Folder contents must follow the structure described in"
            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
        ),
    )
    parser.add_argument(
        "--image_size",
        type=int,
        default=256,
        help=(
            "The image_size for input images, all the images in the train/validation dataset will be resized to this"
            " image_size"
        ),
    )
    parser.add_argument("--num-classes", type=int, default=1000)
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default="no",
        choices=["no", "fp16", "bf16"],
        # Adjacent string literals are concatenated verbatim, so each
        # continuation line must carry its own leading space.
        help=(
            "Whether to use mixed precision. Choose"
            " between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10"
            " and an Nvidia Ampere GPU."
        ),
    )
    parser.add_argument(
        "--prediction_type",
        type=str,
        default="epsilon",
        help="Whether the model should predict the 'epsilon'/noise error or directly the reconstructed image 'x0'.",
    )
    parser.add_argument("--use_ema", action="store_true", help="Whether to use Exponential Moving Average for the final model weights.")
    parser.add_argument("--ddpm_num_steps", type=int, default=1000)
    parser.add_argument("--ddpm_num_inference_steps", type=int, default=250)
    parser.add_argument("--ddpm_beta_schedule", type=str, default="cosine", help="The beta schedule to use for DDPM.")
    parser.add_argument("--cfg-scale", type=float, default=4.0)
    parser.add_argument(
        "--checkpoint",
        type=str,
        default=None,
        help=(
            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
        ),
    )
    parser.add_argument(
        "--batch_size", type=int, default=32, help="Batch size (per device) for the training dataloader."
    )
    args = parser.parse_args()
    return args
@torch.no_grad()
def main(args):
    """
    Load a trained model and VAE, then run five rounds of classifier-free
    guided sampling with `args.batch_size` random class labels each, writing
    every decoded image to `demo/<index>.png`.

    Args:
        args: namespace produced by `parse_args`.

    Raises:
        ValueError: if `--checkpoint` is missing, or no model weights are
            found inside the checkpoint directory.
    """
    # Fail fast: the latent scaling/bias factors needed for decoding are only
    # available from the checkpoint, so without one the run would otherwise
    # crash with a NameError after the first (expensive) sampling round.
    if not args.checkpoint:
        raise ValueError(
            "--checkpoint is required: the scaling and bias factors used to decode samples "
            "are read from <checkpoint>/other_state.pth."
        )

    set_seed(args.seed)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if args.mixed_precision == "bf16":
        dtype = torch.bfloat16
    elif args.mixed_precision == "fp16":
        dtype = torch.float16
    else:
        dtype = torch.float32

    vae, input_size, latent_size, flatten_input = load_vae(args.vae, args.image_size)
    model = All_models[args.model](
        input_size=input_size,
        in_channels=latent_size,
        num_classes=args.num_classes,
        flatten_input=flatten_input,
    ).to(device).to(dtype)

    # Initialize the scheduler
    noise_scheduler = DPMSolverMultistepScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule, prediction_type=args.prediction_type)
    model.eval()
    vae.eval()

    # Load the weights and states from the checkpoint
    other_state = torch.load(os.path.join(args.checkpoint, "other_state.pth"))
    scaling_factor = other_state["scaling_factor"]
    bias_factor = other_state["bias_factor"]
    print(f"Scaling factor: {scaling_factor}, Bias factor: {bias_factor}")
    if args.use_ema and other_state["ema"] is not None:
        # EMA shadow parameters are stored as a flat list in the same order
        # as model.parameters().
        checkpoint = other_state["ema"]["shadow_params"]
        for model_param, ema_param in zip(model.parameters(), checkpoint):
            model_param.data = ema_param.data.to(device).to(dtype)
        print(f"Loaded model from checkpoint {args.checkpoint}, EMA applied.")
    else:
        if os.path.exists(os.path.join(args.checkpoint, "model.safetensors")):
            checkpoint = load_file(os.path.join(args.checkpoint, "model.safetensors"))
        elif os.path.exists(os.path.join(args.checkpoint, "pytorch_model")):
            checkpoint = torch.load(os.path.join(args.checkpoint, "pytorch_model", "mp_rank_00_model_states.pt"))["module"]
        else:
            raise ValueError(f"Could not find model checkpoint in {args.checkpoint}.")
        model.load_state_dict(checkpoint)
        print(f"Loaded model from checkpoint {args.checkpoint}.")

    # Defined once: it only closes over the scheduler and args, which do not
    # change between rounds.
    def p_sample(model, image):
        # Full reverse-diffusion loop for one token, driven by the scheduler.
        noise_scheduler.set_timesteps(args.ddpm_num_inference_steps)
        for t in noise_scheduler.timesteps:
            model_output = model(image, t.repeat(image.shape[0]).to(image))
            image = noise_scheduler.step(model_output, t, image).prev_sample
        return image

    # save_image does not create missing directories; make sure the target exists.
    os.makedirs("demo", exist_ok=True)

    image_id = 0
    for _ in tqdm(range(5)):
        # Create sampling noise:
        n = args.batch_size
        y = torch.randint(0, args.num_classes, (n,), device=device)
        # y = torch.tensor(random.choices([281, 282, 283, 284, 285, 4, 7, 963], k=n), device=device)
        # Setup classifier-free guidance:
        # NOTE(review): the null-class id is hard-coded to 1000, which presumably
        # should track --num-classes; confirm against the label embedder.
        y_null = torch.tensor([1000] * n, device=device)
        y = torch.cat([y, y_null], 0)

        # Sample images:
        samples = model.sample_with_cfg(y, args.cfg_scale, p_sample)
        images = vae.decode(samples / scaling_factor - bias_factor)

        # Save image one by one
        for image in images:
            save_image(image, f"demo/{image_id}.png", normalize=True, value_range=(-1, 1))
            image_id += 1
# Script entry point: parse the command-line options, then run batch sampling.
if __name__ == "__main__":
    args = parse_args()
    main(args)
================================================
FILE: LatentLM/schedule/__init__.py
================================================
from .ddpm import DDPMScheduler
from .dpm_solver import DPMSolverMultistepScheduler
================================================
FILE: LatentLM/schedule/ddpm.py
================================================
# Copyright 2024 UC Berkeley Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.utils import BaseOutput
from diffusers.utils.torch_utils import randn_tensor
from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
@dataclass
class DDPMSchedulerOutput(BaseOutput):
    """
    Output class for the scheduler's `step` function output.

    Args:
        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
            denoising loop.
        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
            `pred_original_sample` can be used to preview progress or for guidance.
    """

    # Required field: the sample to feed into the next denoising step.
    prev_sample: torch.Tensor
    # Optional field: defaults to None when no x_0 prediction is provided.
    pred_original_sample: Optional[torch.Tensor] = None
def betas_for_alpha_bar(
    num_diffusion_timesteps,
    max_beta=0.999,
    alpha_transform_type="cosine",
):
    """
    Discretize a continuous alpha_bar(t) function, defined on t in [0, 1] as
    the cumulative product of (1 - beta), into a per-step beta schedule.

    For step i the beta is 1 - alpha_bar((i + 1) / N) / alpha_bar(i / N),
    capped at `max_beta`.

    Args:
        num_diffusion_timesteps (`int`): the number of betas to produce.
        max_beta (`float`): the maximum beta to use; use values lower than 1 to
            prevent singularities.
        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
            Choose from `cosine`, `exp`, `cauchy` or `laplace`.

    Returns:
        betas (`torch.Tensor`): float32 tensor of length `num_diffusion_timesteps`.

    Raises:
        ValueError: if `alpha_transform_type` is not one of the supported names.
    """

    def _cosine(t):
        return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
        # return math.cos(t * math.pi / 2 * 0.95) ** 2

    def _exp(t):
        return math.exp(t * -12.0)

    def _cauchy(t, gamma=1, mu=3):
        # µ + γ tan (π (0.5 - x)) γ = 1, µ = 3
        # alpha^2 = 1-1/(exp(λ)+1)
        snr = mu + gamma * math.tan(math.pi * (0.5 - t) * 0.9)
        return 1 - 1 / (math.exp(snr) + 1.1)

    def _laplace(t, mu=0, b=1):
        # µ − bsgn(0.5 − t) log(1 − 2|t − 0.5|) µ = 0, b = 1
        snr = mu - b * math.copysign(1, 0.5 - t) * math.log(1 - 2 * abs(t - 0.5) * 0.98)
        return 1 - 1 / (math.exp(snr) + 1.02)

    if alpha_transform_type == "cosine":
        alpha_bar_fn = _cosine
    elif alpha_transform_type == "exp":
        alpha_bar_fn = _exp
    elif alpha_transform_type == "cauchy":
        alpha_bar_fn = _cauchy
    elif alpha_transform_type == "laplace":
        alpha_bar_fn = _laplace
    else:
        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")

    total = num_diffusion_timesteps
    schedule = [
        min(1 - alpha_bar_fn((step + 1) / total) / alpha_bar_fn(step / total), max_beta)
        for step in range(total)
    ]
    return torch.tensor(schedule, dtype=torch.float32)
# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
def rescale_zero_terminal_snr(betas):
    """
    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)

    sqrt(alpha_bar) is shifted so its last entry becomes zero and then scaled
    so its first entry keeps its original value; the result is converted back
    to per-step betas.

    Args:
        betas (`torch.Tensor`):
            the betas that the scheduler is being initialized with.

    Returns:
        `torch.Tensor`: rescaled betas with zero terminal SNR
    """
    # betas -> sqrt of the cumulative alpha product
    sqrt_alpha_bar = torch.cumprod(1.0 - betas, dim=0).sqrt()

    # Remember the end points before modifying anything.
    first = sqrt_alpha_bar[0].clone()
    last = sqrt_alpha_bar[-1].clone()

    # Shift so the last timestep is zero, then scale so the first timestep
    # is back to its old value.
    sqrt_alpha_bar = (sqrt_alpha_bar - last) * (first / (first - last))

    # Undo the sqrt, then undo the cumulative product.
    alpha_bar = sqrt_alpha_bar**2
    step_alphas = torch.cat([alpha_bar[0:1], alpha_bar[1:] / alpha_bar[:-1]])
    return 1 - step_alphas
class DDPMScheduler(SchedulerMixin, ConfigMixin):
"""
`DDPMScheduler` explores the connections between denoising score matching and Langevin dynamics sampling.
This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
methods the library implements for all schedulers such as loading and saving.
Args:
num_train_timesteps (`int`, defaults to 1000):
The number of diffusion steps to train the model.
beta_start (`float`, defaults to 0.0001):
The starting `beta` value of inference.
beta_end (`float`, defaults to 0.02):
The final `beta` value.
beta_schedule (`str`, defaults to `"linear"`):
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
`linear`, `scaled_linear`, or `squaredcos_cap_v2`.
trained_betas (`np.ndarray`, *optional*):
An array of betas to pass directly to the constructor without using `beta_start` and `beta_end`.
variance_type (`str`, defaults to `"fixed_small"`):
Clip the variance when adding noise to the denoised sample. Choose from `fixed_small`, `fixed_small_log`,
`fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
clip_sample (`bool`, defaults to `True`):
Clip the predicted sample for numerical stability.
clip_sample_range (`float`, defaults to 1.0):
The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
prediction_type (`str`, defaults to `epsilon`, *optional*):
Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
`sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
Video](https://imagen.research.google/video/paper.pdf) paper).
thresholding (`bool`, defaults to `False`):
Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
as Stable Diffusion.
dynamic_thresholding_ratio (`float`, defaults to 0.995):
The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
sample_max_value (`float`, defaults to 1.0):
The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
timestep_spacing (`str`, defaults to `"leading"`):
The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
steps_offset (`int`, defaults to 0):
An offset added to the inference steps, as required by some model families.
rescale_betas_zero_snr (`bool`, defaults to `False`):
Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
dark samples instead of limiting it to samples with medium brightness. Loosely related to
[`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
"""
_compatibles = [e.name for e in KarrasDiffusionSchedulers]
order = 1
@register_to_config
def __init__(
    self,
    num_train_timesteps: int = 1000,
    beta_start: float = 0.0001,
    beta_end: float = 0.02,
    beta_schedule: str = "linear",
    trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
    variance_type: str = "fixed_large",
    clip_sample: bool = False,
    prediction_type: str = "epsilon",
    thresholding: bool = False,
    dynamic_thresholding_ratio: float = 0.995,
    clip_sample_range: float = 1.0,
    sample_max_value: float = 1.0,
    timestep_spacing: str = "leading",
    steps_offset: int = 0,
    rescale_betas_zero_snr: bool = False,
):
    """
    Build the beta schedule and the quantities derived from it (`alphas`, `alphas_cumprod`).

    The arguments are described in the class docstring. `trained_betas`, when given, takes precedence over
    `beta_schedule`, `beta_start` and `beta_end`.

    Raises:
        NotImplementedError: If `beta_schedule` is not one of the schedules handled below.
    """
    if trained_betas is not None:
        self.betas = torch.tensor(trained_betas, dtype=torch.float32)
    elif beta_schedule == "linear":
        self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
    elif beta_schedule == "scaled_linear":
        # this schedule is very specific to the latent diffusion model.
        self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
    elif beta_schedule == "squaredcos_cap_v2" or beta_schedule == "cosine":
        # Glide cosine schedule
        self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="cosine")
    elif beta_schedule == "cauchy":
        self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="cauchy")
    elif beta_schedule == "laplace":
        self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="laplace")
    elif beta_schedule == "sigmoid":
        # GeoDiff sigmoid schedule
        betas = torch.linspace(-6, 6, num_train_timesteps)
        self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start
    else:
        raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")

    # Rescale for zero SNR
    if rescale_betas_zero_snr:
        self.betas = rescale_zero_terminal_snr(self.betas)

    self.alphas = 1.0 - self.betas
    self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
    # Stand-in for "alpha_cumprod before the first step", used when the previous timestep is negative.
    self.one = torch.tensor(1.0)

    # standard deviation of the initial noise distribution
    self.init_noise_sigma = 1.0

    # setable values
    self.custom_timesteps = False
    self.num_inference_steps = None
    self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())

    # Kept as a plain attribute as well; `step` reads `self.variance_type` directly.
    self.variance_type = variance_type
def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
    """
    Identity hook kept so this scheduler is interchangeable with schedulers that must rescale the denoising
    model input according to the current timestep.

    Args:
        sample (`torch.Tensor`):
            The input sample.
        timestep (`int`, *optional*):
            The current timestep in the diffusion chain. Unused here.

    Returns:
        `torch.Tensor`:
            The very same `sample` object, untouched.
    """
    # No scaling is applied by this scheduler: hand the input straight back.
    return sample
def set_timesteps(
    self,
    num_inference_steps: Optional[int] = None,
    device: Union[str, torch.device] = None,
    timesteps: Optional[List[int]] = None,
):
    """
    Sets the discrete timesteps used for the diffusion chain (to be run before inference).

    Args:
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used,
            `timesteps` must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
            timestep spacing strategy of equal spacing between timesteps is used. If `timesteps` is passed,
            `num_inference_steps` must be `None`.

    Raises:
        ValueError: If both or neither of `num_inference_steps` / `timesteps` are given, if custom timesteps
            are not strictly descending or start at/after `num_train_timesteps`, if `num_inference_steps`
            exceeds `num_train_timesteps`, or if the configured `timestep_spacing` is unknown.
    """
    if num_inference_steps is not None and timesteps is not None:
        raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.")
    # Without this guard the comparison below fails with an opaque `None > int` TypeError.
    if num_inference_steps is None and timesteps is None:
        raise ValueError("Must pass exactly one of `num_inference_steps` or `timesteps`.")

    if timesteps is not None:
        for i in range(1, len(timesteps)):
            if timesteps[i] >= timesteps[i - 1]:
                raise ValueError("`custom_timesteps` must be in descending order.")

        if timesteps[0] >= self.config.num_train_timesteps:
            raise ValueError(
                f"`timesteps` must start before `self.config.train_timesteps`:"
                f" {self.config.num_train_timesteps}."
            )

        timesteps = np.array(timesteps, dtype=np.int64)
        self.custom_timesteps = True
    else:
        if num_inference_steps > self.config.num_train_timesteps:
            raise ValueError(
                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
                f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
                f" maximal {self.config.num_train_timesteps} timesteps."
            )

        self.num_inference_steps = num_inference_steps
        self.custom_timesteps = False

        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
        if self.config.timestep_spacing == "linspace":
            timesteps = (
                np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps)
                .round()[::-1]
                .copy()
                .astype(np.int64)
            )
        elif self.config.timestep_spacing == "leading":
            step_ratio = self.config.num_train_timesteps // self.num_inference_steps
            # creates integer timesteps by multiplying by ratio
            # casting to int to avoid issues when num_inference_step is power of 3
            timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
            timesteps += self.config.steps_offset
        elif self.config.timestep_spacing == "trailing":
            step_ratio = self.config.num_train_timesteps / self.num_inference_steps
            # creates integer timesteps by multiplying by ratio
            # casting to int to avoid issues when num_inference_step is power of 3
            timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64)
            timesteps -= 1
        else:
            raise ValueError(
                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
            )

    self.timesteps = torch.from_numpy(timesteps).to(device)
def _get_variance(self, t, predicted_variance=None, variance_type=None):
prev_t = self.previous_timestep(t)
alpha_prod_t = self.alphas_cumprod[t]
alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev
# For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
# and sample from it to get previous sample
# x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * current_beta_t
# we always take the log of variance, so clamp it to ensure it's not 0
variance = torch.clamp(variance, min=1e-20)
if variance_type is None:
variance_type = self.config.variance_type
# hacks - were probably added for training stability
if variance_type == "fixed_small":
variance = variance
# for rl-diffuser https://arxiv.org/abs/2205.09991
elif variance_type == "fixed_small_log":
variance = torch.log(variance)
variance = torch.exp(0.5 * variance)
elif variance_type == "fixed_large":
variance = current_beta_t
elif variance_type == "fixed_large_log":
# Glide max_log
variance = torch.log(current_beta_t)
elif variance_type == "learned":
return predicted_variance
elif variance_type == "learned_range":
min_log = torch.log(variance)
max_log = torch.log(current_beta_t)
frac = (predicted_variance + 1) / 2
variance = frac * max_log + (1 - frac) * min_log
return variance
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
pixels from saturation at each step. We find that dynamic thresholding results in significantly better
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://arxiv.org/abs/2205.11487
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
if dtype not in (torch.float32, torch.float64):
sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half
# Flatten sample for doing quantile calculation along each image
sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
abs_sample = sample.abs() # "a certain percentile absolute pixel value"
s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
s = torch.clamp(
s, min=1, max=self.config.sample_max_value
) # When clamped to min=1, equivalent to standard clipping to [-1, 1]
s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0
sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s"
sample = sample.reshape(batch_size, channels, *remaining_dims)
sample = sample.to(dtype)
return sample
def step(
    self,
    model_output: torch.Tensor,
    timestep: int,
    sample: torch.Tensor,
    generator=None,
    return_dict: bool = True,
) -> Union[DDPMSchedulerOutput, Tuple]:
    """
    Run one reverse-diffusion update: from the model output at `timestep`, estimate `x_0`, form the posterior
    mean of the previous sample and, for `timestep > 0`, add the sampled noise term.

    Args:
        model_output (`torch.Tensor`):
            The direct output from learned diffusion model.
        timestep (`float`):
            The current discrete timestep in the diffusion chain.
        sample (`torch.Tensor`):
            A current instance of a sample created by the diffusion process.
        generator (`torch.Generator`, *optional*):
            A random number generator.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`.

    Returns:
        [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`:
            If return_dict is `True`, [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] is returned, otherwise a
            tuple is returned where the first element is the sample tensor.

    Raises:
        ValueError: If `self.config.prediction_type` is not `epsilon`, `sample` or `v_prediction`.
    """
    t = timestep
    prev_t = self.previous_timestep(t)

    # A model that also predicts variance emits twice as many channels; peel the second half off.
    predicted_variance = None
    if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
        model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)

    # 1. cumulative alphas/betas at t and at the previous timestep, plus the effective per-step values
    alpha_bar_t = self.alphas_cumprod[t]
    alpha_bar_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
    beta_bar_t = 1 - alpha_bar_t
    beta_bar_prev = 1 - alpha_bar_prev
    step_alpha = alpha_bar_t / alpha_bar_prev
    step_beta = 1 - step_alpha

    # 2. "predicted x_0", formula (15) of https://arxiv.org/pdf/2006.11239.pdf
    if self.config.prediction_type == "epsilon":
        x0_estimate = (sample - beta_bar_t ** (0.5) * model_output) / alpha_bar_t ** (0.5)
    elif self.config.prediction_type == "sample":
        x0_estimate = model_output
    elif self.config.prediction_type == "v_prediction":
        x0_estimate = (alpha_bar_t**0.5) * sample - (beta_bar_t**0.5) * model_output
    else:
        raise ValueError(
            f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
            " `v_prediction` for the DDPMScheduler."
        )

    # 3. clip or dynamically threshold the x_0 estimate (thresholding takes priority)
    if self.config.thresholding:
        x0_estimate = self._threshold_sample(x0_estimate)
    elif self.config.clip_sample:
        clip_bound = self.config.clip_sample_range
        x0_estimate = x0_estimate.clamp(-clip_bound, clip_bound)

    # 4./5. posterior mean µ_t, formula (7) of https://arxiv.org/pdf/2006.11239.pdf
    x0_weight = (alpha_bar_prev ** (0.5) * step_beta) / beta_bar_t
    xt_weight = step_alpha ** (0.5) * beta_bar_prev / beta_bar_t
    posterior_mean = x0_weight * x0_estimate + xt_weight * sample

    # 6. noise term; nothing is added on the final step (t == 0)
    noise_term = 0
    if t > 0:
        gaussian = randn_tensor(
            model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
        )
        raw_variance = self._get_variance(t, predicted_variance=predicted_variance)
        if self.variance_type == "fixed_small_log":
            # _get_variance already returns a standard deviation for this type
            noise_term = raw_variance * gaussian
        elif self.variance_type == "learned_range":
            # _get_variance returns a log-variance for this type
            noise_term = torch.exp(0.5 * raw_variance) * gaussian
        else:
            noise_term = (raw_variance**0.5) * gaussian

    prev_sample = posterior_mean + noise_term

    if return_dict:
        return DDPMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=x0_estimate)
    return (prev_sample,)
def add_noise(
    self,
    original_samples: torch.Tensor,
    noise: torch.Tensor,
    timesteps: torch.IntTensor,
) -> torch.Tensor:
    """
    Forward-diffuse `original_samples` to the given `timesteps`:
    `sqrt(alpha_cumprod[t]) * original_samples + sqrt(1 - alpha_cumprod[t]) * noise`.

    Side effect: `self.alphas_cumprod` is moved to (and kept on) the device of `original_samples`.
    """
    # Keep the schedule resident on the samples' device so later calls skip the host-to-device copy;
    # the dtype cast is local only.
    self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device)
    schedule = self.alphas_cumprod.to(dtype=original_samples.dtype)
    steps = timesteps.to(original_samples.device)

    # One trailing singleton axis per non-leading sample axis, so the coefficients broadcast.
    trailing_axes = (1,) * (len(original_samples.shape) - 1)

    signal_scale = (schedule[steps] ** 0.5).flatten()
    signal_scale = signal_scale.reshape((signal_scale.shape[0],) + trailing_axes)

    noise_scale = ((1 - schedule[steps]) ** 0.5).flatten()
    noise_scale = noise_scale.reshape((noise_scale.shape[0],) + trailing_axes)

    return signal_scale * original_samples + noise_scale * noise
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
    """
    Compute `sqrt(alpha_cumprod[t]) * noise - sqrt(1 - alpha_cumprod[t]) * sample` for the given `timesteps`.

    Side effect: `self.alphas_cumprod` is moved to (and kept on) the device of `sample`.
    """
    # Schedule follows the sample's device permanently; dtype is matched on a local copy.
    self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
    schedule = self.alphas_cumprod.to(dtype=sample.dtype)
    steps = timesteps.to(sample.device)

    # One trailing singleton axis per non-leading sample axis, so the coefficients broadcast.
    trailing_axes = (1,) * (len(sample.shape) - 1)

    signal_scale = (schedule[steps] ** 0.5).flatten()
    signal_scale = signal_scale.reshape((signal_scale.shape[0],) + trailing_axes)

    noise_scale = ((1 - schedule[steps]) ** 0.5).flatten()
    noise_scale = noise_scale.reshape((noise_scale.shape[0],) + trailing_axes)

    return signal_scale * noise - noise_scale * sample
def __len__(self):
return self.config.num_train_timesteps
def previous_timestep(self, timestep):
    """
    Return the timestep that follows `timestep` in the sampling order (i.e. the next, smaller one).

    With custom timesteps the successor is looked up in `self.timesteps`, and `tensor(-1)` is returned for the
    last entry. Otherwise a fixed stride of `num_train_timesteps // num_inference_steps` is subtracted, using
    `num_train_timesteps` as the step count while `num_inference_steps` is unset.
    """
    if not self.custom_timesteps:
        step_count = self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps
        stride = self.config.num_train_timesteps // step_count
        return timestep - stride

    # First position at which the schedule equals the requested timestep.
    position = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
    if position == self.timesteps.shape[0] - 1:
        return torch.tensor(-1)
    return self.timesteps[position + 1]
================================================
FILE: LatentLM/schedule/dpm_solver.py
================================================
# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
import math
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.utils import deprecate
from diffusers.utils.torch_utils import randn_tensor
from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
from .ddpm import betas_for_alpha_bar, rescale_zero_terminal_snr
class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
"""
`DPMSolverMultistepScheduler` is a fast dedicated high-order solver for diffusion ODEs.
This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
methods the library implements for all schedulers such as loading and saving.
Args:
num_train_timesteps (`int`, defaults to 1000):
The number of diffusion steps to train the model.
beta_start (`float`, defaults to 0.0001):
The starting `beta` value of inference.
beta_end (`float`, defaults to 0.02):
The final `beta` value.
beta_schedule (`str`, defaults to `"linear"`):
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
`linear`, `scaled_linear`, `squaredcos_cap_v2` (alias `cosine`), `cauchy`, or `laplace`.
trained_betas (`np.ndarray`, *optional*):
Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
solver_order (`int`, defaults to 2):
The DPMSolver order which can be `1` or `2` or `3`. It is recommended to use `solver_order=2` for guided
sampling, and `solver_order=3` for unconditional sampling.
prediction_type (`str`, defaults to `epsilon`, *optional*):
Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
`sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
Video](https://imagen.research.google/video/paper.pdf) paper).
thresholding (`bool`, defaults to `False`):
Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
as Stable Diffusion.
dynamic_thresholding_ratio (`float`, defaults to 0.995):
The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
sample_max_value (`float`, defaults to 1.0):
The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
`algorithm_type="dpmsolver++"`.
algorithm_type (`str`, defaults to `dpmsolver++`):
Algorithm type for the solver; can be `dpmsolver`, `dpmsolver++`, `sde-dpmsolver` or `sde-dpmsolver++`. The
`dpmsolver` type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927)
paper, and the `dpmsolver++` type implements the algorithms in the
[DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or
`sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion.
solver_type (`str`, defaults to `midpoint`):
Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
lower_order_final (`bool`, defaults to `True`):
Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
euler_at_final (`bool`, defaults to `False`):
Whether to use Euler's method in the final step. It is a trade-off between numerical stability and detail
richness. This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference
steps, but sometimes may result in blurring.
use_karras_sigmas (`bool`, *optional*, defaults to `False`):
Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
the sigmas are determined according to a sequence of noise levels {σi}.
use_lu_lambdas (`bool`, *optional*, defaults to `False`):
Whether to use the uniform-logSNR for step sizes proposed by Lu's DPM-Solver in the noise schedule during
the sampling process. If `True`, the sigmas and time steps are determined according to a sequence of
`lambda(t)`.
final_sigmas_type (`str`, defaults to `"zero"`):
The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
lambda_min_clipped (`float`, defaults to `-inf`):
Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
cosine (`squaredcos_cap_v2`) noise schedule.
variance_type (`str`, *optional*):
Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output
contains the predicted Gaussian variance.
timestep_spacing (`str`, defaults to `"linspace"`):
The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
steps_offset (`int`, defaults to 0):
An offset added to the inference steps, as required by some model families.
rescale_betas_zero_snr (`bool`, defaults to `False`):
Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
dark samples instead of limiting it to samples with medium brightness. Loosely related to
[`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
"""
_compatibles = [e.name for e in KarrasDiffusionSchedulers]
order = 1
@register_to_config
def __init__(
    self,
    num_train_timesteps: int = 1000,
    beta_start: float = 0.0001,
    beta_end: float = 0.02,
    beta_schedule: str = "linear",
    trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
    solver_order: int = 2,
    prediction_type: str = "epsilon",
    thresholding: bool = False,
    dynamic_thresholding_ratio: float = 0.995,
    sample_max_value: float = 1.0,
    algorithm_type: str = "dpmsolver++",
    solver_type: str = "midpoint",
    lower_order_final: bool = True,
    euler_at_final: bool = False,
    use_karras_sigmas: Optional[bool] = False,
    use_lu_lambdas: Optional[bool] = False,
    final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
    lambda_min_clipped: float = -float("inf"),
    variance_type: Optional[str] = None,
    timestep_spacing: str = "linspace",
    steps_offset: int = 0,
    rescale_betas_zero_snr: bool = False,
):
    """
    Build the noise schedule (`betas`, `alphas_cumprod`, `alpha_t`, `sigma_t`, `lambda_t`, `sigmas`) and reset
    the multistep solver state. The arguments are described in the class docstring.

    Raises:
        NotImplementedError: If `beta_schedule`, `algorithm_type` or `solver_type` is not supported.
        ValueError: If `final_sigmas_type="zero"` is combined with an algorithm other than `dpmsolver++` /
            `sde-dpmsolver++`.
    """
    if algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
        deprecation_message = f"algorithm_type {algorithm_type} is deprecated and will be removed in a future version. Choose from `dpmsolver++` or `sde-dpmsolver++` instead"
        deprecate("algorithm_types dpmsolver and sde-dpmsolver", "1.0.0", deprecation_message)

    if trained_betas is not None:
        self.betas = torch.tensor(trained_betas, dtype=torch.float32)
    elif beta_schedule == "linear":
        self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
    elif beta_schedule == "scaled_linear":
        # this schedule is very specific to the latent diffusion model.
        self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
    elif beta_schedule == "squaredcos_cap_v2" or beta_schedule == "cosine":
        # Glide cosine schedule
        self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="cosine")
    elif beta_schedule == "cauchy":
        self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="cauchy")
    elif beta_schedule == "laplace":
        self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="laplace")
    else:
        raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")

    if rescale_betas_zero_snr:
        self.betas = rescale_zero_terminal_snr(self.betas)

    self.alphas = 1.0 - self.betas
    self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)

    if rescale_betas_zero_snr:
        # Close to 0 without being 0 so first sigma is not inf
        # FP16 smallest positive subnormal works well here
        self.alphas_cumprod[-1] = 2**-24

    # Currently we only support VP-type noise schedule
    self.alpha_t = torch.sqrt(self.alphas_cumprod)
    self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
    self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
    self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5

    # standard deviation of the initial noise distribution
    self.init_noise_sigma = 1.0

    # settings for DPM-Solver
    if algorithm_type not in ["dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"]:
        if algorithm_type == "deis":
            # Configs written for DEIS are mapped onto DPM-Solver++. The local is updated as well so the
            # `final_sigmas_type` validation below checks the effective algorithm rather than "deis".
            algorithm_type = "dpmsolver++"
            self.register_to_config(algorithm_type=algorithm_type)
        else:
            raise NotImplementedError(f"{algorithm_type} is not implemented for {self.__class__}")

    if solver_type not in ["midpoint", "heun"]:
        if solver_type in ["logrho", "bh1", "bh2"]:
            # Solver types from other schedulers fall back to the midpoint solver.
            solver_type = "midpoint"
            self.register_to_config(solver_type=solver_type)
        else:
            raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")

    if algorithm_type not in ["dpmsolver++", "sde-dpmsolver++"] and final_sigmas_type == "zero":
        raise ValueError(
            f"`final_sigmas_type` {final_sigmas_type} is not supported for `algorithm_type` {algorithm_type}. Please choose `sigma_min` instead."
        )

    # setable values
    self.num_inference_steps = None
    timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
    self.timesteps = torch.from_numpy(timesteps)
    self.model_outputs = [None] * solver_order
    self.lower_order_nums = 0
    self._step_index = None
    self._begin_index = None
    self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
@property
def step_index(self):
    """
    Read-only view of the internal step counter (`_step_index`), which advances by one after every scheduler
    step.
    """
    current = self._step_index
    return current
@property
def begin_index(self):
    """
    Read-only view of the index of the first timestep (`_begin_index`); pipelines set it through
    `set_begin_index`.
    """
    first = self._begin_index
    return first
def set_begin_index(self, begin_index: int = 0):
    """
    Record the index of the first timestep. Intended to be called by the pipeline before inference starts.

    Args:
        begin_index (`int`):
            The begin index for the scheduler.
    """
    # Plain assignment; no validation is performed on the value.
    setattr(self, "_begin_index", begin_index)
def set_timesteps(
    self,
    num_inference_steps: int = None,
    device: Union[str, torch.device] = None,
    timesteps: Optional[List[int]] = None,
):
    """
    Sets the discrete timesteps used for the diffusion chain (to be run before inference), together with the
    matching `sigmas`, and resets the multistep solver state.

    Args:
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to support arbitrary timesteps schedule. If `None`, timesteps will be generated
            based on the `timestep_spacing` attribute. If `timesteps` is passed, `num_inference_steps` and `sigmas`
            must be `None`, and `timestep_spacing` attribute will be ignored.

    Raises:
        ValueError: If neither or both of `num_inference_steps` / `timesteps` are given, if custom `timesteps`
            are combined with Karras sigmas or Lu lambdas, or if `timestep_spacing` / `final_sigmas_type` hold
            an unsupported value.
    """
    has_custom = timesteps is not None

    # ---- argument validation ----
    if num_inference_steps is None and not has_custom:
        raise ValueError("Must pass exactly one of `num_inference_steps` or `timesteps`.")
    if num_inference_steps is not None and has_custom:
        raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.")
    if has_custom and self.config.use_karras_sigmas:
        raise ValueError("Cannot use `timesteps` with `config.use_karras_sigmas = True`")
    if has_custom and self.config.use_lu_lambdas:
        raise ValueError("Cannot use `timesteps` with `config.use_lu_lambdas = True`")

    # ---- integer timestep grid ----
    if has_custom:
        timesteps = np.array(timesteps).astype(np.int64)
    else:
        # Clipping the minimum of all lambda(t) for numerical stability.
        # This is critical for cosine (squaredcos_cap_v2) noise schedule.
        clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped)
        last_timestep = ((self.config.num_train_timesteps - clipped_idx).numpy()).item()

        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
        spacing = self.config.timestep_spacing
        if spacing == "linspace":
            grid = np.linspace(0, last_timestep - 1, num_inference_steps + 1)
            # descending order, with the final (smallest) point dropped
            timesteps = grid.round()[::-1][:-1].copy().astype(np.int64)
        elif spacing == "leading":
            step_ratio = last_timestep // (num_inference_steps + 1)
            # integer timesteps obtained by multiplying by the ratio
            grid = np.arange(0, num_inference_steps + 1) * step_ratio
            timesteps = grid.round()[::-1][:-1].copy().astype(np.int64)
            timesteps += self.config.steps_offset
        elif spacing == "trailing":
            step_ratio = self.config.num_train_timesteps / num_inference_steps
            # integer timesteps obtained by stepping down from the last usable timestep
            timesteps = np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64)
            timesteps -= 1
        else:
            raise ValueError(
                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
            )

    # ---- sigma schedule ----
    sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
    log_sigmas = np.log(sigmas)

    if self.config.use_karras_sigmas:
        sigmas = np.flip(sigmas).copy()
        sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
        # timesteps are re-derived from the resampled sigmas
        timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
    elif self.config.use_lu_lambdas:
        lambdas = np.flip(log_sigmas.copy())
        lambdas = self._convert_to_lu(in_lambdas=lambdas, num_inference_steps=num_inference_steps)
        sigmas = np.exp(lambdas)
        timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
    else:
        sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)

    # ---- terminal sigma appended after the schedule ----
    if self.config.final_sigmas_type == "sigma_min":
        sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5
    elif self.config.final_sigmas_type == "zero":
        sigma_last = 0
    else:
        raise ValueError(
            f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
        )

    sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)

    self.sigmas = torch.from_numpy(sigmas)
    self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)

    self.num_inference_steps = len(timesteps)

    # ---- reset multistep solver state ----
    self.model_outputs = [None] * self.config.solver_order
    self.lower_order_nums = 0

    # add an index counter for schedulers that allow duplicated timesteps
    self._step_index = None
    self._begin_index = None
    self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
pixels from saturation at each step. We find that dynamic thresholding results in significantly better
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://arxiv.org/abs/2205.11487
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
if dtype not in (torch.float32, torch.float64):
sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half
# Flatten sample for doing quantile calculation along each image
sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
abs_sample = sample.abs() # "a certain percentile absolute pixel value"
s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
s = torch.clamp(
s, min=1, max=self.config.sample_max_value
) # When clamped to min=1, equivalent to standard clipping to [-1, 1]
s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0
sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s"
sample = sample.reshape(batch_size, channels, *remaining_dims)
sample = sample.to(dtype)
return sample
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
def _sigma_to_t(self, sigma, log_sigmas):
# get log sigma
log_sigma = np.log(np.maximum(sigma, 1e-10))
# get distribution
dists = log_sigma - log_sigmas[:, np.newaxis]
# get sigmas range
low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
high_idx = low_idx + 1
low = log_sigmas[low_idx]
high = log_sigmas[high_idx]
# interpolate sigmas
w = (low - log_sigma) / (low - high)
w = np.clip(w, 0, 1)
# transform interpolation to time range
t = (1 - w) * low_idx + w * high_idx
t = t.reshape(sigma.shape)
return t
def _sigma_to_alpha_sigma_t(self, sigma):
alpha_t = 1 / ((sigma**2 + 1) ** 0.5)
sigma_t = sigma * alpha_t
return alpha_t, sigma_t
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
"""Constructs the noise schedule of Karras et al. (2022)."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
if hasattr(self.config, "sigma_min"):
sigma_min = self.config.sigma_min
else:
sigma_min = None
if hasattr(self.config, "sigma_max"):
sigma_max = self.config.sigma_max
else:
sigma_max = None
sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
rho = 7.0 # 7.0 is the value used in the paper
ramp = np.linspace(0, 1, num_inference_steps)
min_inv_rho = sigma_min ** (1 / rho)
max_inv_rho = sigma_max ** (1 / rho)
sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
return sigmas
def _convert_to_lu(self, in_lambdas: torch.Tensor, num_inference_steps) -> torch.Tensor:
"""Constructs the noise schedule of Lu et al. (2022)."""
lambda_min: float = in_lambdas[-1].item()
lambda_max: float = in_lambdas[0].item()
rho = 1.0 # 1.0 is the value used in the paper
ramp = np.linspace(0, 1, num_inference_steps)
min_inv_rho = lambda_min ** (1 / rho)
max_inv_rho = lambda_max ** (1 / rho)
lambdas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
return lambdas
def convert_model_output(
self,
model_output: torch.Tensor,
*args,
sample: torch.Tensor = None,
**kwargs,
) -> torch.Tensor:
"""
Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is
designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an
integral of the data prediction model.
The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise
prediction and data prediction models.
Args:
model_output (`torch.Tensor`):
The direct output from the learned diffusion model.
sample (`torch.Tensor`):
A current instance of a sample created by the diffusion process. May also be given as the second
positional argument after a (deprecated, ignored) `timestep`.
Returns:
`torch.Tensor`:
The converted model output: a prediction of the clean sample for the `dpmsolver++` / `sde-dpmsolver++`
algorithm types, or a noise prediction for `dpmsolver` / `sde-dpmsolver`.
Raises:
`ValueError`: If `sample` is not provided, or if `config.prediction_type` is not one of `epsilon`,
`sample` or `v_prediction`.
"""
# Legacy signature support: `timestep` may arrive positionally or as a keyword; it is only used to
# emit the deprecation notice below.
timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
if sample is None:
if len(args) > 1:
sample = args[1]
else:
raise ValueError("missing `sample` as a required keyward argument")
if timestep is not None:
deprecate(
"timesteps",
"1.0.0",
"Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
)
# DPM-Solver++ needs to solve an integral of the data prediction model.
if self.config.algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]:
if self.config.prediction_type == "epsilon":
# DPM-Solver and DPM-Solver++ only need the "mean" output.
# NOTE(review): the slice keeps the first 3 channels only; this looks like an RGB assumption and
# may not match latent inputs with a different channel count -- confirm before using learned variance.
if self.config.variance_type in ["learned", "learned_range"]:
model_output = model_output[:, :3]
sigma = self.sigmas[self.step_index]
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
# Invert sample = alpha_t * x0 + sigma_t * eps for x0.
x0_pred = (sample - sigma_t * model_output) / alpha_t
elif self.config.prediction_type == "sample":
# The model already predicts the clean sample.
x0_pred = model_output
elif self.config.prediction_type == "v_prediction":
sigma = self.sigmas[self.step_index]
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
x0_pred = alpha_t * sample - sigma_t * model_output
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
" `v_prediction` for the DPMSolverMultistepScheduler."
)
if self.config.thresholding:
x0_pred = self._threshold_sample(x0_pred)
return x0_pred
# DPM-Solver needs to solve an integral of the noise prediction model.
elif self.config.algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
if self.config.prediction_type == "epsilon":
# DPM-Solver and DPM-Solver++ only need the "mean" output.
if self.config.variance_type in ["learned", "learned_range"]:
epsilon = model_output[:, :3]
else:
epsilon = model_output
elif self.config.prediction_type == "sample":
sigma = self.sigmas[self.step_index]
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
# Recover the noise from a clean-sample prediction.
epsilon = (sample - alpha_t * model_output) / sigma_t
elif self.config.prediction_type == "v_prediction":
sigma = self.sigmas[self.step_index]
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
epsilon = alpha_t * model_output + sigma_t * sample
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
" `v_prediction` for the DPMSolverMultistepScheduler."
)
if self.config.thresholding:
# Thresholding is defined on the clean-sample prediction, so convert epsilon -> x0, threshold,
# and convert back to epsilon.
sigma = self.sigmas[self.step_index]
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
x0_pred = (sample - sigma_t * epsilon) / alpha_t
x0_pred = self._threshold_sample(x0_pred)
epsilon = (sample - alpha_t * x0_pred) / sigma_t
return epsilon
# NOTE(review): an `algorithm_type` outside both lists falls through and returns None implicitly.
def dpm_solver_first_order_update(
    self,
    model_output: torch.Tensor,
    *args,
    sample: torch.Tensor = None,
    noise: Optional[torch.Tensor] = None,
    **kwargs,
) -> torch.Tensor:
    """
    One step for the first-order DPMSolver (equivalent to DDIM).

    Args:
        model_output (`torch.Tensor`):
            The converted model output for the current step.
        sample (`torch.Tensor`):
            A current instance of a sample created by the diffusion process. May also be passed as the third
            positional argument after the deprecated `timestep` and `prev_timestep`.
        noise (`torch.Tensor`, *optional*):
            Noise tensor; must be provided for the `sde-dpmsolver` and `sde-dpmsolver++` algorithm types.

    Returns:
        `torch.Tensor`:
            The sample tensor at the previous timestep.
    """
    # Deprecated positional / keyword arguments are still accepted, but only trigger a notice.
    timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
    prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
    if sample is None:
        if len(args) <= 2:
            raise ValueError(" missing `sample` as a required keyward argument")
        sample = args[2]
    if timestep is not None:
        deprecate(
            "timesteps",
            "1.0.0",
            "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
        )
    if prev_timestep is not None:
        deprecate(
            "prev_timestep",
            "1.0.0",
            "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
        )

    # "t" is the level being stepped to, "s" is the current level.
    raw_sigma_t = self.sigmas[self.step_index + 1]
    raw_sigma_s = self.sigmas[self.step_index]
    alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(raw_sigma_t)
    alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(raw_sigma_s)

    # lambda = log(alpha / sigma); h is the step size measured in lambda.
    lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
    lambda_s = torch.log(alpha_s) - torch.log(sigma_s)
    h = lambda_t - lambda_s

    algorithm = self.config.algorithm_type
    if algorithm == "dpmsolver++":
        sample_coef = sigma_t / sigma_s
        output_coef = alpha_t * (torch.exp(-h) - 1.0)
        x_t = sample_coef * sample - output_coef * model_output
    elif algorithm == "dpmsolver":
        sample_coef = alpha_t / alpha_s
        output_coef = sigma_t * (torch.exp(h) - 1.0)
        x_t = sample_coef * sample - output_coef * model_output
    elif algorithm == "sde-dpmsolver++":
        assert noise is not None
        sample_coef = sigma_t / sigma_s * torch.exp(-h)
        output_coef = alpha_t * (1 - torch.exp(-2.0 * h))
        noise_coef = sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h))
        x_t = sample_coef * sample + output_coef * model_output + noise_coef * noise
    elif algorithm == "sde-dpmsolver":
        assert noise is not None
        sample_coef = alpha_t / alpha_s
        output_coef = sigma_t * (torch.exp(h) - 1.0)
        noise_coef = sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0)
        x_t = sample_coef * sample - 2.0 * output_coef * model_output + noise_coef * noise
    return x_t
def multistep_dpm_solver_second_order_update(
self,
model_output_list: List[torch.Tensor],
*args,
sample: torch.Tensor = None,
noise: Optional[torch.Tensor] = None,
**kwargs,
) -> torch.Tensor:
"""
One step for the second-order multistep DPMSolver.
Args:
model_output_list (`List[torch.Tensor]`):
The direct outputs from learned diffusion model at current and latter timesteps. Only the last two
entries are read.
sample (`torch.Tensor`):
A current instance of a sample created by the diffusion process. May also be passed as the third
positional argument after the deprecated `timestep_list` and `prev_timestep`.
noise (`torch.Tensor`, *optional*):
Noise tensor; must be provided for the `sde-dpmsolver` and `sde-dpmsolver++` algorithm types.
Returns:
`torch.Tensor`:
The sample tensor at the previous timestep.
Raises:
`ValueError`: If `sample` is not provided.
"""
# Deprecated arguments are still accepted for backward compatibility but only trigger a notice.
timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None)
prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
if sample is None:
if len(args) > 2:
sample = args[2]
else:
raise ValueError(" missing `sample` as a required keyward argument")
if timestep_list is not None:
deprecate(
"timestep_list",
"1.0.0",
"Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
)
if prev_timestep is not None:
deprecate(
"prev_timestep",
"1.0.0",
"Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
)
# sigma_t: level being stepped to; sigma_s0: current level; sigma_s1: level of the preceding step.
sigma_t, sigma_s0, sigma_s1 = (
self.sigmas[self.step_index + 1],
self.sigmas[self.step_index],
self.sigmas[self.step_index - 1],
)
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
# lambda = log(alpha / sigma) at each of the three levels.
lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
# m0 is the most recent model output, m1 the one before it.
m0, m1 = model_output_list[-1], model_output_list[-2]
# h: size of the step being taken; h_0: size of the preceding step; r0: their ratio.
h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
r0 = h_0 / h
# D0: latest output; D1: difference of the last two outputs scaled by 1 / r0.
D0, D1 = m0, (1.0 / r0) * (m0 - m1)
# NOTE(review): x_t is only assigned for the algorithm_type / solver_type pairs handled below; any
# other combination reaches `return x_t` with the name unbound.
if self.config.algorithm_type == "dpmsolver++":
# See https://arxiv.org/abs/2211.01095 for detailed derivations
if self.config.solver_type == "midpoint":
x_t = (
(sigma_t / sigma_s0) * sample
- (alpha_t * (torch.exp(-h) - 1.0)) * D0
- 0.5 * (alpha_t * (torch.exp(-h) - 1.0)) * D1
)
elif self.config.solver_type == "heun":
x_t = (
(sigma_t / sigma_s0) * sample
- (alpha_t * (torch.exp(-h) - 1.0)) * D0
+ (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
)
elif self.config.algorithm_type == "dpmsolver":
# See https://arxiv.org/abs/2206.00927 for detailed derivations
if self.config.solver_type == "midpoint":
x_t = (
(alpha_t / alpha_s0) * sample
- (sigma_t * (torch.exp(h) - 1.0)) * D0
- 0.5 * (sigma_t * (torch.exp(h) - 1.0)) * D1
)
elif self.config.solver_type == "heun":
x_t = (
(alpha_t / alpha_s0) * sample
- (sigma_t * (torch.exp(h) - 1.0)) * D0
- (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
)
elif self.config.algorithm_type == "sde-dpmsolver++":
# Stochastic variant: the last term of each update adds the supplied noise.
assert noise is not None
if self.config.solver_type == "midpoint":
x_t = (
(sigma_t / sigma_s0 * torch.exp(-h)) * sample
+ (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+ 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1
+ sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
)
elif self.config.solver_type == "heun":
x_t = (
(sigma_t / sigma_s0 * torch.exp(-h)) * sample
+ (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
+ (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1
+ sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
)
elif self.config.algorithm_type == "sde-dpmsolver":
# Stochastic variant: the last term of each update adds the supplied noise.
assert noise is not None
if self.config.solver_type == "midpoint":
x_t = (
(alpha_t / alpha_s0) * sample
- 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0
- (sigma_t * (torch.exp(h) - 1.0)) * D1
+ sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
)
elif self.config.solver_type == "heun":
x_t = (
(alpha_t / alpha_s0) * sample
- 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0
- 2.0 * (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+ sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
)
return x_t
def multistep_dpm_solver_third_order_update(
self,
model_output_list: List[torch.Tensor],
*args,
sample: torch.Tensor = None,
**kwargs,
) -> torch.Tensor:
"""
One step for the third-order multistep DPMSolver.
Args:
model_output_list (`List[torch.Tensor]`):
The direct outputs from learned diffusion model at current and latter timesteps. Only the last three
entries are read.
sample (`torch.Tensor`):
A current instance of a sample created by diffusion process. May also be passed as the third
positional argument after the deprecated `timestep_list` and `prev_timestep`.
Returns:
`torch.Tensor`:
The sample tensor at the previous timestep.
Raises:
`ValueError`: If `sample` is not provided.
"""
# Deprecated arguments are still accepted for backward compatibility but only trigger a notice.
timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None)
prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
if sample is None:
if len(args) > 2:
sample = args[2]
else:
raise ValueError(" missing`sample` as a required keyward argument")
if timestep_list is not None:
deprecate(
"timestep_list",
"1.0.0",
"Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
)
if prev_timestep is not None:
deprecate(
"prev_timestep",
"1.0.0",
"Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
)
# sigma_t: level being stepped to; sigma_s0 / sigma_s1 / sigma_s2: current level and the two before it.
sigma_t, sigma_s0, sigma_s1, sigma_s2 = (
self.sigmas[self.step_index + 1],
self.sigmas[self.step_index],
self.sigmas[self.step_index - 1],
self.sigmas[self.step_index - 2],
)
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
alpha_s2, sigma_s2 = self._sigma_to_alpha_sigma_t(sigma_s2)
# lambda = log(alpha / sigma) at each of the four levels.
lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
lambda_s2 = torch.log(alpha_s2) - torch.log(sigma_s2)
# m0 is the most recent model output, m1 and m2 the two before it.
m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
# h: size of the step being taken; h_0, h_1: sizes of the two preceding steps; r0, r1: ratios to h.
h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
r0, r1 = h_0 / h, h_1 / h
D0 = m0
# Scaled first differences of consecutive outputs, combined into first- and second-order terms.
D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
# NOTE(review): only "dpmsolver++" and "dpmsolver" are handled here; for the SDE algorithm types (or any
# other value) x_t is never assigned and `return x_t` fails with the name unbound.
if self.config.algorithm_type == "dpmsolver++":
# See https://arxiv.org/abs/2206.00927 for detailed derivations
x_t = (
(sigma_t / sigma_s0) * sample
- (alpha_t * (torch.exp(-h) - 1.0)) * D0
+ (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
- (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
)
elif self.config.algorithm_type == "dpmsolver":
# See https://arxiv.org/abs/2206.00927 for detailed derivations
x_t = (
(alpha_t / alpha_s0) * sample
- (sigma_t * (torch.exp(h) - 1.0)) * D0
- (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
- (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2
)
return x_t
def index_for_timestep(self, timestep, schedule_timesteps=None):
    """
    Return the position of `timestep` inside a timestep schedule.

    Args:
        timestep: Timestep value to look up.
        schedule_timesteps (*optional*): Schedule to search; defaults to `self.timesteps`.

    Returns:
        `int`: Index of the match. If the value occurs more than once the second occurrence is returned; if it
        does not occur at all, `len(self.timesteps) - 1` is returned.
    """
    if schedule_timesteps is None:
        schedule_timesteps = self.timesteps

    matches = (schedule_timesteps == timestep).nonzero()
    match_count = len(matches)

    if match_count == 0:
        return len(self.timesteps) - 1

    # The sigma index that is taken for the **very** first `step`
    # is always the second index (or the last index if there is only 1)
    # This way we can ensure we don't accidentally skip a sigma in
    # case we start in the middle of the denoising schedule (e.g. for image-to-image)
    position = 1 if match_count > 1 else 0
    return matches[position].item()
def _init_step_index(self, timestep):
"""
Initialize the step_index counter for the scheduler.
"""
if self.begin_index is None:
if isinstance(timestep, torch.Tensor):
timestep = timestep.to(self.timesteps.device)
self._step_index = self.index_for_timestep(timestep)
else:
self._step_index = self._begin_index
def step(
self,
model_output: torch.Tensor,
timestep: int,
sample: torch.Tensor,
generator=None,
variance_noise: Optional[torch.Tensor] = None,
return_dict: bool = True,
) -> Union[SchedulerOutput, Tuple]:
"""
Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
the multistep DPMSolver.
Args:
model_output (`torch.Tensor`):
The direct output from learned diffusion model.
timestep (`int`):
The current discrete timestep in the diffusion chain. Only used to initialise the internal step
counter on the first call.
sample (`torch.Tensor`):
A current instance of a sample created by the diffusion process.
generator (`torch.Generator`, *optional*):
A random number generator.
variance_noise (`torch.Tensor`):
Alternative to generating noise with `generator` by directly providing the noise for the variance
itself. Useful for methods such as [`LEdits++`].
return_dict (`bool`):
Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
Returns:
[`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
tuple is returned where the first element is the sample tensor.
Raises:
`ValueError`: If `num_inference_steps` is `None` (i.e. `set_timesteps` has not been run).
"""
if self.num_inference_steps is None:
raise ValueError(
"Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
)
# Lazily initialise the step counter on the first call.
if self.step_index is None:
self._init_step_index(timestep)
# Improve numerical stability for small number of steps
# lower_order_final: force a first-order update on the last step.
lower_order_final = (self.step_index == len(self.timesteps) - 1) and (
self.config.euler_at_final
or (self.config.lower_order_final and len(self.timesteps) < 15)
or self.config.final_sigmas_type == "zero"
)
# lower_order_second: cap the second-to-last step at second order for short schedules.
lower_order_second = (
(self.step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15
)
model_output = self.convert_model_output(model_output, sample=sample)
# Shift the history of converted outputs left by one and append the newest at the end.
for i in range(self.config.solver_order - 1):
self.model_outputs[i] = self.model_outputs[i + 1]
self.model_outputs[-1] = model_output
# Upcast to avoid precision issues when computing prev_sample
sample = sample.to(torch.float32)
# Noise is only needed by the SDE algorithm types: draw it here unless the caller supplied it.
if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"] and variance_noise is None:
noise = randn_tensor(
model_output.shape, generator=generator, device=model_output.device, dtype=torch.float32
)
elif self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
noise = variance_noise.to(device=model_output.device, dtype=torch.float32)
else:
noise = None
# Pick the update order: limited by the configured solver order, by how many outputs have been
# accumulated so far (lower_order_nums), and by the lower-order flags computed above.
if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
prev_sample = self.dpm_solver_first_order_update(model_output, sample=sample, noise=noise)
elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
prev_sample = self.multistep_dpm_solver_second_order_update(self.model_outputs, sample=sample, noise=noise)
else:
# NOTE(review): `noise` is not forwarded to the third-order update.
prev_sample = self.multistep_dpm_solver_third_order_update(self.model_outputs, sample=sample)
if self.lower_order_nums < self.config.solver_order:
self.lower_order_nums += 1
# Cast sample back to expected dtype
prev_sample = prev_sample.to(model_output.dtype)
# upon completion increase step index by one
self._step_index += 1
if not return_dict:
return (prev_sample,)
return SchedulerOutput(prev_sample=prev_sample)
def add_noise(
    self,
    original_samples: torch.Tensor,
    noise: torch.Tensor,
    timesteps: torch.IntTensor,
) -> torch.Tensor:
    """
    Mix `noise` into `original_samples` at the given timesteps.

    Computes `alpha_t[timesteps] * original_samples + sigma_t[timesteps] * noise`, with the per-timestep
    coefficients given trailing singleton axes so they broadcast against the samples.
    """
    target = original_samples
    # Make sure the coefficient tables and timesteps live on the samples' device (and dtype for the tables).
    timesteps = timesteps.to(target.device)
    alpha_table = self.alpha_t.to(device=target.device, dtype=target.dtype)
    sigma_table = self.sigma_t.to(device=target.device, dtype=target.dtype)

    signal_scale = alpha_table[timesteps].flatten()
    for _ in range(len(target.shape) - len(signal_scale.shape)):
        signal_scale = signal_scale.unsqueeze(-1)

    noise_scale = sigma_table[timesteps].flatten()
    for _ in range(len(target.shape) - len(noise_scale.shape)):
        noise_scale = noise_scale.unsqueeze(-1)

    return signal_scale * original_samples + noise_scale * noise
def get_velocity(self, original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
    """
    Compute `alpha_t[timesteps] * noise - sigma_t[timesteps] * original_samples`.

    The per-timestep coefficients get trailing singleton axes so they broadcast against the samples.
    """
    target = original_samples
    timesteps = timesteps.to(target.device)
    alpha_table = self.alpha_t.to(device=target.device, dtype=target.dtype)
    sigma_table = self.sigma_t.to(device=target.device, dtype=target.dtype)

    noise_scale = alpha_table[timesteps].flatten()
    for _ in range(len(target.shape) - len(noise_scale.shape)):
        noise_scale = noise_scale.unsqueeze(-1)

    sample_scale = sigma_table[timesteps].flatten()
    for _ in range(len(target.shape) - len(sample_scale.shape)):
        sample_scale = sample_scale.unsqueeze(-1)

    return noise_scale * noise - sample_scale * original_samples
def __len__(self):
return self.config.num_train_timesteps
================================================
FILE: LatentLM/tokenizer_models/__init__.py
================================================
from .modeling_sigma_vae import sigma_vae
from .vae import AutoencoderKL
================================================
FILE: LatentLM/tokenizer_models/modeling_beit3_vision.py
================================================
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
import torch.nn as nn
from torchscale.architecture.encoder import Encoder
from torchscale.component.embedding import (
PositionalEmbedding,
VisionEmbedding,
)
from torchscale.architecture.config import EncoderConfig
class BEiT3Vision(nn.Module):
    """Vision-only BEiT-3 encoder: a patch embedding followed by a torchscale `Encoder`."""

    def __init__(self, args, **kwargs):
        """
        Args:
            args: Encoder configuration. Reads `img_size`, `patch_size`, `in_chans` and `encoder_embed_dim`;
                `multiway` and `share_encoder_input_output_embed` must both be falsy.
            **kwargs: Accepted but not used.
        """
        super().__init__()
        self.args = args
        assert not args.multiway
        assert not args.share_encoder_input_output_embed

        width = args.encoder_embed_dim
        self.vision_embed = VisionEmbedding(
            args.img_size,
            args.patch_size,
            args.in_chans,
            width,
            contain_mask_token=True,
            prepend_cls_token=True,
        )

        # being consistent with Fairseq, which starts from 2 for position embedding
        position_count = self.vision_embed.num_position_embeddings() + 2
        self.encoder = Encoder(
            args,
            embed_tokens=None,
            embed_positions=PositionalEmbedding(position_count, width),
            output_projection=None,
            is_encoder_decoder=False,
        )

    def forward(
        self,
        visual_tokens=None,
        vision_masked_position=None,
        return_patch_tokens=False,
    ):
        """
        Embed `visual_tokens` and run the encoder.

        Returns:
            The encoder output without its first token when `return_patch_tokens` is true, otherwise only the
            first token (the embedding is built with `prepend_cls_token=True`).
        """
        embedded = self.vision_embed(visual_tokens, vision_masked_position)
        outputs = self.encoder(
            src_tokens=None,
            encoder_padding_mask=None,
            token_embeddings=embedded,
        )
        hidden = outputs["encoder_out"]
        return hidden[:, 1:] if return_patch_tokens else hidden[:, 0]
def beit3_base_vision(image_size):
    """Build a `BEiT3Vision` encoder with the base-size settings (12 layers, width 768, 12 heads)."""
    width = 768
    settings = dict(
        img_size=image_size,
        patch_size=16,
        vocab_size=64010,
        multiway=False,
        layernorm_embedding=False,
        normalize_output=True,
        no_output_layer=True,
        drop_path_rate=0,
        encoder_embed_dim=width,
        encoder_attention_heads=12,
        encoder_ffn_embed_dim=int(width * 4),
        encoder_layers=12,
    )
    return BEiT3Vision(EncoderConfig(**settings))
================================================
FILE: LatentLM/tokenizer_models/modeling_common.py
================================================
import torch
from torch import nn
from timm.models.layers import trunc_normal_ as __call_trunc_normal_
from .modeling_utils import VisionTransformer
from .modeling_beit3_vision import beit3_base_vision
from functools import partial
def trunc_normal_(tensor, mean=0., std=1.):
    """Fill `tensor` in place via timm's `trunc_normal_`, truncating at `[-std, std]`."""
    lower, upper = -std, std
    __call_trunc_normal_(tensor, mean=mean, std=std, a=lower, b=upper)
class EncoderDecoderArchForImageReconstrction(nn.Module):
    """
    Main class for the encoder-decoder architecture used for image reconstruction.

    Contains an encoder backbone and a decoder backbone, plus three pluggable modules: one applied to the
    encoder output, one applied to the decoder input and one applied to the decoder output.
    """

    def __init__(
        self,
        encoder_config: dict,
        encoder_post_processor: nn.Module,
        decoder_pre_processor: nn.Module,
        decoder_config: dict,
        decoder_post_processor: nn.Module,
    ):
        """
        Args:
            encoder_config: Backbone settings; must contain `'arch'` and `'img_size'`.
            encoder_post_processor: Module applied to the encoder's patch tokens.
            decoder_pre_processor: Module applied to the latent before decoding.
            decoder_config: Backbone settings for the decoder; must contain `'arch'`.
            decoder_post_processor: Module applied to the decoder's patch tokens.
        """
        super().__init__()
        self.img_size = encoder_config['img_size']
        self.encoder = self.build_encoder(encoder_config)
        self.encoder_post_processor = encoder_post_processor
        self.decoder_pre_processor = decoder_pre_processor
        self.decoder = self.build_decoder(decoder_config)
        self.decoder_post_processor = decoder_post_processor

    def init_weights(self):
        """Re-initialise the three processor modules (the backbones are left untouched)."""
        if self.encoder_post_processor is not None:
            self.encoder_post_processor.apply(self._init_weights)
        if self.decoder_pre_processor is not None:
            self.decoder_pre_processor.apply(self._init_weights)
        if self.decoder_post_processor is not None:
            self.decoder_post_processor.apply(self._init_weights)

    def _init_weights(self, m):
        """Truncated-normal weights for Linear/Embedding, zero Linear bias, unit/zero LayerNorm."""
        if isinstance(m, nn.Linear) or isinstance(m, nn.Embedding):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @staticmethod
    def build_encoder(config):
        """
        Build the encoder backbone selected by `config['arch']`.

        Raises:
            KeyError: If `config` has no `'arch'` entry.
            ValueError: If the architecture is neither `vit*` nor `beit3-base`.
        """
        # Work on a shallow copy so the caller's dict keeps its 'arch' entry and can be reused.
        config = dict(config)
        backbone = config.pop('arch')
        if backbone.startswith('vit'):
            return VisionTransformer(**config)
        if backbone == 'beit3-base':
            return beit3_base_vision(image_size=config["img_size"])
        # Previously an unknown arch fell through to an UnboundLocalError on `module`.
        raise ValueError(f"Unknown encoder arch: {backbone}")

    @staticmethod
    def build_decoder(config):
        """Build the decoder backbone; every `'arch'` value maps to a `VisionTransformer`."""
        # Copy before dropping 'arch' (not a VisionTransformer argument) so the caller's dict is untouched.
        config = dict(config)
        config.pop('arch')
        return VisionTransformer(**config)

    def encode(self, img):
        """Run the encoder on `img` and post-process its patch tokens."""
        encoder_features = self.encoder(img, return_patch_tokens=True)
        return self.encoder_post_processor(encoder_features)

    def decode(self, quantize, **decoder_kwargs):
        """Pre-process `quantize`, run the decoder (forwarding `decoder_kwargs`) and post-process the result."""
        quantize = self.decoder_pre_processor(quantize)
        decoder_features = self.decoder(quantize, return_patch_tokens=True, **decoder_kwargs)
        return self.decoder_post_processor(decoder_features)
def get_model_default_params(
    embed_dim=768, depth=12, img_size=256,
    patch_size=16, in_chans=3, num_heads=12,
):
    """
    Return the default keyword arguments for a transformer backbone as a dict.

    The size-related entries come from the arguments; the remaining entries (MLP ratio, bias, dropout rates
    and the LayerNorm factory with `eps=1e-6`) are fixed.
    """
    layer_norm = partial(nn.LayerNorm, eps=1e-6)
    return {
        'img_size': img_size,
        'patch_size': patch_size,
        'in_chans': in_chans,
        'embed_dim': embed_dim,
        'depth': depth,
        'num_heads': num_heads,
        'mlp_ratio': 4.,
        'qkv_bias': True,
        'drop_rate': 0.,
        'attn_drop_rate': 0.,
        'drop_path_rate': 0.,
        'norm_layer': layer_norm,
    }
def get_basic_config(
    img_size=256, patch_size=16,
    encoder_arch='beit3-base', decoder_arch='vit-base',
    **kwargs,
):
    """
    Assemble encoder / decoder configuration dicts.

    Returns:
        A 2-tuple: a dict with keys `'encoder_config'`, `'decoder_config'` and `'patch_size'`, and the
        keyword arguments that were not consumed here.

    Raises:
        ValueError: If `encoder_arch` is not `'vit-base'` / `'beit3-base'`, or `decoder_arch` is not `'vit-base'`.
    """
    if encoder_arch not in ('vit-base', 'beit3-base'):
        raise ValueError(f"Unknown encoder arch: {encoder_arch}")
    encoder_config = get_model_default_params(
        embed_dim=768, depth=12, num_heads=12,
    )
    encoder_config.update(patch_size=patch_size, img_size=img_size, arch=encoder_arch)

    if decoder_arch != 'vit-base':
        raise ValueError(f"Unknown decoder arch: {decoder_arch}")
    # The decoder keeps the default img_size / patch_size here.
    decoder_config = get_model_default_params(
        embed_dim=768, depth=12, num_heads=12,
    )
    decoder_config['arch'] = decoder_arch

    assembled = {
        'encoder_config': encoder_config,
        'decoder_config': decoder_config,
        'patch_size': patch_size,
    }
    return assembled, kwargs
================================================
FILE: LatentLM/tokenizer_models/modeling_sigma_vae.py
================================================
import torch
from torch import nn
from torch.nn import functional as F
from timm.models.registry import register_model
from .modeling_common import EncoderDecoderArchForImageReconstrction, get_basic_config
class DecodeHeadBLC(nn.Module):
    """Two-layer head that turns per-patch decoder features into an image tensor."""

    def __init__(self, decoder_output_dim, patch_size, output_channels, patches_shape):
        """
        Args:
            decoder_output_dim: Feature width of the incoming tokens.
            patch_size: Side length of each square output patch.
            output_channels: Number of image channels produced.
            patches_shape: Sequence whose first two entries are the patch-grid height and width.
        """
        super().__init__()
        self.patch_size = patch_size
        self.output_channels = output_channels
        pixels_per_patch = patch_size * patch_size * output_channels
        self.fc1 = nn.Linear(decoder_output_dim, decoder_output_dim)
        self.act = nn.Tanh()
        self.fc2 = nn.Linear(decoder_output_dim, pixels_per_patch)
        self.patches_shape = patches_shape

    def forward(self, x):
        """Project tokens to pixels and reassemble the patches into `(batch, channels, height, width)`."""
        pixels = self.fc2(self.act(self.fc1(x)))

        batch = pixels.size(0)
        grid_h, grid_w = self.patches_shape[0], self.patches_shape[1]
        patch, chans = self.patch_size, self.output_channels

        # (batch, grid_h, grid_w, chans, patch, patch) -> (batch, chans, grid_h, patch, grid_w, patch)
        tiles = pixels.view(batch, grid_h, grid_w, chans, patch, patch)
        tiles = tiles.permute(0, 3, 1, 4, 2, 5)
        return tiles.reshape(batch, chans, grid_h * patch, grid_w * patch)
class GaussianDistribution(object):
    """Gaussian centred on the given tensor, with a configurable sampling scale."""

    def __init__(self, parameters, std):
        """
        Args:
            parameters: Tensor used as the mean (also stored as `self.parameters`).
            std: Scalar used to scale the randomly drawn per-sample standard deviation in `sample`.
        """
        self.parameters = parameters
        self.mean = parameters
        self.std = std

    def sample(self, sampling_std=None):
        """
        Draw a sample around the mean.

        With `sampling_std` given, it is used directly as the noise scale. Otherwise one scale per batch
        element is drawn from a normal distribution multiplied by `std / 0.8` and broadcast over the
        remaining axes.
        """
        device = self.parameters.device
        if sampling_std is not None:
            return self.mean + sampling_std * torch.randn(self.mean.shape).to(device=device)

        # The per-element scale is drawn first, then the noise itself.
        spread = self.std / 0.8
        per_item_std = torch.randn(self.mean.size(0)).to(device=device) * spread
        for _ in range(self.mean.dim() - per_item_std.dim()):
            per_item_std = per_item_std.unsqueeze(-1)
        return self.mean + per_item_std * torch.randn(self.mean.shape).to(device=device)

    def kl(self):
        """Mean squared value of the mean tensor (MSE against zeros)."""
        return F.mse_loss(self.mean, torch.zeros_like(self.mean), reduction='mean')

    def mode(self):
        """Return the mean tensor."""
        return self.mean
class EncodeHeadBLC(nn.Module):
    """Linear head that maps encoder tokens to a latent grid wrapped in a `GaussianDistribution`."""

    def __init__(self, output_dim, latent_size, patches_shape, std):
        """
        Args:
            output_dim: Feature width of the incoming tokens.
            latent_size: Number of latent channels produced per patch.
            patches_shape: Sequence whose first two entries are the patch-grid height and width.
            std: Passed through to `GaussianDistribution`.
        """
        super().__init__()
        self.dense = nn.Linear(output_dim, latent_size)
        self.patches_shape = patches_shape
        self.latent_size = latent_size
        self.std = std

    def forward(self, x):
        """Project tokens, lay them out as `(batch, latent_size, grid_h, grid_w)` and wrap the result."""
        batch = x.size(0)
        grid_h, grid_w = self.patches_shape[0], self.patches_shape[1]
        latent = self.dense(x).reshape(batch, grid_h, grid_w, self.latent_size)
        # Move the latent channels in front of the spatial axes.
        latent = latent.permute(0, 3, 1, 2)
        return GaussianDistribution(latent, self.std)
class SigmaVAE(EncoderDecoderArchForImageReconstrction):
    """SigmaVAE: encoder-decoder with a linear latent head and a pixel-reconstruction head."""

    def __init__(
        self,
        encoder_config: dict,
        decoder_config: dict,
        patch_size: int,
        latent_size: int = 16,
        kl_weight: float = 1e-2,
        std: float = 0.75,
    ):
        """
        Args:
            encoder_config: Encoder settings; reads `'img_size'`, `'embed_dim'` and `'in_chans'`.
            decoder_config: Decoder settings; reads `'embed_dim'`.
            patch_size: Patch side length used to derive the patch grid.
            latent_size: Number of latent channels per patch.
            kl_weight: Stored as `self.kl_weight`.
            std: Forwarded to the encoder head.
        """
        grid = encoder_config['img_size'] // patch_size
        patches_shape = (grid, grid, latent_size)
        self.num_patches = grid ** 2

        latent_head = EncodeHeadBLC(
            encoder_config['embed_dim'], latent_size,
            patches_shape, std=std
        )
        passthrough = nn.Identity()
        pixel_head = DecodeHeadBLC(
            decoder_config['embed_dim'], patch_size, encoder_config['in_chans'], patches_shape)

        super().__init__(
            encoder_config=encoder_config,
            encoder_post_processor=latent_head,
            decoder_pre_processor=passthrough,
            decoder_config=decoder_config,
            decoder_post_processor=pixel_head,
        )
        self.kl_weight = kl_weight
        self.init_weights()
@register_model
def sigma_vae(latent_size, std, **kwargs):
    """
    Build a `SigmaVAE` model.

    Args:
        latent_size: Number of latent channels per patch; also the decoder's input channel count.
        std: Sampling scale forwarded to the model.
        **kwargs: Forwarded to `get_basic_config` (e.g. `img_size`, `patch_size`, `encoder_arch`,
            `decoder_arch`). Anything it does not consume is reported as unused.

    Returns:
        The constructed `SigmaVAE`.
    """
    basic_config, unused_kwargs = get_basic_config(**kwargs)
    decoder_config = basic_config.pop('decoder_config')
    decoder_config['patch_size'] = 1
    # if decoder is vit arch, adjust the image size to be the size of the latent space
    # without modification for the vit implementation
    # Read the resolved values from the config instead of `kwargs`, so that callers relying on the
    # defaults of `get_basic_config` no longer hit a KeyError.
    img_size = basic_config['encoder_config']['img_size']
    patch_size = basic_config['patch_size']
    decoder_config['img_size'] = img_size // patch_size
    decoder_config['in_chans'] = latent_size
    print("Unused args = %s" % str(unused_kwargs))
    model = SigmaVAE(
        latent_size=latent_size, std=std,
        decoder_config=decoder_config, **basic_config)
    return model
================================================
FILE: LatentLM/tokenizer_models/modeling_utils.py
================================================
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, Mlp, PatchEmbed, \
trunc_normal_ as __call_trunc_normal_
def trunc_normal_(tensor, mean=0., std=1.):
    """Fill `tensor` in place via timm's `trunc_normal_`, truncating at `[-std, std]`."""
    lower, upper = -std, std
    __call_trunc_normal_(tensor, mean=mean, std=std, a=lower, b=upper)
class Attention(nn.Module):
    """Multi-head self-attention with a fused QKV projection and no bias on the keys."""

    def __init__(
            self, dim, num_heads=8, qkv_bias=False,
            attn_drop=0., proj_drop=0.
    ):
        """
        Args:
            dim: Token feature width; must be divisible by `num_heads`.
            num_heads: Number of attention heads.
            qkv_bias: If true, learn biases for the query and value projections (never for the keys).
            attn_drop: Dropout probability on the attention weights (training only).
            proj_drop: Dropout probability after the output projection.
        """
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5

        # Disable bias for k: https://github.com/microsoft/unilm/issues/510
        self.qkv = nn.Linear(dim, dim * 3, bias=False)
        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(dim))
            self.v_bias = nn.Parameter(torch.zeros(dim))
        else:
            self.q_bias = None
            self.v_bias = None
        self.qk_float = False

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, is_causal=False, attn_mask=None):
        """
        Args:
            x: Tensor of shape `(B, N, C)`.
            is_causal: Forwarded to `scaled_dot_product_attention`.
            attn_mask: Forwarded to `scaled_dot_product_attention`.

        Returns:
            Tensor of shape `(B, N, C)`.
        """
        B, N, C = x.shape

        qkv_bias = None
        if self.q_bias is not None:
            # Zero block in the middle stands in for the (disabled) key bias.
            qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple) (B, H, N, C)

        # The functional attention applies dropout whenever dropout_p > 0, independent of the module's
        # train/eval state, so it has to be switched off explicitly outside of training.
        x = F.scaled_dot_product_attention(
            q, k, v,
            attn_mask=attn_mask,
            is_causal=is_causal,
            dropout_p=self.attn_drop.p if self.training else 0.0,
        )
        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP sub-layers, each with a residual connection."""

    def __init__(
            self,
            dim,
            num_heads,
            mlp_ratio=4.,
            qkv_bias=False,
            drop=0.,
            attn_drop=0.,
            drop_path=0.,
            act_layer=nn.GELU,
            norm_layer=nn.LayerNorm,
    ):
        """
        Args:
            dim: Token feature width.
            num_heads: Number of attention heads.
            mlp_ratio: Hidden width of the MLP as a multiple of `dim`.
            qkv_bias: Forwarded to `Attention`.
            drop: Dropout probability used for the attention output projection and inside the MLP.
            attn_drop: Dropout probability on the attention weights.
            drop_path: Stochastic-depth rate; `0` disables it.
            act_layer: Activation class used by the MLP.
            norm_layer: Normalisation layer factory, called with `dim`.
        """
        super().__init__()

        def make_drop_path():
            # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
            if drop_path > 0.:
                return DropPath(drop_path)
            return nn.Identity()

        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias,
            attn_drop=attn_drop, proj_drop=drop,
        )
        self.drop_path1 = make_drop_path()

        self.norm2 = norm_layer(dim)
        hidden_width = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=hidden_width, act_layer=act_layer, drop=drop)
        self.drop_path2 = make_drop_path()

    def forward(self, x, attn_mask=None, is_causal=False):
        """Apply the attention and MLP sub-layers, each added back onto its input."""
        attended = self.attn(self.norm1(x), attn_mask=attn_mask, is_causal=is_causal)
        x = x + self.drop_path1(attended)
        transformed = self.mlp(self.norm2(x))
        return x + self.drop_path2(transformed)
class VisionTransformer(nn.Module):
    """ Vision Transformer with support for patch or hybrid CNN input stage

    The model embeds an image into patch tokens, optionally prepends a learned
    class token, adds a learned positional embedding, runs a stack of ``Block``
    layers and applies a final normalisation. No classification head is built
    here; ``forward`` returns the normalised token sequence.
    """

    def __init__(
            self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, depth=12,
            num_heads=12, mlp_ratio=4., qkv_bias=False, drop_rate=0., attn_drop_rate=0.,
            drop_path_rate=0., norm_layer=nn.LayerNorm, use_checkpoint=False, use_cls=True,
    ):
        super().__init__()
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches
        self.num_heads = num_heads
        if use_cls:
            self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        else:
            self.cls_token = None
        # Sequence length seen by the blocks: patches plus the optional class token.
        self.decode_tokens = num_patches + (1 if use_cls else 0)
        self.pos_embed = nn.Parameter(torch.zeros(1, self.decode_tokens, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)
        self.use_checkpoint = use_checkpoint
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer
            ) for i in range(depth)])
        self.fc_norm = norm_layer(embed_dim)
        trunc_normal_(self.pos_embed, std=.02)
        if use_cls:
            trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)
        self.fix_init_weight()
        self.num_patches = num_patches

    def fix_init_weight(self):
        """Divide each block's output projections by sqrt(2 * (layer index + 1))."""
        def rescale(param, layer_id):
            param.div_(math.sqrt(2.0 * layer_id))

        for layer_id, layer in enumerate(self.blocks):
            rescale(layer.attn.proj.weight.data, layer_id + 1)
            rescale(layer.mlp.fc2.weight.data, layer_id + 1)

    def _init_weights(self, m):
        """Truncated-normal init for Linear weights; constants for biases and LayerNorm."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward_features(self, x, return_patch_tokens=False, **kwargs):
        """Encode ``x`` into a normalised token sequence.

        Args:
            x: input passed to ``self.patch_embed``.
            return_patch_tokens: when true, return only the patch tokens, i.e.
                strip the class token if (and only if) the model has one.
            **kwargs: accepted for call compatibility and ignored.

        Returns:
            Tensor of tokens after the final norm. The class token, when
            present, is at index 0 unless ``return_patch_tokens`` is set.
        """
        x = self.patch_embed(x)
        if self.cls_token is not None:
            batch_size = x.size(0)
            cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
            x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embed
        x = self.pos_drop(x)
        for blk in self.blocks:
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)
        x = self.fc_norm(x)
        # Only drop index 0 when it really is a class token; with use_cls=False
        # every position is a patch token and nothing must be removed.
        if return_patch_tokens and self.cls_token is not None:
            return x[:, 1:]
        return x

    def forward(self, x, **kwargs):
        """Alias for ``forward_features``; keyword arguments are passed through."""
        x = self.forward_features(x, **kwargs)
        return x
================================================
FILE: LatentLM/tokenizer_models/vae.py
================================================
# Adapted from LDM's KL-VAE: https://github.com/CompVis/latent-diffusion
import torch
import torch.nn as nn
import numpy as np
def nonlinearity(x):
    """Swish activation: the input multiplied element-wise by its sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate
def Normalize(in_channels, num_groups=32):
    """Return an affine GroupNorm over ``in_channels`` channels with eps=1e-6."""
    norm_layer = torch.nn.GroupNorm(num_groups, in_channels, eps=1e-6, affine=True)
    return norm_layer
class Upsample(nn.Module):
    """Nearest-neighbour 2x spatial upsampling with an optional 3x3 convolution."""

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        # The convolution is only created (and registered) when requested.
        if with_conv:
            self.conv = torch.nn.Conv2d(
                in_channels, in_channels, kernel_size=3, stride=1, padding=1
            )

    def forward(self, x):
        """Double the spatial size of ``x``; apply the conv if one was built."""
        upsampled = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
        if not self.with_conv:
            return upsampled
        return self.conv(upsampled)
class Downsample(nn.Module):
    """2x spatial downsampling via a strided 3x3 convolution or average pooling."""

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if with_conv:
            # no asymmetric padding in torch conv, must do it ourselves
            self.conv = torch.nn.Conv2d(
                in_channels, in_channels, kernel_size=3, stride=2, padding=0
            )

    def forward(self, x):
        """Halve the spatial size of ``x``."""
        if not self.with_conv:
            return torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
        # Zero-pad one pixel on the right and bottom only, then apply the
        # unpadded stride-2 convolution.
        padded = torch.nn.functional.pad(x, (0, 1, 0, 1), mode="constant", value=0)
        return self.conv(padded)
class ResnetBlock(nn.Module):
    """Residual block: two (norm -> swish -> 3x3 conv) stages plus a skip path.

    When the input and output channel counts differ, the skip path is projected
    with either a 3x3 conv (``conv_shortcut=True``) or a 1x1 conv.
    """

    def __init__(
        self,
        *,
        in_channels,
        out_channels=None,
        conv_shortcut=False,
        dropout,
        temb_channels=512,
    ):
        super().__init__()
        if out_channels is None:
            out_channels = in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        # First stage.
        self.norm1 = Normalize(in_channels)
        self.conv1 = torch.nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=1, padding=1
        )
        # The timestep-embedding projection only exists when temb_channels > 0.
        if temb_channels > 0:
            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)

        # Second stage.
        self.norm2 = Normalize(out_channels)
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = torch.nn.Conv2d(
            out_channels, out_channels, kernel_size=3, stride=1, padding=1
        )

        # Skip-path projection, needed only when the channel count changes.
        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                self.conv_shortcut = torch.nn.Conv2d(
                    in_channels, out_channels, kernel_size=3, stride=1, padding=1
                )
            else:
                self.nin_shortcut = torch.nn.Conv2d(
                    in_channels, out_channels, kernel_size=1, stride=1, padding=0
                )

    def forward(self, x, temb):
        """Return ``skip(x) + residual(x)``; ``temb`` may be ``None``."""
        h = self.conv1(nonlinearity(self.norm1(x)))
        if temb is not None:
            # Broadcast the projected embedding over the two spatial dims.
            h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
        h = self.conv2(self.dropout(nonlinearity(self.norm2(h))))

        if self.in_channels == self.out_channels:
            return x + h
        shortcut = self.conv_shortcut if self.use_conv_shortcut else self.nin_shortcut
        return shortcut(x) + h
class AttnBlock(nn.Module):
    """Single-head self-attention over spatial positions with a residual connection."""

    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels
        self.norm = Normalize(in_channels)

        def pointwise():
            # 1x1 convolution that keeps the channel count.
            return torch.nn.Conv2d(
                in_channels, in_channels, kernel_size=1, stride=1, padding=0
            )

        self.q = pointwise()
        self.k = pointwise()
        self.v = pointwise()
        self.proj_out = pointwise()

    def forward(self, x):
        """Attend across the h*w positions of ``x`` and add the result to ``x``."""
        normed = self.norm(x)
        query = self.q(normed)
        key = self.k(normed)
        value = self.v(normed)

        # compute attention
        b, c, h, w = query.shape
        n_pos = h * w
        query = query.reshape(b, c, n_pos).permute(0, 2, 1)  # b,hw,c
        key = key.reshape(b, c, n_pos)  # b,c,hw
        scores = torch.bmm(query, key)  # b,hw,hw: scores[b,i,j] = sum_c query[b,i,c] key[b,c,j]
        scores = scores * (int(c) ** (-0.5))
        weights = torch.nn.functional.softmax(scores, dim=2)

        # attend to values
        value = value.reshape(b, c, n_pos)
        weights = weights.permute(0, 2, 1)  # b,hw,hw (first hw of k, second of q)
        attended = torch.bmm(value, weights)  # b,c,hw: attended[b,c,j] = sum_i value[b,c,i] weights[b,i,j]
        attended = self.proj_out(attended.reshape(b, c, h, w))
        return x + attended
class Encoder(nn.Module):
    """Convolutional encoder built from ResnetBlock / AttnBlock / Downsample stages.

    The network is: input conv -> ``len(ch_mult)`` resolution levels (each with
    ``num_res_blocks`` residual blocks, optionally followed by attention, and a
    downsample on every level except the last) -> middle (res, attn, res) ->
    norm, swish and an output conv producing ``2 * z_channels`` channels when
    ``double_z`` is set, otherwise ``z_channels``.

    ``out_ch`` and any extra keyword arguments are accepted but not used by
    this constructor.
    """

    def __init__(
        self,
        *,
        ch=128,
        out_ch=3,
        ch_mult=(1, 1, 2, 2, 4),
        num_res_blocks=2,
        attn_resolutions=(16,),
        dropout=0.0,
        resamp_with_conv=True,
        in_channels=3,
        resolution=256,
        z_channels=16,
        double_z=True,
        **ignore_kwargs,
    ):
        super().__init__()
        self.ch = ch
        # temb_ch is 0, so every ResnetBlock is built without a timestep projection.
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        # downsampling
        self.conv_in = torch.nn.Conv2d(
            in_channels, self.ch, kernel_size=3, stride=1, padding=1
        )
        # curr_res tracks the nominal spatial size at each level; it is only
        # used to decide where attention blocks are inserted.
        curr_res = resolution
        # Prepending 1 makes in_ch_mult[i] the multiplier of the level feeding level i.
        in_ch_mult = (1,) + tuple(ch_mult)
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = ch * in_ch_mult[i_level]
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks):
                block.append(
                    ResnetBlock(
                        in_channels=block_in,
                        out_channels=block_out,
                        temb_channels=self.temb_ch,
                        dropout=dropout,
                    )
                )
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn.append(AttnBlock(block_in))
            # A bare nn.Module is used as a named container for this level.
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level != self.num_resolutions - 1:
                down.downsample = Downsample(block_in, resamp_with_conv)
                curr_res = curr_res // 2
            self.down.append(down)
        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(
            in_channels=block_in,
            out_channels=block_in,
            temb_channels=self.temb_ch,
            dropout=dropout,
        )
        self.mid.attn_1 = AttnBlock(block_in)
        self.mid.block_2 = ResnetBlock(
            in_channels=block_in,
            out_channels=block_in,
            temb_channels=self.temb_ch,
            dropout=dropout,
        )
        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(
            block_in,
            2 * z_channels if double_z else z_channels,
            kernel_size=3,
            stride=1,
            padding=1,
        )

    def forward(self, x):
        """Encode ``x`` and return the output of ``conv_out``."""
        # assert x.shape[2] == x.shape[3] == self.resolution, "{}, {}, {}".format(x.shape[2], x.shape[3], self.resolution)
        # timestep embedding
        temb = None
        # downsampling
        # hs accumulates every intermediate activation; only hs[-1] is read back.
        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks):
                h = self.down[i_level].block[i_block](hs[-1], temb)
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
                hs.append(h)
            if i_level != self.num_resolutions - 1:
                hs.append(self.down[i_level].downsample(hs[-1]))
        # middle
        h = hs[-1]
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)
        # end
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h
class Decoder(nn.Module):
    """Convolutional decoder mirroring ``Encoder``.

    The network is: input conv from ``z_channels`` -> middle (res, attn, res)
    -> ``len(ch_mult)`` resolution levels walked from the last to the first
    (each with ``num_res_blocks + 1`` residual blocks, optional attention, and
    an upsample on every level except level 0) -> norm, swish and an output
    conv to ``out_ch`` channels.

    Constructing a Decoder prints the latent shape to stdout. ``in_channels``
    is stored but not otherwise used here; extra keyword arguments are ignored.
    """

    def __init__(
        self,
        *,
        ch=128,
        out_ch=3,
        ch_mult=(1, 1, 2, 2, 4),
        num_res_blocks=2,
        attn_resolutions=(),
        dropout=0.0,
        resamp_with_conv=True,
        in_channels=3,
        resolution=256,
        z_channels=16,
        give_pre_end=False,
        **ignore_kwargs,
    ):
        super().__init__()
        self.ch = ch
        # temb_ch is 0, so every ResnetBlock is built without a timestep projection.
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.give_pre_end = give_pre_end
        # compute in_ch_mult, block_in and curr_res at lowest res
        # NOTE(review): in_ch_mult is computed but never read in this class.
        in_ch_mult = (1,) + tuple(ch_mult)
        block_in = ch * ch_mult[self.num_resolutions - 1]
        curr_res = resolution // 2 ** (self.num_resolutions - 1)
        self.z_shape = (1, z_channels, curr_res, curr_res)
        print(
            "Working with z of shape {} = {} dimensions.".format(
                self.z_shape, np.prod(self.z_shape)
            )
        )
        # z to block_in
        self.conv_in = torch.nn.Conv2d(
            z_channels, block_in, kernel_size=3, stride=1, padding=1
        )
        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(
            in_channels=block_in,
            out_channels=block_in,
            temb_channels=self.temb_ch,
            dropout=dropout,
        )
        self.mid.attn_1 = AttnBlock(block_in)
        self.mid.block_2 = ResnetBlock(
            in_channels=block_in,
            out_channels=block_in,
            temb_channels=self.temb_ch,
            dropout=dropout,
        )
        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks + 1):
                block.append(
                    ResnetBlock(
                        in_channels=block_in,
                        out_channels=block_out,
                        temb_channels=self.temb_ch,
                        dropout=dropout,
                    )
                )
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn.append(AttnBlock(block_in))
            # A bare nn.Module is used as a named container for this level.
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                up.upsample = Upsample(block_in, resamp_with_conv)
                curr_res = curr_res * 2
            self.up.insert(0, up)  # prepend to get consistent order
        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(
            block_in, out_ch, kernel_size=3, stride=1, padding=1
        )

    def forward(self, z):
        """Decode latent ``z``.

        Returns the output of ``conv_out``, or the pre-normalisation feature
        map when ``give_pre_end`` is set. Also records ``z.shape`` on the
        module as ``self.last_z_shape``.
        """
        # assert z.shape[1:] == self.z_shape[1:]
        self.last_z_shape = z.shape
        # timestep embedding
        temb = None
        # z to block_in
        h = self.conv_in(z)
        # middle
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)
        # upsampling
        # self.up is stored in ascending level order, so walk it in reverse.
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.num_res_blocks + 1):
                h = self.up[i_level].block[i_block](h, temb)
                if len(self.up[i_level].attn) > 0:
                    h = self.up[i_level].attn[i_block](h)
            if i_level != 0:
                h = self.up[i_level].upsample(h)
        # end
        if self.give_pre_end:
            return h
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h
class DiagonalGaussianDistribution(object):
    """Diagonal Gaussian parameterised by a tensor holding mean and log-variance.

    ``parameters`` is split in two halves along dim 1: the first half is the
    mean, the second the log-variance (clamped to [-30, 20]). With
    ``deterministic=True`` the standard deviation and variance are zero and
    ``kl`` / ``nll`` return a single zero.
    """

    def __init__(self, parameters, deterministic=False):
        self.parameters = parameters
        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            # zeros_like already matches the device and dtype of the mean.
            self.var = self.std = torch.zeros_like(self.mean)

    def _zero(self):
        """One-element zero tensor on the same device and dtype as the mean.

        Used for the deterministic case so the result can be combined with
        other losses without a device mismatch.
        """
        return self.mean.new_zeros(1)

    def sample(self):
        """Draw ``mean + std * eps`` with ``eps`` standard normal noise."""
        x = self.mean + self.std * torch.randn(self.mean.shape).to(
            device=self.parameters.device
        )
        return x

    def kl(self, other=None):
        """KL divergence to ``other``, or to a standard normal when ``other`` is None.

        The sum runs over dims 1, 2 and 3, giving one value per batch element.
        """
        if self.deterministic:
            return self._zero()
        if other is None:
            return 0.5 * torch.sum(
                torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
                dim=[1, 2, 3],
            )
        return 0.5 * torch.sum(
            torch.pow(self.mean - other.mean, 2) / other.var
            + self.var / other.var
            - 1.0
            - self.logvar
            + other.logvar,
            dim=[1, 2, 3],
        )

    def nll(self, sample, dims=(1, 2, 3)):
        """Negative log-likelihood of ``sample``, summed over ``dims``."""
        if self.deterministic:
            return self._zero()
        logtwopi = np.log(2.0 * np.pi)
        return 0.5 * torch.sum(
            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
            dim=dims,
        )

    def mode(self):
        """Return the mean, which is the mode of a Gaussian."""
        return self.mean
class AutoencoderKL(nn.Module):
    """KL autoencoder: Encoder -> 1x1 ``quant_conv`` -> latent -> 1x1
    ``post_quant_conv`` -> Decoder.

    With ``use_variational=False`` the ``quant_conv`` emits only ``embed_dim``
    channels and ``encode`` appends a tensor of ones as the second half of the
    moments.
    """

    def __init__(self, embed_dim, ch_mult, use_variational=True, ckpt_path=None):
        super().__init__()
        self.embed_dim = embed_dim
        self.use_variational = use_variational
        self.encoder = Encoder(ch_mult=ch_mult, z_channels=embed_dim)
        self.decoder = Decoder(ch_mult=ch_mult, z_channels=embed_dim)
        # The encoder emits 2 * embed_dim channels (its double_z default).
        moment_channels = (2 if self.use_variational else 1) * embed_dim
        self.quant_conv = torch.nn.Conv2d(2 * embed_dim, moment_channels, 1)
        self.post_quant_conv = torch.nn.Conv2d(embed_dim, embed_dim, 1)
        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path)

    def init_from_ckpt(self, path):
        """Load weights from the ``"model"`` entry of the checkpoint at ``path``.

        Loading is non-strict; missing and unexpected keys are printed.
        """
        # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
        checkpoint = torch.load(path, map_location="cpu")
        load_result = self.load_state_dict(checkpoint["model"], strict=False)
        print("Loading pre-trained KL-VAE")
        print("Missing keys:")
        print(load_result.missing_keys)
        print("Unexpected keys:")
        print(load_result.unexpected_keys)
        print(f"Restored from {path}")

    def encode(self, x):
        """Encode ``x`` and return the posterior as a DiagonalGaussianDistribution."""
        moments = self.quant_conv(self.encoder(x))
        if not self.use_variational:
            moments = torch.cat((moments, torch.ones_like(moments)), 1)
        return DiagonalGaussianDistribution(moments)

    def decode(self, z):
        """Decode latent ``z`` through ``post_quant_conv`` and the decoder."""
        return self.decoder(self.post_quant_conv(z))

    def forward(self, inputs, disable=True, train=True, optimizer_idx=0):
        """Dispatch to ``training_step`` or ``validation_step``.

        NOTE(review): neither method is defined in this class as shown, so this
        presumably relies on a subclass or mixin providing them - confirm.
        """
        if not train:
            return self.validation_step(inputs, disable)
        return self.training_step(inputs, disable, optimizer_idx)
================================================
FILE: LatentLM/train_hf.py
================================================
import argparse
import functools
import logging
import math
import os
from datetime import timedelta
import datasets
import torch
import torch.nn.functional as F
import torch.distributed as dist
from accelerate import Accelerator, InitProcessGroupKwargs
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, set_seed
from datasets import load_dataset
from torchvision import transforms
from torchvision.datasets import ImageFolder
import diffusers
from diffusers.training_utils import compute_snr
from diffusers.optimization import get_scheduler
from models import All_models, DiT, Transformer, EMAModel
from timm.models import create_model
from utils import center_crop_arr, safe_blob_write, load_vae
from schedule.ddpm import DDPMScheduler
import wandb
logger = get_logger(__name__, log_level="INFO")
def parse_args():
    """Parse the command-line arguments of the training script.

    After parsing, ``args.local_rank`` is overwritten by the ``LOCAL_RANK``
    environment variable when that variable is set and differs.

    Returns:
        The populated ``argparse.Namespace``.

    Raises:
        ValueError: if both ``dataset_name`` and ``train_data_dir`` are None.
    """
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    # Basic arguments
    parser.add_argument("--seed", type=int, default=0, help="A seed to use for the random number generator. Can be negative to not set a seed.")
    parser.add_argument("--output_dir", type=str, default="results", help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--cache_dir", type=str, default="/mnt/msranlp/yutao/cache", help="The directory where the downloaded models and datasets will be stored.")
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    # Dataset arguments
    parser.add_argument("--dataset_name", type=str, default=None, help="The name of the Dataset (from the HuggingFace hub) to train on.")
    parser.add_argument("--dataset_config_name", type=str, default=None, help="The config of the Dataset, leave as None if there's only one config.")
    parser.add_argument("--train_data_dir", type=str, default="/tmp/ILSVRC/Data/CLS-LOC/train", help="A folder containing the training data.")
    # Model arguments
    parser.add_argument("--model", type=str, default="Transformer-L", help="The config of the UNet model to train.")
    parser.add_argument("--vae", type=str, default=None, help="Path to pre-trained VAE model.")
    parser.add_argument("--image_size", type=int, default=256, help="The image_size for input images.")
    parser.add_argument("--num_classes", type=int, default=1000, help="Number of classes for the model.")
    parser.add_argument("--dropout", type=float, default=0.0, help="Dropout probability.")
    # Training arguments
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size (per device) for the training dataloader.")
    parser.add_argument("--num_epochs", type=int, default=100, help="Number of epochs to train for.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--dataloader_num_workers", type=int, default=2, help="The number of subprocesses to use for data loading.")
    # Optimizer arguments
    parser.add_argument("--learning_rate", type=float, default=1e-4, help="Initial learning rate (after the potential warmup period) to use.")
    parser.add_argument("--lr_scheduler", type=str, default="cosine", help="The scheduler type to use.")
    parser.add_argument("--lr_warmup_steps", type=int, default=100, help="Number of steps for the warmup in the lr scheduler.")
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
    parser.add_argument("--adam_beta2", type=float, default=0.98, help="The beta2 parameter for the Adam optimizer.")
    parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="Weight decay magnitude for the Adam optimizer.")
    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer.")
    # EMA arguments
    parser.add_argument("--use_ema", action="store_true", help="Whether to use Exponential Moving Average for the final model weights.")
    parser.add_argument("--ema_inv_gamma", type=float, default=1.0, help="The inverse gamma value for the EMA decay.")
    parser.add_argument("--ema_power", type=float, default=3 / 4, help="The power value for the EMA decay.")
    parser.add_argument("--ema_max_decay", type=float, default=0.9999, help="The maximum decay magnitude for EMA.")
    # Logging arguments
    parser.add_argument("--logger", type=str, default=None, help="The logger type to use.")
    parser.add_argument("--logging_dir", type=str, default="logs", help="The directory to store logs.")
    parser.add_argument("--wandb_project", type=str, default=None, help="The wandb project name.")
    parser.add_argument("--wandb_entity", type=str, default=None, help="The wandb entity (username or team).")
    parser.add_argument("--log_every", type=int, default=100, help="Log every X steps.")
    # Distributed training arguments
    parser.add_argument("--mixed_precision", type=str, default="no", choices=["no", "fp16", "bf16"], help="Whether to use mixed precision.")
    # DDPM arguments
    parser.add_argument("--prediction_type", type=str, default="epsilon", help="Whether the model should predict the 'epsilon'/noise error or directly the reconstructed image 'x0'.")
    parser.add_argument("--ddpm_num_steps", type=int, default=1000, help="The number of steps to use for DDPM.")
    parser.add_argument("--ddpm_num_inference_steps", type=int, default=1000, help="The number of inference steps to use for DDPM.")
    parser.add_argument("--ddpm_beta_schedule", type=str, default="cosine", help="The beta schedule to use for DDPM.")
    parser.add_argument("--ddpm_batch_mul", type=int, default=4, help="The batch multiplier to use for DDPM.")
    parser.add_argument("--checkpointing_steps", type=int, default=5000, help="Save a checkpoint of the training state every X updates.")
    parser.add_argument("--checkpoint", type=str, default=None, help="Resume training from a previous checkpoint.")
    args = parser.parse_args()
    # The LOCAL_RANK environment variable, when set, wins over --local_rank.
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank
    # NOTE(review): --train_data_dir has a non-None default, so this check
    # cannot fire unless that default is changed.
    if args.dataset_name is None and args.train_data_dir is None:
        raise ValueError("You must specify either a dataset name from the hub or a train data directory.")
    return args
def main(args):
    """Train a latent diffusion model (Transformer or DiT) with Accelerate.

    The function builds the VAE and the model, sets up the optimizer, EMA,
    noise scheduler, data pipeline and LR schedule, optionally resumes from a
    checkpoint (either ``args.checkpoint`` or the path stored in
    ``<output_dir>/latest``), then runs the training loop with periodic
    logging and checkpointing.

    Args:
        args: the namespace returned by ``parse_args``.
    """
    set_seed(args.seed)
    logging_dir = os.path.join(args.output_dir, args.logging_dir)
    vae, input_size, latent_size, flatten_input = load_vae(args.vae, args.image_size)
    model = All_models[args.model](
        input_size=input_size,
        in_channels=latent_size,
        num_classes=args.num_classes,
        flatten_input=flatten_input,
        drop=args.dropout,
    )
    # dtype the model inputs are cast to, matching the mixed-precision mode.
    if args.mixed_precision == "bf16":
        dtype = torch.bfloat16
    elif args.mixed_precision == "fp16":
        dtype = torch.float16
    else:
        dtype = torch.float32
    # Create EMA for the model.
    if args.use_ema:
        ema_model = EMAModel(
            model.parameters(),
            decay=args.ema_max_decay,
            min_decay=args.ema_max_decay,
            use_ema_warmup=True,
            inv_gamma=args.ema_inv_gamma,
            power=args.ema_power,
        )
    # Initialize the scheduler
    noise_scheduler = DDPMScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule, prediction_type=args.prediction_type)
    # Initialize the optimizer
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=args.learning_rate,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        eps=args.adam_epsilon,
    )
    # Initialize the accelerator
    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
    kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=7200))  # a big number for high image_size or big dataset
    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
        log_with=args.logger,
        project_config=accelerator_project_config,
        kwargs_handlers=[kwargs],
    )
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state, main_process_only=False)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        diffusers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        diffusers.utils.logging.set_verbosity_error()
    logger.info(args)
    if accelerator.is_main_process:
        if args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
        if args.wandb_project is not None:
            wandb.init(project=args.wandb_project, entity=args.wandb_entity, config=args)
    logger.info(model)
    logger.info(f"Model Parameters: {sum(p.numel() for p in model.parameters()):,}")
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    augmentations = transforms.Compose([
        transforms.Lambda(lambda pil_image: center_crop_arr(pil_image, args.image_size)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
    ])
    if args.dataset_name is not None:
        dataset = load_dataset(
            args.dataset_name,
            args.dataset_config_name,
            cache_dir=args.cache_dir,
            split="train",
        )

        # NOTE(review): this transform yields {"input": ...} dicts, while the
        # training loop below unpacks (image, label) pairs - confirm that the
        # hub-dataset path is actually supported.
        def transform_images(examples):
            images = [augmentations(image.convert("RGB")) for image in examples["image"]]
            return {"input": images}

        dataset.set_transform(transform_images)
    else:
        dataset = ImageFolder(args.train_data_dir, transform=augmentations)
    train_dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.dataloader_num_workers
    )
    # Initialize the learning rate scheduler
    lr_scheduler = get_scheduler(
        args.lr_scheduler,
        optimizer=optimizer,
        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
        num_training_steps=(len(train_dataloader) * args.num_epochs // args.gradient_accumulation_steps),
    )
    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, lr_scheduler
    )
    # The prepared model may or may not be wrapped (e.g. no DDP wrapper in a
    # single-process run), so resolve the underlying module once for the
    # architecture checks instead of assuming a `.module` attribute exists.
    base_model = accelerator.unwrap_model(model)
    # vae = accelerator.prepare_model(vae, evaluation_mode=True, device_placement=True)
    vae.to(accelerator.device)
    vae.eval()
    if args.use_ema:
        ema_model.to(accelerator.device)
    # We need to initialize the trackers we use, and also store our configuration.
    # The trackers initializes automatically on the main process.
    if accelerator.is_main_process:
        run = os.path.split(__file__)[-1].split(".")[0]
        accelerator.init_trackers(run)
    total_batch_size = args.batch_size * accelerator.num_processes * args.gradient_accumulation_steps
    max_train_steps = len(train_dataloader) * args.num_epochs // args.gradient_accumulation_steps
    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len(dataset)}")
    logger.info(f" Num Epochs = {args.num_epochs}")
    logger.info(f" Instantaneous batch size per device = {args.batch_size}")
    logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Total optimization steps = {max_train_steps}")
    global_step = 0
    running_loss = 0
    first_epoch = 0
    resume_step = 0
    scaling_factor = None
    bias_factor = None
    # Potentially load in the weights and states from a previous save
    checkpoint_path = args.checkpoint
    if checkpoint_path is None and os.path.exists(os.path.join(args.output_dir, "latest")):
        with open(os.path.join(args.output_dir, "latest"), "r") as f:
            checkpoint_path = f.read().strip()
    if checkpoint_path is not None:
        accelerator.print(f"Resuming from checkpoint {checkpoint_path}")
        accelerator.load_state(checkpoint_path)
        other_state = torch.load(os.path.join(checkpoint_path, "other_state.pth"), map_location="cpu")
        global_step = other_state["steps"]
        scaling_factor = other_state["scaling_factor"]
        bias_factor = other_state["bias_factor"]
        if args.use_ema:
            ema_model.load_state_dict(other_state["ema"])
            logger.info("EMA model loaded successfully")
        first_epoch = global_step * args.gradient_accumulation_steps // len(train_dataloader)
        resume_step = global_step * args.gradient_accumulation_steps % len(train_dataloader)
    # Train!
    # snr = compute_snr(noise_scheduler, torch.arange(args.ddpm_num_steps, device=accelerator.device))
    # sample_weight = (
    #     torch.stack([snr, 5 * torch.ones(args.ddpm_num_steps, device=accelerator.device)], dim=1).min(dim=1)[0] / snr
    # )
    # Uniform weights: timesteps are drawn uniformly by torch.multinomial below.
    sample_weight = torch.ones(args.ddpm_num_steps, device=accelerator.device)
    for epoch in range(first_epoch, args.num_epochs):
        model.train()
        for step, (clean_images, label) in enumerate(train_dataloader):
            # Skip steps until we reach the resumed step. Keyed on the resolved
            # checkpoint path so that auto-resume from the "latest" file skips
            # the already-consumed batches exactly like an explicit --checkpoint.
            if checkpoint_path is not None and epoch == first_epoch and step < resume_step:
                continue
            with torch.no_grad():
                vae_latent = vae.encode(clean_images)
                clean_images = vae_latent.sample()
                mode_images = vae_latent.mode()
                if scaling_factor is None:
                    # Latent statistics are estimated once, from the first
                    # batch, and averaged across processes when distributed.
                    scaling_factor = 1. / clean_images.flatten().std()
                    bias_factor = -clean_images.flatten().mean()
                    if dist.is_available() and dist.is_initialized():
                        dist.all_reduce(scaling_factor, op=dist.ReduceOp.SUM)
                        dist.all_reduce(bias_factor, op=dist.ReduceOp.SUM)
                        world_size = dist.get_world_size()
                    else:
                        # No process group (single-process run): nothing to reduce.
                        world_size = 1
                    scaling_factor = scaling_factor.item() / world_size
                    bias_factor = bias_factor.item() / world_size
                    logger.info(f"Scaling factor: {scaling_factor}, Bias factor: {bias_factor}")
                clean_images = (clean_images + bias_factor) * scaling_factor
                mode_images = (mode_images + bias_factor) * scaling_factor
            with accelerator.accumulate(model):
                bsz, latent_size, h, w = clean_images.shape
                if isinstance(base_model, Transformer):
                    # One noise vector and one timestep per latent position,
                    # repeated ddpm_batch_mul times per image.
                    noise = torch.randn((bsz * args.ddpm_batch_mul * h * w, latent_size), device=clean_images.device, dtype=clean_images.dtype)
                    timesteps = torch.multinomial(sample_weight, bsz * args.ddpm_batch_mul * h * w, replacement=True)
                    clean_images_repeated = clean_images.repeat_interleave(args.ddpm_batch_mul, dim=0).permute(0, 2, 3, 1).reshape(-1, clean_images.shape[1])
                    noisy_images = noise_scheduler.add_noise(clean_images_repeated, noise, timesteps)
                    velocity = noise_scheduler.get_velocity(clean_images_repeated, noise, timesteps)
                    noisy_images, noise, velocity = [x.reshape(bsz * args.ddpm_batch_mul, h, w, latent_size).permute(0, 3, 1, 2) for x in [noisy_images, noise, velocity]]
                    timesteps = timesteps.reshape(bsz * args.ddpm_batch_mul, h * w)
                    model_output = model(noisy_images.to(dtype), timesteps.to(dtype), x_start=clean_images.to(dtype), y=label, batch_mul=args.ddpm_batch_mul)
                elif isinstance(base_model, DiT):
                    noise = torch.randn_like(clean_images)
                    timesteps = torch.multinomial(sample_weight, bsz, replacement=True)
                    noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)
                    velocity = noise_scheduler.get_velocity(clean_images, noise, timesteps)
                    model_output = model(noisy_images.to(dtype), timesteps.to(dtype), y=label)
                else:
                    raise NotImplementedError()
                if args.prediction_type == "epsilon":
                    loss = F.mse_loss(model_output.float(), noise.float())
                elif args.prediction_type == "v_prediction":
                    loss = F.mse_loss(model_output.float(), velocity.float())
                else:
                    raise NotImplementedError()
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    gnorm = accelerator.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
            running_loss += loss.item()
            # sync_gradients is true only on the micro-step that completed an
            # optimizer update.
            if accelerator.sync_gradients:
                global_step += 1
                if args.use_ema:
                    ema_model.step(model.parameters())
                if global_step % args.log_every == 0:
                    avg_loss = running_loss / args.log_every / args.gradient_accumulation_steps
                    running_loss = 0
                    logs = {"loss": avg_loss, "lr": lr_scheduler.get_last_lr()[0], "step": global_step, "gnorm": gnorm.item(), "batch size": total_batch_size, "epoch": epoch}
                    if args.use_ema:
                        logs["ema_decay"] = ema_model.cur_decay_value
                    logger.info(logs)
                    accelerator.log(logs, step=global_step)
                    if accelerator.is_main_process and args.wandb_project is not None:
                        wandb.log(logs, step=global_step)
                if global_step % args.checkpointing_steps == 0:
                    def save_checkpoint(path):
                        # save_state runs on every process; the extra state
                        # file is written by the main process only.
                        accelerator.save_state(path)
                        if accelerator.is_main_process:
                            other_state = {
                                "scaling_factor": scaling_factor,
                                "bias_factor": bias_factor,
                                "steps": global_step,
                                "ema": ema_model.state_dict() if args.use_ema else None,
                            }
                            torch.save(other_state, os.path.join(path, "other_state.pth"))

                    save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
                    save_checkpoint(os.path.join(save_path))
                    if accelerator.is_main_process:
                        # Record the newest checkpoint for automatic resume.
                        safe_blob_write(os.path.join(args.output_dir, "latest"), save_path)
                    logger.info(f"Saved state to {save_path}")
    accelerator.end_training()
# Script entry point: parse the command line, then run training.
if __name__ == "__main__":
    args = parse_args()
    main(args)
================================================
FILE: LatentLM/utils.py
================================================
from PIL import Image
import numpy as np
import json
from collections import OrderedDict
import torch
import torch.distributed as dist
import logging
import os
import requests
from tqdm import tqdm
from tokenizer_models import AutoencoderKL, sigma_vae
#################################################################################
# Training Helper Functions #
#################################################################################
@torch.no_grad()
def update_ema(ema_model, model, decay=0.9999):
    """
    Move every EMA parameter towards its counterpart in ``model``, in place:
    ``ema = decay * ema + (1 - decay) * param``. Parameters are matched by name.
    """
    ema_lookup = OrderedDict(ema_model.named_parameters())
    blend = 1 - decay
    for param_name, live_param in model.named_parameters():
        # TODO: Consider applying only to params that require_grad to avoid small numerical changes of pos_embed
        target = ema_lookup[param_name]
        target.mul_(decay).add_(live_param.data, alpha=blend)
def requires_grad(model, flag=True):
    """
    Enable (``flag=True``) or disable gradient tracking on every parameter of
    ``model``.
    """
    for parameter in model.parameters():
        parameter.requires_grad = flag
def cleanup():
    """
    End DDP training by destroying the torch.distributed process group.
    """
    dist.destroy_process_group()
def create_logger(logging_dir):
    """
    Build the logger used by a distributed run.

    On rank 0, logging is configured at INFO level with a stream handler and a
    file handler writing to ``{logging_dir}/log.txt``. On every other rank the
    returned logger only has a ``NullHandler`` attached.
    """
    if dist.get_rank() != 0:
        # Non-zero ranks: hand back a logger with just a NullHandler.
        silent = logging.getLogger(__name__)
        silent.addHandler(logging.NullHandler())
        return silent

    # Rank 0: the real logger (console stream + log file).
    sinks = [logging.StreamHandler(), logging.FileHandler(f"{logging_dir}/log.txt")]
    logging.basicConfig(
        level=logging.INFO,
        format='[\033[34m%(asctime)s\033[0m] %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        handlers=sinks,
    )
    return logging.getLogger(__name__)
def center_crop_arr(pil_image, image_size):
    """
    Resize a PIL image and cut out its central ``image_size`` x ``image_size`` square.

    Center cropping implementation from ADM:
    https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
    """
    # Halve with a box filter for as long as the shorter side is at least twice the target.
    while min(pil_image.size) >= 2 * image_size:
        halved = tuple(side // 2 for side in pil_image.size)
        pil_image = pil_image.resize(halved, resample=Image.BOX)

    # Bicubic resize so that the shorter side matches image_size.
    ratio = image_size / min(pil_image.size)
    scaled = tuple(round(side * ratio) for side in pil_image.size)
    pil_image = pil_image.resize(scaled, resample=Image.BICUBIC)

    # Slice the centered window out of the pixel array.
    pixels = np.array(pil_image)
    top = (pixels.shape[0] - image_size) // 2
    left = (pixels.shape[1] - image_size) // 2
    window = pixels[top: top + image_size, left: left + image_size]
    return Image.fromarray(window)
def download_pretrained_vae(overwrite=False, download_path="/mnt/unilm/yutao/vae.ckpt"):
    """
    Download the pretrained KL-16 VAE checkpoint unless it is already on disk.

    Args:
        overwrite: re-download even when ``download_path`` already exists.
        download_path: destination file; defaults to the location that used to
            be hard-coded, so existing callers are unaffected.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (previously the error page would have been saved as the checkpoint).
    """
    if os.path.exists(download_path) and not overwrite:
        return

    # NOTE(review): the wget user agent is presumably needed so that Dropbox
    # serves the raw file instead of a preview page -- confirm.
    headers = {'user-agent': 'Wget/1.16 (linux-gnu)'}
    url = "https://www.dropbox.com/scl/fi/hhmuvaiacrarfg28qxhwz/kl16.ckpt?rlkey=l44xipsezc8atcffdp4q7mwmh&dl=0"

    # Stream into a side file first: an interrupted download must not leave a
    # truncated file at download_path, because the existence check above would
    # treat it as a finished checkpoint on the next call.
    partial_path = download_path + ".part"
    with requests.get(url, stream=True, headers=headers, timeout=60) as r:
        r.raise_for_status()
        print("Downloading KL-16 VAE...")
        with open(partial_path, 'wb') as f:
            for chunk in tqdm(r.iter_content(chunk_size=1024*1024), unit="MB", total=254):
                if chunk:
                    f.write(chunk)
    # Publish the finished file in one step.
    os.replace(partial_path, download_path)
def safe_blob_write(fn, text):
    """
    Best-effort write of ``text`` to the file ``fn``, replacing any existing file.

    Ordinary failures (missing directory, permission problems, ...) are reported
    on stdout and swallowed so that a failed write never aborts the caller.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately NOT swallowed.

    Args:
        fn: destination path.
        text: string content to write.
    """
    try:
        # The existing file is removed before it is recreated.
        # NOTE(review): presumably a workaround for in-place overwrite on the
        # blob-backed mount -- confirm.
        if os.path.exists(fn):
            os.remove(fn)
        with open(fn, "w") as f:
            f.write(text)
    except Exception:
        # `except Exception` (not a bare except) lets Ctrl-C / sys.exit through.
        print('Failed to write blob:', fn, text)
def safe_blob_dump(fn, result):
    """
    Best-effort JSON dump of ``result`` to the file ``fn``, replacing any existing file.

    ``result`` is serialized before the existing file is touched, so an object
    that cannot be encoded as JSON leaves the previous file intact instead of
    deleting it and leaving a truncated one behind. Ordinary failures are
    reported on stdout and swallowed; ``KeyboardInterrupt`` and ``SystemExit``
    are deliberately NOT swallowed.

    Args:
        fn: destination path.
        result: JSON-serializable object.
    """
    try:
        # Serialize up front: a failure here must not cost us the old file.
        payload = json.dumps(result)
        # The existing file is removed before it is recreated.
        # NOTE(review): presumably a workaround for in-place overwrite on the
        # blob-backed mount -- confirm.
        if os.path.exists(fn):
            os.remove(fn)
        with open(fn, "w") as f:
            f.write(payload)
    except Exception:
        # `except Exception` (not a bare except) lets Ctrl-C / sys.exit through.
        print('Failed to write blob:', fn, result)
# Build a VAE from the checkpoint at `vae_model_path`.
# Returns the tuple (vae, input_size, latent_size, flatten_input).
# The checkpoint is loaded onto the CPU; whether it contains a "config" entry
# decides which model class is instantiated.
def load_vae(vae_model_path, image_size):
data = torch.load(vae_model_path, map_location="cpu")
# Checkpoint without a "config" entry: AutoencoderKL with fixed settings,
# which is handed the checkpoint path directly.
if "config" not in data:
input_size = image_size // 16
latent_size = 16
flatten_input = False
vae = AutoencoderKL(embed_dim=16, ch_mult=(1, 1, 2, 2, 4), ckpt_path=vae_model_path)
# Checkpoint with a "config" entry: sigma_vae built from that config, with
# sizes derived from its "patch_size" and "latent_size" values.
else:
model_config = data["config"]
input_size = image_size // model_config["patch_size"]
latent_size = model_config["latent_size"]
flatten_input = False
vae = sigma_vae(**model_config)
# Load the weights stored under the checkpoint's "model" key.
vae.load_state_dict(data["model"])
return vae, input_size, latent_size, flatten_input
================================================
FILE: NOTICE.md
================================================
NOTICES AND INFORMATION
Do Not Translate or Localize
This software incorporates material from third parties. Microsoft makes certain
open source code available at http://3rdpartysource.microsoft.com, or you may
send a check or money order for US $5.00, including the product name, the open
source component name, and version number, to:
Source Code Compliance Team
Microsoft Corporation
One Microsoft Way
Redmond, WA 98052
USA
Notwithstanding any other terms, you may reverse engineer this software to the
extent required to debug changes to any libraries licensed under the GNU Lesser
General Public License.
===============================================================================
Component.
huggingface/transformers
Open Source License/Copyright Notice.
```
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
```
================================================
FILE: PFPO/README.md
================================================
# Preference Optimization for Reasoning with Pseudo Feedback
This repo contains the source code for **Preference Optimization for Reasoning with Pseudo Feedback** (ICLR 2025).
We introduce a novel approach to generate pseudo feedback for reasoning tasks by framing the labeling of solutions to reasoning problems as an evaluation against
associated *test cases*. We explore two forms of pseudo feedback based on test cases: one generated by frontier LLMs and the other by extending self-consistency
to multi-test-case. We conduct experiments on both mathematical reasoning and coding tasks using pseudo feedback for preference optimization, and observe
improvements across both tasks. Specifically, using Mathstral-7B as our base model, we improve Mathstral-7B on MATH from 58.3 to 68.6, surpassing both `NuminaMath-72B` and `GPT-4-Turbo-1106-preview`. Building on Deepseek-coder-7B-v1.5, we achieve a score of 24.6 on LiveCodeBench (from
21.1), surpassing `Claude-3-Haiku`.
## Summary of Main Experimental Results
#### Mathematical Reasoning
| Model | MATH | GSM8K | College Math |
|----------------------------------------------------------------------|---------------|---------------|---------------|
| GPT-4o-2024-0512 | 78.7 | 95.8 | 46.7 |
| GPT-4-Turbo-2024-0409 | 72.8 | 94.8 | 44.2 |
| GPT-4-Turbo-1106-preview | 64.3 | --- | --- |
| GPT-4-0613 | 55.0 | 93.5 | 39.0 |
| NuminaMath-72B-CoT | 67.1 | 91.7 | 39.8 |
| Llama-3.1-8B-Instruct | 47.5 | 84.5 | 27.5 |
| Llama-3.1-70B-Instruct | 68.1 | 95.5 | 41.8 |
| Llama-3.1-8B-base | 20.3 (4-shot) | 56.7 (8-shot) | 20.1 (4-shot) |
| w/ SFT | 53.8 | 85.1 | 34.6 |
| w/ PFPO-LLM Iter. 0 | 55.0 | 86.6 | 35.8 |
| w/ PFPO-Self Iter. 1 | 55.9 | 87.6 | 36.6 |
| w/ PFPO-Self Iter. 2 | 56.6 | 88.9 | 37.0 |
| w/ PFPO-Self Iter. 3 | 57.0 | 88.8 | 36.7 |
| w/ PFPO-Self Iter. 4 | 57.4 | 89.1 | 37.6 |
| w/ PFPO-Self Iter. 5 | **57.8** | **89.6** | **38.0** |
| Mathstral-7B-v0.1 | 58.3 | 85.6 | 34.3 |
| w/ SFT | 61.4 | 87.3 | 38.4 |
| w/ PFPO-LLM Iter. 0 | 66.7 | 90.0 | 41.3 |
| w/ PFPO-Self Iter. 1 | 67.8 | **90.8** | 42.0 |
| w/ PFPO-Self Iter. 2 | **68.6** | 90.3 | 42.2 |
| w/ PFPO-Self Iter. 3 | 68.2 | 90.4 | **42.3** |
#### Coding - LiveCodeBench
| Model | Overall | Easy | Medium | Hard |
|---------------------------------------------------------------------------------------------|----------|----------|---------|---------|
| Claude-3.5-Sonnet | 51.3 | 87.2 | 45.3 | 11.0 |
| Claude-3-Sonnet | 26.9 | 67.2 | 7.3 | 1.4 |
| Claude-3-Haiku | 24.0 | 61.3 | 5.5 | 0.9 |
| GPT-3.5-Turbo-0125 | 24.0 | 55.0 | 11.6 | 0.3 |
| Llama-3.1-70B-Instruct | 31.8 | 67.9 | 17.3 | 4.1 |
| Llama-3-70B-Instruct | 27.4 | 59.4 | 15.6 | 1.3 |
| CodeQwen1.5-7B-Chat | 16.8 | 35.9 | 10.9 | 0.3 |
| DeepSeekCoder-V2-236B | 41.9 | 79.9 | 32.0 | 4.9 |
| Deepseek-Coder-33B-Instruct | 23.4 | 56.1 | 8.6 | 0.9 |
| Deepseek-coder-7B-v1.5-Instruct | 21.1 | 51.3 | 7.4 | 0.2 |
| w/ SFT (APPs) | 22.9 | 53.0 | 10.6 | 0.2 |
| w/ DPO (APPs) | 22.9 | 53.7 | 9.4 | 1.0 |
| w/ pDPO (APPs) | 22.9 | 55.0 | 8.1 | 1.3 |
| w/ PFPO-LLM Iter. 0 (APPs) | 24.0 | 56.8 | **9.3** | 1.4 |
| w/ PFPO-Self Iter. 1 (APPs & M.C.) | 24.2 | 57.8 | 8.5 | **1.7** |
| w/ PFPO-Self Iter. 2 (APPs & M.C. & xCode.) | **24.6** | **58.7** | 9.1 | 1.5 |
| w/ PFPO-Self Iter. 0 (APPs) | 23.4 | 54.2 | 10.3 | 0.7 |
| w/ PFPO-Self Iter. 1 (APPs & M.C.) | 23.7 | 55.8 | 9.5 | 1.1 |
| w/ PFPO-Self Iter. 2 (APPs & M.C. & xCode) | **24.3** | **56.8** | **9.8** | **1.6** |
Coding - APPs (click to expand)
| Model | Overall | Introductory | Interview | Competition |
|---------------------------------------------------------------------------------------------|----------|--------------|-----------|-------------|
| GPT-4-0613 | 35.1 | 61.8 | 34.4 | 10.6 |
| GPT-4o-2024-0513 | 34.0 | 56.6 | 32.2 | 16.7 |
| Llama-3.1-8B-Instruct | 11.5 | 29.4 | 8.5 | 2.7 |
| Llama-3.1-70B-Instruct | 24.9 | 51.8 | 21.3 | 9.1 |
| Codestral-22B-V0.1 | 20.3 | 45.2 | 16.9 | 5.8 |
| CodeQwen1.5-7B-chat | 8.6 | 24.1 | 16.8 | 2.0 |
| Qwen2.5-Coder-7B-Instruct | 15.7 | 37.3 | 12.3 | 4.1 |
| Deepseek-coder-33B-Instruct | 18.4 | 44.2 | 14.5 | 4.4 |
| Deepseek-coder-v1.5-Instruct | 14.3 | 35.7 | 10.8 | 3.2 |
| w/ SFT (APPs) | 15.4 | 37.8 | 11.6 | 4.1 |
| w/ DPO (APPs) | 16.3 | 36.2 | 13.3 | 5.3 |
| w/ pDPO (APPs) | 16.9 | 37.3 | 13.8 | 6.1 |
| w/ PFPO-LLM Iter. 0 (APPs) | 17.9 | 38.3 | 14.7 | 7.1 |
| w/ PFPO-Self Iter. 1 (APPs & M.C.) | 18.9 | **40.8** | 15.5 | **7.5** |
| w/ PFPO-Self Iter. 2 (APPs & M.C. & xCode.) | **19.1** | 39.6 | **16.1** | 7.4 |
| w/ PFPO-Self Iter. 0 (APPs) | 17.4 | 37.5 | 14.8 | 5.4 |
| w/ PFPO-Self Iter. 1 (APPs & M.C.) | 18.0 | 39.2 | 14.9 | 6.2 |
| w/ PFPO-Self Iter. 2 (APPs & M.C. & xCode.) | **19.1** | **40.9** | **15.9** | **6.9** |
Coding - HumanEval & MBPP (click to expand)
| Model | HumanEval | MBPP |
|---------------------------------------------------------------------------------------------------------------------|-----------|----------|
| GPT-4-0613 | 87.8 | 82.1 |
| GPT-4o-2024-0513 | 93.3 | 87.2 |
| Llama-3.1-8B-Instruct | 72.6 | 71.2 |
| Llama-3.1-70B-Instruct | 80.5 | 83.3 |
| Codestral-22B-V0.1 | 81.1 | 78.2 |
| CodeQwen1.5-7B-chat | 85.6 | 80.5 |
| Qwen2.5-Coder-7B-Instruct | 85.4 | 86.0 |
| Deepseek-coder-33B-Instruct | 77.4 | 79.0 |
| Deepseek-coder-v1.5-Instruct | 75.6 | 73.9 |
| w/ SFT (APPs) | 72.0 | 72.8 |
| w/ DPO (APPs) | 74.4 | 74.3 |
| w/ pDPO (APPs) | 73.8 | 73.2 |
| w/ PFPO-LLM Iter. 0 (APPs) | 73.8 | **75.9** |
| w/ PFPO-Self Iter. 1 (APPs & M.C.) | 76.8 | 73.9 |
| w/ PFPO-Self Iter. 2 (APPs & M.C. & xCode.) | **81.7** | 72.4 |
| w/ PFPO-Self Iter. 0 (APPs) | 73.2 | 75.1 |
| w/ PFPO-Self Iter. 1 (APPs & M.C.) | **79.3** | **75.5** |
| w/ PFPO-Self Iter. 2 (APPs & M.C. & xCode.) | 73.8 | 75.1 |
## Install Dependencies
Most dependencies are listed in `requirements.txt`.
Besides, you need to install flash-attention by yourself.
We also provide a Docker image for running the experiments. You can pull the image by running:
```bash
docker pull jiaofangkai/normal:torch-2.5.1-vllm-0.6.4.post1-eval-1206
```
## Instruction to Run the Experiments
### Math (Taking Mathstral as Example)
#### SFT on MathScale
First, please prepare your own SFT data or download our released MathScale-4o (to be released soon). The file is a single JSON file containing a list, where each
item has several keys: `question`, `box_solution`, and `id`, denoting the question, the CoT solution with `\\boxed{}`, and the item index.
After that, run the following command:
```bash
torchrun --nnodes 2 --nproc_per_node 8 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT trainer_base_ds_mul_fs_tp.py -cp conf/exp/mathscale/mistral/sft/ -cn mathstral-mathscale4o-sft-v2.0-v100
```
The above command should be run on two 8xV100 nodes. For fewer nodes or fewer GPU resources, please change the gradient accumulation steps in the configuration
file accordingly.
In order to disable tensor parallel, please refer to
the [section](https://github.com/SparkJiao/pseudo-feedback?tab=readme-ov-file#enable-tensor-parallel-based-on-fairscale) below and set the `tp_size` to 1.
#### DPO using Ground-truth Feedback (Teacher Feedback)
**Run Inference**
Run the following command for inference using vLLM:
```bash
python vllm_inference.py test_file=${test_file} output_dir=${output_dir} eval_sub_path=${eval_sub_path} \
# Can keep the default values in the config file
sampling_params.n=8 sampling_params.temperature=1.0 sampling_params.top_p=0.9 split_size=1 split_id=0 \
-cp conf/api/vllm/mathscale/ -cn 4o_mathstral_train_0shot_v1_0
```
where `test_file` indicates the data file for inference, `output_dir` is the directory of your checkpoint, and `eval_sub_path` is sub-path of the checkpoint,
e.g., `checkpoint-100`. The data file is also a json file, which contains a list of items, where each item should have `question`, `id` and `label`.
**Construct Preference Pairs**
Run the following command:
```bash
python scripts/math_scale/construct_prefer_pair.py --input_file $input_file_glob_path --output_file $output_file_path
```
The input file path supports glob pattern, and the output file path is the file to save the constructed preference pairs.
**Run DPO Training**
```bash
torchrun --nnodes 1 --nproc_per_node 8 trainer_base_ds_mul_fs_tp.py -cp conf/exp/mathscale/mistral/dpo/ -cn mathstral-dpo-4o-iter0-v1.1-a100
```
The above config is set on single 8xA100-80G node. Remember to set `train_file` as your saved preference pair file, and `sft_model_dir` as the directory of the
SFT model checkpoint.
#### pDPO using Ground-truth Feedback (Teacher Feedback)
Following full trajectory sampling, we first need to sample some trajectory prefixes for completion and evaluation:
```bash
python scripts/math/deepseek_math_sample_steps.py --input_file $input_file --output_file $output_file \
--upper_step_ratio 0.7 --sample_ratio 0.3 --filter_all_same --sample_over_p 10
```
The `input_file` sets the full trajectory output data, and the `output_file` is the file to save the sampled prefixes. The `upper_step_ratio` indicates that we
avoid sampling steps at the last `1-upper_step_ratio` * 100 percent steps, and the `sample_ratio` is the ratio of sampled prefixes. The `sample_over_p` is the
number of sampled prefixes for each problem. `--filter_all_same` indicates that we avoid sampling prefixes from the problems where all predictions are the same.
**Run Completion Inference for Trajectory Prefixes**
```bash
python vllm_inference.py test_file=${test_file} output_dir=${output_dir} eval_sub_path=${eval_sub_path} \
# Can keep the default values in the config file
sampling_params.n=3 sampling_params.temperature=1.0 sampling_params.top_p=0.9 split_size=1 split_id=0 \
-cp conf/api/vllm/mathscale/ -cn 4o_mathstral_train_0shot_v1_0_completion
```
where `test_file` indicates the saved prefix file in the last step.
**Construct Prefix-Preference Pair**
```bash
python scripts/math_scale/construct_process_rm_sample_gd.py --input_file $prefix_completion_file --output_file $output_file --num_workers 128
```
**Run pDPO Training**
```bash
torchrun --nnodes 48 --nproc_per_node 8 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT trainer_base_ds_mul_fs_tp.py \
-cp conf/exp/mathscale/mistral/dpo/ -cn mathstral-pdpo-4o-iter0-v2.2-V100
```
The above experiment runs on 48 8xV100 nodes with `tp_size=8`. Please adjust `per_gpu_train_batch_size`, `gradient_accumulation_steps`, and `tp_size`
according to your resources.
#### DPO using Self-Generated Feedback
The overall workflow stays the same as with ground-truth feedback, and thus we only need to change the scripts for each step.
**Construct Preference Pairs**
```bash
python scripts/math_scale/construct_prefer_pair_sc.py --input_file $full_trajectory_data --output_file $output_file --top_p $confidence_threshold
```
**Construct Prefix-Preference Pairs**
```bash
python scripts/math_scale/construct_process_rm_sample_sc.py \
--input_file $prefix_completion_file --output_file $output_file --response_file_for_sc $full_trajectory_data --response_id_field id --num_workers 128
```
For specified experimental configs, you can refer to
the [section](https://github.com/SparkJiao/pseudo-feedback?tab=readme-ov-file#configuration-of-all-experiments) below.
### Code
#### SFT on APPs
We use a special format to collect SFT data from GPT-4o, and you can refer to the prompt template here:
```bash
python scripts/apps/pp_solution_gen_inputs.py
```
Afterwards, we need to run the generated solutions on the annotated test cases for filtering:
```bash
python scripts/apps/solution_fail_extract.py --completion_file $completion_file --output_file $output_file --num_workers 16
```
Finally, we could conduct SFT training on this dataset:
```bash
torchrun --nnodes 2 --nproc_per_node 8 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT trainer_base_ds_mul_fs_tp.py \
-cp conf/exp/apps/r2c_generation/deepseek_coder/sft/ -cn gpt4o-distil-v3.1-v100
```
The above experiment runs on 2 8xV100 nodes.
#### Pseudo Test Case Inputs Generation
Before we synthesize the pseudo feedback, we need to first prepare the test case inputs. We prompt general LLMs (e.g., GPT-4o, Mistral-Large-2409) to complete
this process, and you can find the prompting template here:
```
prompts/apps/test_input_gen_2shot_v2.1.txt
```
Note that, if your LLM service supports constraint decoding using `json object`, please enable this feature for better performance.
#### DPO on APPs based Ground-truth Test Cases
For running inference on the training set of APPs:
```bash
python vllm_inference.py split_size=1 split_id=0 -cp conf/api/vllm/apps/deepseek_coder/r2c/ -cn train_v2_0
```
Since the training set has included test cases, the above inference process will also include the evaluation, so that we can directly construct preference pairs
by the evaluation results:
```bash
python scripts/apps/construct_prefer_pair.py \
--input_file $full_trajectory_data --output_file $output_file --response_field response --test_case_field test_cases
```
Then, run DPO training:
```bash
torchrun --nnodes 2 --nproc_per_node 8 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT trainer_base_ds_mul_fs_tp.py \
-cp conf/exp/apps/r2c_generation/deepseek_coder/dpo/ -cn gpt4o-distil-v3.2-v100
```
The above experiment runs on 2 8xV100 nodes.
#### DPO/pDPO on APPs based Self-Consistency Test Cases
In order to construct prefer pairs under self-consistency-based test cases, we need to re-run the full trajectory data (code solutions) on the synthetic test
case inputs and obtain the pseudo outputs:
```bash
python scripts/apps/solution_run_pseudo_outputs_local.py \
--completion_file $full_trajectory_data --output_file $output_file --pseudo_test_case $synthetic_test_inputs --num_workers 128
```
It is better to run this process in a sandbox.
Afterwards, we can construct the prefix-preference pairs:
```bash
python scripts/apps/pseudo_test_cases/collect_pseudo_outputs.py \
--pseudo_test_case_file $result_file_on_synthetic_inputs \
--output_file $output_file \
--construct_prefer_pair --pass_case_margin 6 --pass_case_lower_bound 0.5
```
where `pass_case_margin` denotes the margin for preference pair, and `pass_case_lower_bound` is the minimum ratio of passed cases for some solution to serve as
a positive anchor.
Then, run DPO training:
```bash
torchrun --nnodes 8 --nproc_per_node 8 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT trainer_base_ds_mul_fs_tp.py \
-cp conf/exp/apps/r2c_generation/deepseek_coder/dpo/ -cn gpt4o-distil-v4.0-v100-ps-test
```
The above experiment runs on 8 8xV100 nodes with `tp_size=8`.
In order to perform pDPO training, first sample steps from the full trajectory data:
```bash
python scripts/apps/prm/sample_steps.py \
--input_file $full_trajectory_data --upper_step_ratio 0.8 --sample_ratio 0.3 --output_file $output_file
```
For prefix completion, run:
```bash
python vllm_inference.py split_size=1 split_id=0 -cp conf/api/vllm/apps/deepseek_coder/r2c/ -cn train_v2_0_prefix_completion
```
As we have already synthesized the pseudo outputs, we can evaluate the prefix completions on the pseudo test cases:
```bash
python scripts/apps/pseudo_test_cases/prefix_fail_extract_pseudo_label.py \
--completion_file $prefix_completion_file --output_file $output_file --num_workers 64 --pseudo_test_cases $pseudo_test_cases
```
Finally, construct the prefix-preference pairs:
```bash
python scripts/apps/prm/construct_process_rm_sample_fix.py \
--input_file $prefix_completion_execute_file --output_file $output_file \
--pass_case_lower_bound 0.8 --pass_case_margin 4 --test_case_field pseudo_input_output
```
Then, run pDPO training:
```bash
torchrun --nnodes 16 --nproc_per_node 8 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT trainer_base_ds_mul_fs_tp.py \
-cp conf/exp/apps/r2c_generation/deepseek_coder/dpo/ -cn gpt4o-distil-v4.9-V100-ps-pdpo
```
The above experiment runs on 16 8xV100 nodes with `tp_size=8`.
#### DPO/pDPO on MagiCoder-OSS and XCodeEval
Due to the similar process, we provide the commands for data processing in the following bash script for your reference:
```text
scripts/apps/pseudo_test_cases/pipeline.sh # For Magicoder-OSS
scripts/apps/pseudo_test_cases/xcode_pipeline.sh # For XCodeEval
```
We will release our preprocessed data including the synthetic test case inputs to reduce your workload.
## Configuration of All Experiments
Here are the configuration files of all experiments in Table 1, 2, 3, and 5 in the paper:
| Experiment | Configuration File |
|:------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------:|
| Mathstral w/ SFT | [yaml file](conf/exp/mathscale/mistral/sft/mathstral-mathscale4o-sft-v2.0-v100.yaml) |
| w/ DPO (M.S.-500k, Iter. 0) | [yaml file](./conf/exp/mathscale/mistral/dpo/mathstral-dpo-4o-iter0-v1.1-a100.yaml) |
| w/ pDPO (M.S.-500k, Iter. 0) | [yaml file](./conf/exp/mathscale/mistral/dpo/mathstral-pdpo-4o-iter0-v2.2-V100.yaml) |
| w/ pDPO (M.S.-300k-S.C., Iter. 1) | [yaml file](./conf/exp/mathscale/mistral/dpo/iter1-mscale-v0.1/mathstral-pdpo-mscale300k-iter1-v3.1-V100.yaml) |
| w/ pDPO (M.S.-300k-S.C., Iter. 2) | [yaml file](./conf/exp/mathscale/mistral/dpo/iter-2-mscale-v0.1/mathstral-pdpo-mscale300k-iter2-v1.3-H100.yaml) |
| Llama-3.1-8B w/ SFT | |
| w/ DPO (M.S.-500k, Iter. 0) | [yaml file](./conf/exp/mathscale/llama/dpo/llama3.1-dpo-4o-iter0-v1.0-v100.yaml) |
| w/ pDPO (M.S.-500k, Iter. 0) | [yaml file](./conf/exp/mathscale/llama/dpo/llama3.1-pdpo-4o-iter0-v2.2-A100.yaml) |
| w/ pDPO (Numina-S.C. 160k, Iter. 1) | [yaml file](./conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter1-1.0-split01-p0.5-h100.yaml) |
| w/ pDPO (Numina-S.C. 320k, Iter. 2) | [yaml file](./conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter2-split01-23-p0.5-v1.4-h100.yaml) |
| w/ pDPO (Numina-S.C. 480k, Iter. 3) | [yaml file](./conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter3-split01-23-45-p0.5-v1.4-h100.yaml) |
| w/ pDPO (Numina-S.C. 640k, Iter. 4) | [yaml file](./conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter4-split01-23-45-67-p0.0-v1.5-h100.yaml) |
| w/ pDPO (Numina-S.C. 790k, Iter. 5) | [yaml file](./conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter4-split01-23-45-67-p0.0-v1.5-h100.yaml) |
| Deepseek-coder-v1.5-chat w/ SFT | [yaml file](./conf/exp/apps/r2c_generation/deepseek_coder/sft/gpt4o-distil-v3.1-v100.yaml) |
| w/ DPO (APPs) | [yaml file](./conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v3.2-v100.yaml) |
| w/ pDPO (APPs) | [yaml file](./conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.2-v100-gd-pdpo.yaml) |
| w/ DPO (APPs - S.C.) | [yaml file](./conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.0-v100-ps-test.yaml) |
| w/ pDPO (APPs - S.C.) | [yaml file](./conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.9-V100-ps-pdpo.yaml) |
| w/ DPO (APPs \& M.C. - S.C.) | [yaml file](./conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter1/gpt4o-distil-combine-v1.2-a100-40-ps-test.yaml) |
| w/ DPO (APPs \& M.C. \& xCode. - S.C.) | [yaml file](./conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-v1.0-H100-ps-test.yaml) |
| w/ pDPO (APPs \& M.C. \& xCode. - S.C.) | [yaml file](./conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-pdpo-v1.3-v100-ps-test.yaml) |
| w/ pDPO (APPs \& M.C. - S.C.) | [yaml file](./conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter1/gpt4o-distil-combine-pdpo-v1.2-h100-ps-test.yaml) |
## Evaluation Configs
For evaluation, simply run `python vllm_inference.py -cp $config_path -cn $config_name`. The evaluation is included in the inference process. Below are the
evaluation configs for different tasks.
### MWPBench (including MATH and GSM8K):
The config file is `conf/api/vllm/mwp-bench/mathstral_test_0shot_v1_0.yaml`.
Note that you need to use sympy-based evaluation for more accurate results. Please refer to `scripts/math_scale/qwen25math_style_eval_v2.0.py` for more details.
If your prediction file is generated through our config, simply run:
```bash
python scripts/math_scale/qwen25math_style_eval_v2.0.py --input_file $prediction_file_path
```
For the necessary dependency to run sympy, please create a new virtual environment and follow the instruction
of [Qwen2.5-Math](https://github.com/QwenLM/Qwen2.5-Math/tree/main/evaluation).
### Code
APPs: `conf/api/vllm/apps/deepseek_coder/r2c/dev_v2_0.yaml`
HumanEval: `conf/api/vllm/human_eval/ds_coder/r2c/test_v2_2_local.yaml`
MBPP-257: `conf/api/vllm/mbpp_sanitized/r2c/test_v1_0_local.yaml`
For the evaluation of LiveCodeBench, please refer to the official repo. You can also refer to
my [commit](https://github.com/LiveCodeBench/LiveCodeBench/commit/d3f852be5ea5b60d6b8aec3c7e31337c71e8ba56) for reference. We only modified the prompts template
to adapt to the evaluation.
## Basic Tutorial for Hydra Configuration
In this repo, we have used [Hydra](https://hydra.cc/) and Yaml files to configure the experiments. We have used some features of Hydra and we will give some
basic introduction here to avoid potential confusion.
### Launch Job
In most cases, the entrance is `trainer_base_ds_mul_fs_tp.py`, where you will see the following main function:
```python
import hydra
from omegaconf import DictConfig
@hydra.main(config_path="conf", config_name="config", version_base="1.2")
def main(cfg: DictConfig):
...
```
The launch command is as normal, such as using `torchrun` or `deepspeed`, for example:
```bash
deepspeed trainer_base_ds_mul_fs_tp.py seed=42 [other arguments without "--" prefix] \
-cp ${config_path} -cn ${config_name}
```
where `config_path` is the path of the directory containing the corresponding config file, and `config_name` is the file name without the suffix `.yaml`.
### Runtime Function Calling and Dependency Import
In the configuration, you will see some usage like the following:
```yaml
model:
_target_: models.llama_tp.LlamaForCausalLM.from_pretrained
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
torch_dtype: ${torch_dtype}
pad_token_id: ${base_eos_token_id}
```
where `_target_` indicates this is a function call (including `__init__` function, i.e., object initialization), and the arguments are specified in the
following lines. Besides, `models.llama_tp.LlamaForCausalLM.from_pretrained` indicates the relative path of the function to be called, and you do not need to
import this function during coding.
In python code, you can obtain the returned value of the called function through
```python
model = hydra.utils.call(cfg.model, cfg.model_name_or_path, state_dict=pretrain_state_dict)
```
where the arguments not specified in the configuration file can be passed as additional arguments.
Additionally, you can initialize the objects through hydra in a recursive manner. In the above example, the `torch_dtype` is also defined as a returned value of
another function call:
```yaml
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
```
## Implementation
### Change Deepspeed Configuration
There are some pre-defined configurations under `conf/deepspeed`. You can import them in your config file at the beginning by changing `deepspeed@ds_cfg`:
```yaml
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_cosine
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
```
The `{a}@{b}:c` syntax indicates that the configuration group to be imported is `conf/a/c.yaml` and that this configuration group is renamed to `b` in the current
configuration file.
### Enable Tensor Parallel based on FairScale
There are some implementations using tensor parallel under `models`, ending with `_tp.py`. To enable tensor parallel, use the model with tensor parallel
implementation such as `models.llama_tp.LlamaForCausalLM.from_pretrained`, and set the `tp_size` in your configuration file.
Note that you need to use `scripts/model_converter/convert_llama_to_llama_tp.py` to convert the original model to the tensor parallel model. Currently the
script supports `Llama`, `Qwen` and `Mistral` model series.
### Memory Optimization
We would recommend the following order to try to reduce the memory usage:
```text
zero1 > zero2 > intra-node-zero3 & cross-node dp > intra-node tp & cross node zero1/2 > global zero3
```
More resources on this project can be found [here](https://huggingface.co/collections/chitanda/pfpo-67a41baa25f2892fafad2f0c)
## Contact
If you have any problems with our code or paper, feel free to open an issue or send an email to the authors.
## Citation
If you find our paper or code helpful, please cite our paper:
```
@inproceedings{jiao2024pfpo,
title={Preference Optimization for Reasoning with Pseudo Feedback},
author={Fangkai Jiao and Geyang Guo and Xingxing Zhang and Nancy F. Chen and Shafiq Joty and Furu Wei},
year={2025},
booktitle={ICLR},
}
```
If you find the code base for pDPO useful, please also cite the following paper:
```
@inproceedings{jiao2024lpr,
author={Fangkai Jiao and Chengwei Qin and Zhengyuan Liu and Nancy F. Chen and Shafiq Joty},
title = {Learning Planning-based Reasoning with Trajectory Collection and Process Rewards Synthesizing},
booktitle = {{EMNLP}},
publisher = {Association for Computational Linguistics},
year = {2024},
}
```
================================================
FILE: PFPO/apps_train_sub_val_ids.json
================================================
[
1143,
2406,
807,
3729,
4301,
2434,
4976,
4826,
1769,
1651,
3434,
2517,
1616,
4625,
4411,
706,
2075,
2319,
478,
67,
1311,
658,
1767,
3551,
49,
1312,
2289,
4770,
3096,
1039,
4047,
3402,
663,
2963,
3349,
3680,
4280,
4253,
2605,
312,
4951,
3710,
136,
2574,
3081,
4961,
2181,
1495,
4363,
4983,
2895,
191,
406,
297,
4532,
458,
62,
1123,
2827,
3401,
113,
1421,
1290,
586,
2022,
3506,
737,
1393,
4033,
4452,
4935,
3823,
2373,
122,
2687,
4053,
1317,
2187,
4472,
2767,
448,
2603,
2323,
980,
222,
2608,
4095,
1338,
617,
572,
3674,
4081,
3908,
372,
4116,
3809,
3704,
3265,
437,
4143,
2892,
457,
1587,
1638,
4358,
3210,
2889,
4211,
1795,
509,
89,
4220,
423,
3142,
178,
3573,
2436,
3961,
762,
2519,
1681,
3269,
1160,
2592,
47,
3409,
4173,
761,
3868,
149,
4286,
4911,
2110,
58,
1158,
2327,
731,
3999,
3167,
3416,
4330,
2599,
246,
1059,
2734,
2743,
4124,
467,
4522,
2995,
3507,
579,
1022,
1553,
668,
685,
2295,
2522,
2916,
3926,
1250,
2078,
2358,
1370,
3422,
1176,
2192,
2829,
1467,
3111,
3670,
3570,
1625,
4728,
3460,
2686,
1029,
1802,
3023,
3414,
1411,
57,
4118,
1437,
831,
2279,
1790,
3336,
867,
1339,
3650,
152,
2554,
919,
747,
1267,
230,
4520,
2390,
2474,
2362,
2640,
3925,
1383,
2904,
4668,
4930,
4686,
4682,
543,
2631,
3315,
4687,
3325,
2590,
4546,
3153,
3643,
1101,
1730,
2272,
4106,
1747,
4369,
2966,
2798,
28,
3390,
2266,
1084,
3571,
55,
1104,
2988,
3044,
3974,
1385,
2860,
256,
854,
1727,
4536,
3227,
2521,
1648,
4661,
1015,
2778,
3121,
4025,
3445,
2064,
3935,
1277,
834,
4447,
4518,
959,
4645,
228,
562,
3263,
3019,
616,
4635,
284,
342,
2558,
2846,
2079,
4252,
990,
1642,
4294,
2630,
3583,
1134,
351,
2654,
3335,
30,
1333,
96,
1668,
3208,
630,
4859,
4267,
2206,
341,
832,
4011,
2058,
1346,
2738,
2939,
334,
829,
864,
3261,
2496,
1685,
101,
3637,
2989,
2402,
4789,
3798,
479,
430,
1025,
2993,
1711,
3580,
2190,
3882,
974,
36,
2897,
2832,
3275,
1232,
3543,
3514,
4298,
1145,
1636,
1121,
86,
3705,
4308,
1125,
1459,
3547,
4070,
3114,
3113,
1589,
233,
1667,
3062,
830,
2822,
2458,
3,
594,
3481,
4187,
3037,
4806,
3525,
4714,
4343,
2410,
4626,
563,
2121,
1014,
3025,
680,
1741,
773,
2553,
4631,
749,
999,
2917,
3087,
2518,
1705,
394,
1336,
3101,
2196,
3242,
4861,
2774,
3492,
1296,
4784,
2056,
4433,
2307,
952,
72,
1151,
4355,
3687,
4676,
4372,
3845,
1268,
2906,
1621,
2785,
1351,
1492,
3775,
3029,
744,
1045,
4699,
3068,
2475,
748,
4545,
4835,
4493,
514,
2214,
4801,
1670,
2216,
44,
147,
2237,
2354,
4161,
4907,
1444,
715,
843,
3993,
85,
994,
3204,
3137,
3183,
4781,
3964,
2395,
1261,
597,
2394,
3279,
1778,
3889,
2408,
4212,
2388,
510,
4418,
2374,
2878,
1402,
288,
794,
4240,
1494,
2172,
1147,
887,
2043,
3819,
1751,
4382,
327,
4275,
2796,
4062,
2464,
4603,
672,
1294,
3060,
66,
87,
2629,
2761,
725,
875,
2491,
3198,
3601,
4309,
1163,
3984,
3517,
3857,
3556,
699,
4844,
313,
2720,
2814,
1120,
1584,
3346,
170,
501,
3531,
2427,
3112,
2378,
1371,
291,
1692,
1627,
3119,
4681
]
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/dev_v1_0.yaml
================================================
# Hydra config for vLLM-based inference + evaluation on APPs ("codeparrot/apps")
# with the DeepSeek-Coder chat prompt format. Single sample per problem at
# temperature 0.0.
# NOTE(review): nesting below was reconstructed from a whitespace-flattened
# copy — confirm against the upstream file.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two separate stop strings; a single
  # fused "<|eot_id|>, <|EOT|>" entry could never match generated text.
  # NOTE(review): the first entry is an empty string — possibly a stripped
  # "</s>"; confirm against the upstream file.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used when composing the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32

output_file: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/apps/val.0shot.tem${tem}.n${n}.v1.0.json
# Appending "l" turns the ".json" suffix into ".jsonl".
flush_file: ${output_file}l

apply_chat_template: False
add_generation_prompt: True

# Pieces of the DeepSeek-Coder instruction prompt; joined in `read_tensor.template`.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.apps.APPsReader
    split: test
    train_sub_split: "train"
  template: ${chat_prefix}${prompt}${chat_connect}
  instruction:
  index_field: "problem_id"
  service_based: True
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    # Alternative (chat) endpoint kept for reference:
    # api_url: http://0.0.0.0:${port}/v1/chat/completions
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    # system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1

save_best: True
# Left empty (null) here.
eval_sub_path:
output_dir: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

# Extracts code from the responses and scores it against the APPs test cases.
post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    name: standard
  index_field: problem_id
  test_case_field: "input_output"
  evaluator:
    _target_: post_processors.code.eval.return_apps_evaluator
    timeout: 10
    debug: False
  saved_keys: [ "difficulty" ]

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables (empty here; presumably populated at runtime)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/dev_v1_0_fix_bos.yaml
================================================
# Hydra config for vLLM-based inference + evaluation on APPs ("codeparrot/apps")
# with the DeepSeek-Coder chat prompt format. Differs from the sibling v1.0
# config in that `chat_prefix` carries no explicit "<|begin▁of▁sentence|>" token
# and the output file name has a ".fix_bos" tag.
# NOTE(review): nesting below was reconstructed from a whitespace-flattened
# copy — confirm against the upstream file.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two separate stop strings; a single
  # fused "<|eot_id|>, <|EOT|>" entry could never match generated text.
  # NOTE(review): the first entry is an empty string — possibly a stripped
  # "</s>"; confirm against the upstream file.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used when composing the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32

output_file: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/apps/val.0shot.tem${tem}.n${n}.v1.0.fix_bos.json
# Appending "l" turns the ".json" suffix into ".jsonl".
flush_file: ${output_file}l

apply_chat_template: False
add_generation_prompt: True

# Pieces of the DeepSeek-Coder instruction prompt; joined in `read_tensor.template`.
chat_prefix: "You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.apps.APPsReader
    split: test
    train_sub_split: "train"
  template: ${chat_prefix}${prompt}${chat_connect}
  instruction:
  index_field: "problem_id"
  service_based: True
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    # Alternative (chat) endpoint kept for reference:
    # api_url: http://0.0.0.0:${port}/v1/chat/completions
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    # system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1

save_best: True
# Left empty (null) here.
eval_sub_path:
output_dir: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

# Extracts code from the responses and scores it against the APPs test cases.
post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    name: standard
  index_field: problem_id
  test_case_field: "input_output"
  evaluator:
    _target_: post_processors.code.code.APPsEvaluator
  saved_keys: [ "difficulty" ]

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables (empty here; presumably populated at runtime)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/dev_v1_1.yaml
================================================
# Hydra config for vLLM-based inference + evaluation on the APPs test split
# with the DeepSeek-Coder chat prompt format. Single sample per problem at
# temperature 0.0; data is sharded via `split_id` / `split_size`.
# NOTE(review): nesting below was reconstructed from a whitespace-flattened
# copy — confirm against the upstream file.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two separate stop strings; a single
  # fused "<|eot_id|>, <|EOT|>" entry could never match generated text.
  # NOTE(review): the first entry is an empty string — possibly a stripped
  # "</s>"; confirm against the upstream file.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used when composing the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: 8
split_id: 0
max_num_seqs: 64

# Shard tag embedded in the output file name, e.g. "0-of-8".
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/apps/${eval_sub_path}/test.0shot.tem${tem}.n${n}.${suffix}.v1.1.json  # v1.1 for extraction
# Appending "l" turns the ".json" suffix into ".jsonl".
flush_file: ${output_file}l

apply_chat_template: False
add_generation_prompt: True

# Pieces of the DeepSeek-Coder instruction prompt; joined in `read_tensor.template`.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.apps.APPsReader
    split: test
  template: ${chat_prefix}${prompt}${chat_connect}
  instruction:
  index_field: "problem_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    # Alternative (chat) endpoint kept for reference:
    # api_url: http://0.0.0.0:${port}/v1/chat/completions
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    # system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1

exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.V100.w8.v1.1

save_best: False
# Left empty (null) here although `output_file` interpolates it; presumably
# supplied as a command-line override — TODO confirm.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

# Extracts code from the responses and scores it against the APPs test cases.
post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    name: standard
  index_field: problem_id
  test_case_field: "input_output"
  evaluator:
    _target_: post_processors.code.code.APPsEvaluator
  saved_keys: [ "difficulty" ]

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables (empty here; presumably populated at runtime)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/dev_v1_1_sample.yaml
================================================
# Hydra config for vLLM-based sampling + evaluation on the APPs test split
# with the DeepSeek-Coder chat prompt format. Draws n=20 samples per problem at
# temperature 1.0; data is sharded via `split_id` / `split_size`.
# NOTE(review): nesting below was reconstructed from a whitespace-flattened
# copy — confirm against the upstream file.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  n: 20
  temperature: 1.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two separate stop strings; a single
  # fused "<|eot_id|>, <|EOT|>" entry could never match generated text.
  # NOTE(review): the first entry is an empty string — possibly a stripped
  # "</s>"; confirm against the upstream file.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used when composing the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: 8
split_id: 0
max_num_seqs: 256

# Shard tag embedded in the output file name, e.g. "0-of-8".
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/apps/${eval_sub_path}/test.0shot.tem${tem}.n${n}.${suffix}.v1.1.json  # v1.1 for extraction
# Appending "l" turns the ".json" suffix into ".jsonl".
flush_file: ${output_file}l

apply_chat_template: False
add_generation_prompt: True

# Pieces of the DeepSeek-Coder instruction prompt; joined in `read_tensor.template`.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.apps.APPsReader
    split: test
  template: ${chat_prefix}${prompt}${chat_connect}
  instruction:
  index_field: "problem_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    # Alternative (chat) endpoint kept for reference:
    # api_url: http://0.0.0.0:${port}/v1/chat/completions
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    # system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1

exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.V100.w8.v1.1

save_best: False
# Left empty (null) here although `output_file` interpolates it; presumably
# supplied as a command-line override — TODO confirm.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

# Extracts code from the responses and scores it against the APPs test cases.
post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    name: standard
  index_field: problem_id
  test_case_field: "input_output"
  evaluator:
    _target_: post_processors.code.code.APPsEvaluator
  saved_keys: [ "difficulty" ]

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables (empty here; presumably populated at runtime)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/dev_v2_0.yaml
================================================
# Hydra config for vLLM-based inference + evaluation on the APPs test split
# with the DeepSeek-Coder chat prompt format. Uses the `APPsWithFunctionName`
# reader with starter code enabled. Single sample per problem at temperature
# 0.0; data is sharded via `split_id` / `split_size`.
# NOTE(review): nesting below was reconstructed from a whitespace-flattened
# copy — confirm against the upstream file.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two separate stop strings; a single
  # fused "<|eot_id|>, <|EOT|>" entry could never match generated text.
  # NOTE(review): the first entry is an empty string — possibly a stripped
  # "</s>"; confirm against the upstream file.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used when composing the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: 8
split_id: 0
max_num_seqs: 32

# Shard tag embedded in the output file name, e.g. "0-of-8".
suffix: ${split_id}-of-${split_size}
# NOTE(review): the inline comment says "v1.1" although the file suffix is
# v2.0 — looks copied from the v1.1 config; confirm.
output_file: ${output_dir}/apps/${eval_sub_path}/test.0shot.tem${tem}.n${n}.${suffix}.v2.0.json  # v1.1 for extraction
# Appending "l" turns the ".json" suffix into ".jsonl".
flush_file: ${output_file}l

apply_chat_template: False
add_generation_prompt: True

# Pieces of the DeepSeek-Coder instruction prompt; joined in `read_tensor.template`.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: test
    use_starter_code: True
  template: ${chat_prefix}${prompt}${chat_connect}
  instruction:
  index_field: "problem_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    # Alternative (chat) endpoint kept for reference:
    # api_url: http://0.0.0.0:${port}/v1/chat/completions
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    # system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1
  # Resolves to the top-level `flush_file` (OmegaConf interpolations without a
  # leading dot are absolute).
  flush_file: ${flush_file}

exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.V100.w8.v1.1

save_best: False
# Left empty (null) here although `output_file` interpolates it; presumably
# supplied as a command-line override — TODO confirm.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

# Extracts code from the responses and scores it against the APPs test cases.
post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    name: standard
  index_field: problem_id
  test_case_field: "input_output"
  evaluator:
    _target_: post_processors.code.code.APPsEvaluator
  saved_keys: [ "difficulty" ]
  resume: True
  # Distinct from the top-level dataloader `num_workers` above.
  num_workers: 16

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables (empty here; presumably populated at runtime)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/r2c/dev_v1_0.yaml
================================================
# Hydra config for vLLM-based inference + evaluation on APPs with the
# DeepSeek-Coder chat prompt format, using the r2c 0-shot prompt read from
# `prompts/apps/r2c_prompt_0shot_v1.0.txt`. Single sample per problem at
# temperature 0.0.
# NOTE(review): nesting below was reconstructed from a whitespace-flattened
# copy — confirm against the upstream file.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

data_path_prefix: ""
model_path_prefix: ../pretrained-models/  # /mnt/fangkai_blob/share/models
output_path_prefix: ""  # /mnt/fangkai_blob/reward_modeling/

train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  # Sampling alternative kept for reference:
  # n: 20
  # temperature: 1.0
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two separate stop strings; a single
  # fused "<|eot_id|>, <|EOT|>" entry could never match generated text.
  # NOTE(review): the first entry is an empty string — possibly a stripped
  # "</s>"; confirm against the upstream file.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used when composing the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32

#output_file: ${output_dir}/apps/val.0shot.tem${tem}.n${n}.v1.0.json
output_file: ${output_dir}/apps/${eval_sub_path}/val.0shot.tem${tem}.n${n}.v1.1.json  # v1.1 for extraction
# Appending "l" turns the ".json" suffix into ".jsonl".
flush_file: ${output_file}l

apply_chat_template: False
add_generation_prompt: True

# Pieces of the DeepSeek-Coder instruction prompt; joined in `read_tensor.template`.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

# The prompt body is loaded from a text file via a `_target_` call.
prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.apps.APPsReader
    split: test
  # Plain-interpolation form kept for reference:
  # template: ${chat_prefix}${prompt}${chat_connect}
  template:
    _target_: data.input_utils.compose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
    composition: "{chat_prefix}{prompt}{chat_connect}"
  instruction:
  index_field: "problem_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    # Alternative (chat) endpoint kept for reference:
    # api_url: http://0.0.0.0:${port}/v1/chat/completions
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    # system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1
  # NOTE(review): nesting level of `saved_keys` is a guess; sibling configs
  # place it under `post_process` — confirm against the upstream file.
  saved_keys: [ "difficulty" ]

save_best: False
# Left empty (null) here although `output_file` interpolates it; presumably
# supplied as a command-line override — TODO confirm.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v2.0/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

# Extracts code from the responses and scores it against the APPs test cases.
post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    # name: standard
    name: tag
  index_field: problem_id
  test_case_field: "input_output"
  evaluator:
    _target_: post_processors.code.eval.return_apps_evaluator
    timeout: 10
    debug: False

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables (empty here; presumably populated at runtime)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/r2c/dev_v1_1.yaml
================================================
# Hydra config for vLLM-based inference + evaluation on the APPs test split
# with the DeepSeek-Coder chat prompt format, using the r2c 0-shot prompt read
# from `prompts/apps/r2c_prompt_0shot_v1.0.txt`. Single sample per problem at
# temperature 0.0; data is sharded via `split_id` / `split_size`.
# NOTE(review): nesting below was reconstructed from a whitespace-flattened
# copy — confirm against the upstream file.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  # Sampling alternative kept for reference:
  # n: 20
  # temperature: 1.0
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two separate stop strings; a single
  # fused "<|eot_id|>, <|EOT|>" entry could never match generated text.
  # NOTE(review): the first entry is an empty string — possibly a stripped
  # "</s>"; confirm against the upstream file.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used when composing the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: 8
split_id: 0
max_num_seqs: 64

# Shard tag embedded in the output file name, e.g. "0-of-8".
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/apps/${eval_sub_path}/test.0shot.tem${tem}.n${n}.${suffix}.v1.1.json  # v1.1 for extraction
# Appending "l" turns the ".json" suffix into ".jsonl".
flush_file: ${output_file}l

apply_chat_template: False
add_generation_prompt: True

# Pieces of the DeepSeek-Coder instruction prompt; joined in `read_tensor.template`.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

# The prompt body is loaded from a text file via a `_target_` call.
prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.apps.APPsReader
    split: test
  # Plain-interpolation form kept for reference:
  # template: ${chat_prefix}${prompt}${chat_connect}
  template:
    _target_: data.input_utils.compose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
    composition: "{chat_prefix}{prompt}{chat_connect}"
  instruction:
  index_field: "problem_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    # Alternative (chat) endpoint kept for reference:
    # api_url: http://0.0.0.0:${port}/v1/chat/completions
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    # system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1
  # Resolves to the top-level `flush_file` (OmegaConf interpolations without a
  # leading dot are absolute).
  flush_file: ${flush_file}

exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_dpo.V100.tp4.dp2.v1.0.s42

save_best: False
# Left empty (null) here although `output_file` interpolates it; presumably
# supplied as a command-line override — TODO confirm.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

# Extracts code from the responses and scores it against the APPs test cases.
post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    # name: standard
    name: tag
  index_field: problem_id
  test_case_field: "input_output"
  evaluator:
    _target_: post_processors.code.code.APPsEvaluator
  resume: True
  saved_keys: [ "difficulty" ]

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables (empty here; presumably populated at runtime)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/r2c/dev_v1_1_sample.yaml
================================================
# Hydra config for vLLM-based sampling + evaluation on the APPs test split
# with the DeepSeek-Coder chat prompt format, using the r2c 0-shot prompt read
# from `prompts/apps/r2c_prompt_0shot_v1.0.txt`. Draws n=20 samples per problem
# at temperature 1.0; data is sharded via `split_id` / `split_size`.
# NOTE(review): nesting below was reconstructed from a whitespace-flattened
# copy — confirm against the upstream file.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  n: 20
  temperature: 1.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two separate stop strings; a single
  # fused "<|eot_id|>, <|EOT|>" entry could never match generated text.
  # NOTE(review): the first entry is an empty string — possibly a stripped
  # "</s>"; confirm against the upstream file.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used when composing the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: 8
split_id: 0
max_num_seqs: 256

# Shard tag embedded in the output file name, e.g. "0-of-8".
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/apps/${eval_sub_path}/test.0shot.tem${tem}.n${n}.${suffix}.v1.1.json  # v1.1 for extraction
# Appending "l" turns the ".json" suffix into ".jsonl".
flush_file: ${output_file}l

apply_chat_template: False
add_generation_prompt: True

# Pieces of the DeepSeek-Coder instruction prompt; joined in `read_tensor.template`.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

# The prompt body is loaded from a text file via a `_target_` call.
prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.apps.APPsReader
    split: test
  # Plain-interpolation form kept for reference:
  # template: ${chat_prefix}${prompt}${chat_connect}
  template:
    _target_: data.input_utils.compose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
    composition: "{chat_prefix}{prompt}{chat_connect}"
  instruction:
  index_field: "problem_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    # Alternative (chat) endpoint kept for reference:
    # api_url: http://0.0.0.0:${port}/v1/chat/completions
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    # system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1
  # Resolves to the top-level `flush_file` (OmegaConf interpolations without a
  # leading dot are absolute).
  flush_file: ${flush_file}

exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_dpo.V100.tp4.dp2.v1.0.s42

save_best: False
# Left empty (null) here although `output_file` interpolates it; presumably
# supplied as a command-line override — TODO confirm.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

# Extracts code from the responses and scores it against the APPs test cases.
post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    # name: standard
    name: tag
  index_field: problem_id
  test_case_field: "input_output"
  evaluator:
    _target_: post_processors.code.code.APPsEvaluator
  resume: True
  saved_keys: [ "difficulty" ]

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables (empty here; presumably populated at runtime)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/r2c/dev_v2_0.yaml
================================================
# Hydra config for vLLM-based inference + evaluation on the APPs test split
# with the DeepSeek-Coder chat prompt format, using the r2c 0-shot prompt read
# from `prompts/apps/r2c_prompt_0shot_v1.0.txt` and the `APPsWithFunctionName`
# reader with starter code enabled. Single sample per problem at temperature
# 0.0; data is sharded via `split_id` / `split_size`.
# NOTE(review): nesting below was reconstructed from a whitespace-flattened
# copy — confirm against the upstream file.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  # Sampling alternative kept for reference:
  # n: 20
  # temperature: 1.0
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two separate stop strings; a single
  # fused "<|eot_id|>, <|EOT|>" entry could never match generated text.
  # NOTE(review): the first entry is an empty string — possibly a stripped
  # "</s>"; confirm against the upstream file.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used when composing the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: 8
split_id: 0
max_num_seqs: 32

# Shard tag embedded in the output file name, e.g. "0-of-8".
suffix: ${split_id}-of-${split_size}
# NOTE(review): the inline comment says "v1.1" although the file suffix is
# v2.0 — looks copied from the v1.1 config; confirm.
output_file: ${output_dir}/apps/${eval_sub_path}/test.0shot.tem${tem}.n${n}.${suffix}.v2.0.json  # v1.1 for extraction
# Appending "l" turns the ".json" suffix into ".jsonl".
flush_file: ${output_file}l

apply_chat_template: False
add_generation_prompt: True

# Pieces of the DeepSeek-Coder instruction prompt; joined in `read_tensor.template`.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

# The prompt body is loaded from a text file via a `_target_` call.
prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: test
    use_starter_code: True
  # Plain-interpolation form kept for reference:
  # template: ${chat_prefix}${prompt}${chat_connect}
  template:
    _target_: data.input_utils.compose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
    composition: "{chat_prefix}{prompt}{chat_connect}"
  instruction:
  index_field: "problem_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1
  # Resolves to the top-level `flush_file` (OmegaConf interpolations without a
  # leading dot are absolute).
  flush_file: ${flush_file}

exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_dpo.V100.tp4.dp2.v1.0.s42

save_best: False
# Left empty (null) here although `output_file` interpolates it; presumably
# supplied as a command-line override — TODO confirm.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

# Extracts code from the responses and scores it against the APPs test cases.
post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    # name: standard
    name: tag
  index_field: problem_id
  test_case_field: "input_output"
  resume: True
  evaluator:
    _target_: post_processors.code.code.APPsEvaluator
  saved_keys: [ "difficulty" ]

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables (empty here; presumably populated at runtime)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/r2c/dev_v2_0_sample.yaml
================================================
# Hydra composition: load the `hydra: default` group first, then this file
# (`_self_`), so the values written here take precedence.
defaults:
- hydra: default
- _self_
# Add the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Root prefixes for datasets, model checkpoints, and experiment outputs.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Dataset identifier; dev_file and test_file reuse it through interpolation.
train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}
# port and model are interpolated into service_processor (api_url / model) further down.
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class these keys configure.
sampling_params:
_target_: vllm.SamplingParams
# n: 20
# temperature: 1.0
# Active setting: a single sample at temperature 1.0.
n: 1
temperature: 1.0
# Stop sequences. "<|eot_id|>" and "<|EOT|>" are listed as separate entries so
# that either marker matches on its own (a single fused string would only match
# that exact combined text). No empty-string entry: "" is not a usable stop sequence.
stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both are interpolated into output_file.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# split_id and split_size build `suffix`; presumably they select one shard of the data (TODO confirm).
split_size: 8
split_id: 0
max_num_seqs: 32
suffix: ${split_id}-of-${split_size}
# The file name also embeds the seed (`.s${seed}`), so runs with different seeds get different files.
output_file: ${output_dir}/apps/${eval_sub_path}/test.0shot.tem${tem}.n${n}.${suffix}.v2.0.s${seed}.json # v1.1 for extraction
# Same path as output_file with a trailing "l" appended.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Prompt pieces: chat_prefix and chat_connect are interpolated into the template units below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt body: `_target_` points at data.input_utils.read_text with the file to load.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.apps.APPsWithFunctionName
split: test
use_starter_code: True
# template: ${chat_prefix}${prompt}${chat_connect}
# The composition string joins the named units in order: prefix, prompt, connector.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
composition: "{chat_prefix}{prompt}{chat_connect}"
instruction:
index_field: "problem_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for the completions endpoint; stop / temperature / n mirror sampling_params.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# Experiment name; output_dir is built from output_path_prefix and exp_name.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_dpo.V100.tp4.dp2.v1.0.s42
save_best: False
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
# Presumably data / tensor / pipeline parallel sizes (TODO confirm); dp_size is null here.
dp_size:
tp_size: 1
pp_size: 1
# Post-processing of generations: extractor, answer cleaner, and evaluator targets.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
# name: standard
name: tag
index_field: problem_id
test_case_field: "input_output"
resume: True
evaluator:
_target_: post_processors.code.code.APPsEvaluator
saved_keys: [ "difficulty" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are null in this file; presumably filled in at runtime (TODO confirm).
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/r2c/general_combine_train_v2_0.yaml
================================================
# Hydra composition: load the `hydra: default` group first, then this file
# (`_self_`), so the values written here take precedence.
defaults:
- hydra: default
- _self_
# Add the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Root prefixes for datasets, model checkpoints, and experiment outputs.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Earlier train_file choices are kept commented out; dev_file and test_file
# reuse the active one through interpolation.
#train_file: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
#train_file: /mnt/fangkai_blob/share/xCodeEval/xcode_train_4o_test_inputs_v1.json
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
dev_file: ${train_file}
test_file: ${train_file}
# Not interpolated anywhere else in this file; presumably read by the launching code (TODO confirm).
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class these keys configure.
# Ten samples per prompt at temperature 1.0.
sampling_params:
_target_: vllm.SamplingParams
n: 10
temperature: 1.0
# Stop sequences. "<|eot_id|>" and "<|EOT|>" are listed as separate entries so
# that either marker matches on its own (a single fused string would only match
# that exact combined text). No empty-string entry: "" is not a usable stop sequence.
stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both are interpolated into output_file.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# split_id and split_size build `suffix`; presumably they select one shard of the data (TODO confirm).
split_size: 32
split_id: 0
max_num_seqs: 48
global_batch_size: 256
suffix: ${split_id}-of-${split_size}
# Earlier output locations are kept commented out; the active one embeds tem, n, suffix and seed.
#output_file: ${output_dir}/oss-instruct-apps-train/${eval_sub_path}/train.0shot.tem${tem}.n${n}.${suffix}.v2.0.json
#output_file: ${output_dir}/xcode-train/${eval_sub_path}/train.0shot.tem${tem}.n${n}.${suffix}.v2.0.json
output_file: ${output_dir}/oss-apps-xcode-combine/${eval_sub_path}/train.0shot.tem${tem}.n${n}.${suffix}.v2.1.s${seed}.json # v2.1 fix the starter code issue
#output_file: ${output_dir}/oss-instruct-apps-train/${eval_sub_path}/train.0shot.tem${tem}.n${n}.${suffix}.v2.1.json # v2.1 fix the starter code issue
# Same path as output_file with a trailing "l" appended.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Prompt pieces: chat_prefix and chat_connect are interpolated into the template units below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt body: `_target_` points at data.input_utils.read_text with the file to load.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
# _target_: data.apps.PseudoInputsWithFunctionName
_target_: data.apps.PseudoInputsWithFunctionNameFixStarterCode # v2.1
use_starter_code: True
# The composition string joins the named units in order: prefix, prompt, connector.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
composition: "{chat_prefix}{prompt}{chat_connect}"
instruction:
index_field: "problem_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
max_data_num: -1
flush_file: ${flush_file}
# exp_name is null in this file although output_dir interpolates it; presumably supplied as an override (TODO confirm).
exp_name:
save_best: False
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
# Presumably data / tensor / pipeline parallel sizes (TODO confirm); dp_size is null here.
dp_size:
tp_size: 1
pp_size: 1
# Post-processing of generations: extractor, answer cleaner, and evaluator targets.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
# name: standard
name: tag
index_field: problem_id
resume: True
# test_case_field is null in this file.
test_case_field:
evaluator:
_target_: post_processors.code.code.APPsEvaluator
saved_keys: [ "problem_id", "input_output", ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are null in this file; presumably filled in at runtime (TODO confirm).
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/r2c/general_combine_train_v2_0_prefix_completion.yaml
================================================
# Hydra composition: load the `hydra: default` group first, then this file
# (`_self_`), so the values written here take precedence.
defaults:
- hydra: default
- _self_
# Add the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Root prefixes for datasets, model checkpoints, and experiment outputs.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# An earlier train_file choice is kept commented out; dev_file and test_file
# reuse the active one through interpolation.
#train_file: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
dev_file: ${train_file}
test_file: ${train_file}
# Not interpolated anywhere else in this file; presumably read by the launching code (TODO confirm).
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class these keys configure.
# Three samples per prompt at temperature 1.0.
sampling_params:
_target_: vllm.SamplingParams
n: 3
temperature: 1.0
# Stop sequences. "<|eot_id|>" and "<|EOT|>" are listed as separate entries so
# that either marker matches on its own (a single fused string would only match
# that exact combined text). No empty-string entry: "" is not a usable stop sequence.
stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both are interpolated into output_file.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# split_id and split_size build `suffix`; presumably they select one shard of the data (TODO confirm).
split_size: 256
split_id: 0
max_num_seqs: 16
global_batch_size: 128
# global_split_id appears in both the output_file name and the extra_file name below.
global_split_id: 0
suffix: ${split_id}-of-${split_size}
# Earlier output locations are kept commented out; the active name encodes the
# global split (`glo-…-of-16`) and the local split (`loc-${suffix}`).
#output_file: ${output_dir}/oss-instruct-apps-train/${eval_sub_path}/train.tem1.0.n10.prefix.upper0.8.r0.3.sample20_per.completion.tem${tem}.n${n}.${suffix}.v2.0.json
#output_file: ${output_dir}/oss-apps-xcode-combine/${eval_sub_path}/train.0shot.tem1.0.n10.v2.0.prefix.upper0.8.r0.3.sample10_per.completion.tem${tem}.n${n}.${suffix}.v2.0.json
#output_file: ${output_dir}/oss-apps-xcode-combine/${eval_sub_path}/train.0shot.tem1.0.n64.v2.0.prefix.upper0.8.r0.3.sample10_per.completion.tem${tem}.n${n}.glo-${global_split_id}-of-8.loc-${suffix}.v2.0.json
output_file: ${output_dir}/oss-apps-xcode-combine/${eval_sub_path}/train.0shot.tem1.0.n64.v2.0.prefix.upper0.8.r0.3.sample32_per.completion.tem${tem}.n${n}.glo-${global_split_id}-of-16.loc-${suffix}.v2.0.json
# Same path as output_file with a trailing "l" appended.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Prompt pieces: chat_prefix and chat_connect are interpolated into the template units below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt body: `_target_` points at data.input_utils.read_text with the file to load.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
# Two aligners listed under concat_aligner: field_extract_aligner is configured
# with extra_file and the prefix / prefix_id fields (problem_id matched against
# id); flat_aligner then runs in `multi` mode on the same fields — presumably
# yielding one item per prefix (TODO confirm).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
# extract_fields: [ response, pred, prefix, prefix_id ]
extract_fields: [ prefix, prefix_id ]
# extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.V100.tp8dp16.v4.9.s42/oss-instruct-apps-train/checkpoint-700/split-32/train.0shot.tem1.0.n10.v2.0.prefix.upper0.8.r0.3.sample20_per.json
# extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp4dp16.v1.2.s42/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.prefix.upper0.8.r0.3.sample10_per.json
# extra_file: ${output_path_prefix}experiments/${exp_name}/oss-apps-xcode-combine/${eval_sub_path}/train.0shot.tem1.0.n64.v2.0.prefix.upper0.8.r0.3.sample10_per.${global_split_id}-of-8.json
extra_file: ${output_path_prefix}experiments/${exp_name}/oss-apps-xcode-combine/${eval_sub_path}/train.0shot.tem1.0.n64.v2.0.prefix.upper0.8.r0.3.sample32_per.${global_split_id}-of-16.json
# renamed_fields:
# response: orig_response
# pred: orig_pred
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ prefix, prefix_id ]
mode: multi
# The composition string appends "{prefix}" after the connector, so the prompt
# ends with the sampled prefix.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
prefix: "{prefix}"
composition: "{chat_prefix}{prompt}{chat_connect}{prefix}"
instruction:
# Items are indexed by prefix_id here rather than problem_id.
index_field: prefix_id
service_based: False
split_size: ${split_size}
split_id: ${split_id}
max_data_num: -1
flush_file: ${flush_file}
# exp_name is null in this file although output_dir and extra_file interpolate it;
# presumably supplied as an override (TODO confirm).
exp_name:
save_best: False
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
# Presumably data / tensor / pipeline parallel sizes (TODO confirm); dp_size is null here.
dp_size:
tp_size: 1
pp_size: 1
# Post-processing of generations: extractor, answer cleaner, and evaluator targets.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
name: tag
index_field: prefix_id
# test_case_field is null in this file.
test_case_field:
resume: True
evaluator:
_target_: post_processors.code.code.APPsEvaluator
saved_keys: [ "prefix_id", "prefix", "problem_id", ]
# completion_separator reuses the chat connector string.
completion_separator: ${chat_connect}
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are null in this file; presumably filled in at runtime (TODO confirm).
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/r2c/general_combine_train_v2_1_4o_non_sc.yaml
================================================
# Hydra composition: load the `hydra: default` group first, then this file
# (`_self_`), so the values written here take precedence.
defaults:
- hydra: default
- _self_
# Add the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Root prefixes for datasets, model checkpoints, and experiment outputs.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Earlier train_file choices are kept commented out; dev_file and test_file
# reuse the active one through interpolation.
#train_file: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
#train_file: /mnt/fangkai_blob/share/xCodeEval/xcode_train_4o_test_inputs_v1.json
#train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.shuf.combine.4o_ps_test_cases.sc_and_non_sc.json
dev_file: ${train_file}
test_file: ${train_file}
# Not interpolated anywhere else in this file; presumably read by the launching code (TODO confirm).
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class these keys configure.
# Ten samples per prompt at temperature 1.0.
sampling_params:
_target_: vllm.SamplingParams
n: 10
temperature: 1.0
# Stop sequences. "<|eot_id|>" and "<|EOT|>" are listed as separate entries so
# that either marker matches on its own (a single fused string would only match
# that exact combined text). No empty-string entry: "" is not a usable stop sequence.
stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both are interpolated into output_file.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# split_id and split_size build `suffix`; presumably they select one shard of the data (TODO confirm).
split_size: 32
split_id: 0
max_num_seqs: 128
global_batch_size: 256
suffix: ${split_id}-of-${split_size}
# Earlier output locations are kept commented out; the active one embeds tem, n, suffix and seed.
#output_file: ${output_dir}/oss-instruct-apps-train/${eval_sub_path}/train.0shot.tem${tem}.n${n}.${suffix}.v2.0.json
#output_file: ${output_dir}/xcode-train/${eval_sub_path}/train.0shot.tem${tem}.n${n}.${suffix}.v2.0.json
output_file: ${output_dir}/oss-apps-xcode-combine-4o-ps-tests/${eval_sub_path}/train.0shot.tem${tem}.n${n}.${suffix}.v2.1.s${seed}.json
# Same path as output_file with a trailing "l" appended.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Prompt pieces: chat_prefix and chat_connect are interpolated into the template units below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt body: `_target_` points at data.input_utils.read_text with the file to load.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionNameFixStarterCode
use_starter_code: True
# The composition string joins the named units in order: prefix, prompt, connector.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
composition: "{chat_prefix}{prompt}{chat_connect}"
instruction:
index_field: "problem_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
max_data_num: -1
flush_file: ${flush_file}
# exp_name is null in this file although output_dir interpolates it; presumably supplied as an override (TODO confirm).
exp_name:
save_best: False
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
# Presumably data / tensor / pipeline parallel sizes (TODO confirm); dp_size is null here.
dp_size:
tp_size: 1
pp_size: 1
# Post-processing of generations: extractor, answer cleaner, and evaluator targets.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
# name: standard
name: tag
index_field: problem_id
resume: True
# Test cases are taken from the input_output_non_sc field in this config.
test_case_field: input_output_non_sc
evaluator:
_target_: post_processors.code.code.APPsEvaluator
saved_keys: [ "problem_id", "input_output", "input_output_non_sc", "question", "starter_code" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are null in this file; presumably filled in at runtime (TODO confirm).
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/r2c/sub_dev_v1_1.yaml
================================================
# Hydra composition: load the `hydra: default` group first, then this file
# (`_self_`), so the values written here take precedence.
defaults:
- hydra: default
- _self_
# Add the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Root prefixes for datasets, model checkpoints, and experiment outputs.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Dataset identifier; dev_file and test_file reuse it through interpolation.
train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}
# port and model are interpolated into service_processor (api_url / model) further down.
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class these keys configure.
sampling_params:
_target_: vllm.SamplingParams
# n: 20
# temperature: 1.0
# Active setting: a single sample at temperature 0.0.
n: 1
temperature: 0.0
# Stop sequences. "<|eot_id|>" and "<|EOT|>" are listed as separate entries so
# that either marker matches on its own (a single fused string would only match
# that exact combined text). No empty-string entry: "" is not a usable stop sequence.
stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both are interpolated into output_file.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# split_id and split_size build `suffix`; presumably they select one shard of the data (TODO confirm).
split_size: 8
split_id: 0
max_num_seqs: 64
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/apps/${eval_sub_path}/sub_dev.0shot.tem${tem}.n${n}.${suffix}.v1.1.json # v1.1 for extraction
# Same path as output_file with a trailing "l" appended.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Prompt pieces: chat_prefix and chat_connect are interpolated into the template units below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt body: `_target_` points at data.input_utils.read_text with the file to load.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# Reader is configured with split "train" and train_sub_split "val".
read_fn:
_target_: data.apps.APPsReader
split: train
train_sub_split: val
# template: ${chat_prefix}${prompt}${chat_connect}
# The composition string joins the named units in order: prefix, prompt, connector.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
composition: "{chat_prefix}{prompt}{chat_connect}"
instruction:
index_field: "problem_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for the completions endpoint (the chat endpoint is kept
# commented out); stop / temperature / n mirror sampling_params.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
# api_url: http://0.0.0.0:${port}/v1/chat/completions
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
# system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# Experiment name; output_dir is built from output_path_prefix and exp_name.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_dpo.V100.tp4.dp2.v1.0.s42
save_best: False
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
# Presumably data / tensor / pipeline parallel sizes (TODO confirm); dp_size is null here.
dp_size:
tp_size: 1
pp_size: 1
# Post-processing of generations: extractor, answer cleaner, and evaluator targets.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
# name: standard
name: tag
index_field: problem_id
test_case_field: "input_output"
evaluator:
_target_: post_processors.code.code.APPsEvaluator
resume: True
saved_keys: [ "difficulty" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are null in this file; presumably filled in at runtime (TODO confirm).
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/r2c/sub_dev_v2_0.yaml
================================================
# Hydra composition: load the `hydra: default` group first, then this file
# (`_self_`), so the values written here take precedence.
defaults:
- hydra: default
- _self_
# Add the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Root prefixes for datasets, model checkpoints, and experiment outputs.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Dataset identifier; dev_file and test_file reuse it through interpolation.
train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}
# port and model are interpolated into service_processor (api_url / model) further down.
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class these keys configure.
sampling_params:
_target_: vllm.SamplingParams
# n: 20
# temperature: 1.0
# Active setting: a single sample at temperature 0.0.
n: 1
temperature: 0.0
# Stop sequences. "<|eot_id|>" and "<|EOT|>" are listed as separate entries so
# that either marker matches on its own (a single fused string would only match
# that exact combined text). No empty-string entry: "" is not a usable stop sequence.
stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both are interpolated into output_file.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# split_id and split_size build `suffix`; presumably they select one shard of the data (TODO confirm).
split_size: 8
split_id: 0
max_num_seqs: 32
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/apps/${eval_sub_path}/sub_dev.0shot.tem${tem}.n${n}.${suffix}.v2.0.json # v1.1 for extraction
# Same path as output_file with a trailing "l" appended.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Prompt pieces: chat_prefix and chat_connect are interpolated into the template units below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt body: `_target_` points at data.input_utils.read_text with the file to load.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# Reader is configured with split "train" and train_sub_split "val".
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: val
use_starter_code: True
# template: ${chat_prefix}${prompt}${chat_connect}
# The composition string joins the named units in order: prefix, prompt, connector.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
composition: "{chat_prefix}{prompt}{chat_connect}"
instruction:
index_field: "problem_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for the completions endpoint; stop / temperature / n mirror sampling_params.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# Experiment name; output_dir is built from output_path_prefix and exp_name.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_dpo.V100.tp4.dp2.v1.0.s42
save_best: False
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
# Presumably data / tensor / pipeline parallel sizes (TODO confirm); dp_size is null here.
dp_size:
tp_size: 1
pp_size: 1
# Post-processing of generations: extractor, answer cleaner, and evaluator targets.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
# name: standard
name: tag
index_field: problem_id
test_case_field: "input_output"
resume: True
evaluator:
_target_: post_processors.code.code.APPsEvaluator
saved_keys: [ "difficulty" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are null in this file; presumably filled in at runtime (TODO confirm).
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/r2c/train_v1_0.yaml
================================================
# Hydra composition: load the `hydra: default` group first, then this file
# (`_self_`), so the values written here take precedence.
defaults:
- hydra: default
- _self_
# Add the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Root prefixes for datasets, model checkpoints, and experiment outputs.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Dataset identifier; dev_file and test_file reuse it through interpolation.
train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}
# port and model are interpolated into service_processor (api_url / model) further down.
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class these keys configure.
# Ten samples per prompt at temperature 1.0.
sampling_params:
_target_: vllm.SamplingParams
n: 10
temperature: 1.0
# Stop sequences. "<|eot_id|>" and "<|EOT|>" are listed as separate entries so
# that either marker matches on its own (a single fused string would only match
# that exact combined text). No empty-string entry: "" is not a usable stop sequence.
stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both are interpolated into output_file.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# split_id and split_size build `suffix`; presumably they select one shard of the data (TODO confirm).
split_size: 8
split_id: 0
max_num_seqs: 64
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/apps/${eval_sub_path}/train.0shot.tem${tem}.n${n}.${suffix}.v1.1.json
# Same path as output_file with a trailing "l" appended.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Prompt pieces: chat_prefix and chat_connect are interpolated into the template units below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt body: `_target_` points at data.input_utils.read_text with the file to load.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.apps.APPsReader
split: train
# The composition string joins the named units in order: prefix, prompt, connector.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
composition: "{chat_prefix}{prompt}{chat_connect}"
instruction:
index_field: "problem_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for the completions endpoint (the chat endpoint is kept
# commented out); stop / temperature / n mirror sampling_params.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
# api_url: http://0.0.0.0:${port}/v1/chat/completions
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
# system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# exp_name is null in this file although output_dir interpolates it; presumably supplied as an override (TODO confirm).
exp_name:
save_best: False
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
# Presumably data / tensor / pipeline parallel sizes (TODO confirm); dp_size is null here.
dp_size:
tp_size: 1
pp_size: 1
# Post-processing of generations: extractor, answer cleaner, and evaluator targets.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
# name: standard
name: tag
index_field: problem_id
test_case_field: "input_output"
evaluator:
_target_: post_processors.code.code.APPsEvaluator
resume: True
saved_keys: [ "difficulty" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are null in this file; presumably filled in at runtime (TODO confirm).
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/r2c/train_v1_0_s43.yaml
================================================
# Hydra composition: load the `hydra: default` group first, then this file
# (`_self_`), so the values written here take precedence.
defaults:
- hydra: default
- _self_
# Add the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Root prefixes for datasets, model checkpoints, and experiment outputs.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Dataset identifier; dev_file and test_file reuse it through interpolation.
train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}
# port and model are interpolated into service_processor (api_url / model) further down.
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class these keys configure.
# Ten samples per prompt at temperature 1.0.
sampling_params:
_target_: vllm.SamplingParams
n: 10
temperature: 1.0
# Stop sequences. "<|eot_id|>" and "<|EOT|>" are listed as separate entries so
# that either marker matches on its own (a single fused string would only match
# that exact combined text). No empty-string entry: "" is not a usable stop sequence.
stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both are interpolated into output_file.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# split_id and split_size build `suffix`; presumably they select one shard of the data (TODO confirm).
split_size: 8
split_id: 0
max_num_seqs: 64
suffix: ${split_id}-of-${split_size}
# The "s43" tag in the file name is hard-coded; it matches the seed value (43) set near the end of this file.
output_file: ${output_dir}/apps/${eval_sub_path}/train.0shot.tem${tem}.n${n}.${suffix}.v1.1.s43.json
# Same path as output_file with a trailing "l" appended.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Prompt pieces: chat_prefix and chat_connect are interpolated into the template units below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt body: `_target_` points at data.input_utils.read_text with the file to load.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.apps.APPsReader
split: train
# The composition string joins the named units in order: prefix, prompt, connector.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
composition: "{chat_prefix}{prompt}{chat_connect}"
instruction:
index_field: "problem_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for the completions endpoint (the chat endpoint is kept
# commented out); stop / temperature / n mirror sampling_params.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
# api_url: http://0.0.0.0:${port}/v1/chat/completions
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
# system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# exp_name is null in this file although output_dir interpolates it; presumably supplied as an override (TODO confirm).
exp_name:
save_best: False
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
# Presumably data / tensor / pipeline parallel sizes (TODO confirm); dp_size is null here.
dp_size:
tp_size: 1
pp_size: 1
# Post-processing of generations: extractor, answer cleaner, and evaluator targets.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
# name: standard
name: tag
index_field: problem_id
test_case_field: "input_output"
evaluator:
_target_: post_processors.code.code.APPsEvaluator
resume: True
saved_keys: [ "difficulty" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 43
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are null in this file; presumably filled in at runtime (TODO confirm).
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/r2c/train_v2_0.yaml
================================================
# Hydra composition: load the `hydra: default` group first, then this file
# (`_self_`), so the values written here take precedence.
defaults:
- hydra: default
- _self_
# Add the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Root prefixes for datasets, model checkpoints, and experiment outputs.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Dataset identifier; dev_file and test_file reuse it through interpolation.
train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}
# port and model are interpolated into service_processor (api_url / model) further down.
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class these keys configure.
# Ten samples per prompt at temperature 1.0.
sampling_params:
_target_: vllm.SamplingParams
n: 10
temperature: 1.0
# Stop sequences. "<|eot_id|>" and "<|EOT|>" are listed as separate entries so
# that either marker matches on its own (a single fused string would only match
# that exact combined text). No empty-string entry: "" is not a usable stop sequence.
stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: 4
split_id: 0
max_num_seqs: 64
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/apps/${eval_sub_path}/train.0shot.tem${tem}.n${n}.${suffix}.v2.0.json
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Data loading
# Dataset spec; entries carrying `_target_` name the Python callables to build
# (presumably via Hydra instantiate - confirm against the caller).
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# Reader for the APPs train split.
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
use_starter_code: True
# Prompt template: the named units are joined according to `composition`.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
composition: "{chat_prefix}{prompt}{chat_connect}"
instruction:
index_field: "problem_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for a local vLLM completions endpoint; stop/temperature/n
# mirror sampling_params through interpolation.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
# api_url: http://0.0.0.0:${port}/v1/chat/completions
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
# system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# Left empty here although output_dir interpolates it - presumably supplied as an override; confirm.
exp_name:
save_best: False
# Left empty here; interpolated into output_file.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing: code extraction writing to output_file, with an APPs evaluator.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
# name: standard
name: tag
index_field: problem_id
test_case_field: "input_output"
evaluator:
_target_: post_processors.code.code.APPsEvaluator
saved_keys: [ "difficulty", "problem_id" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The four keys below are left empty (null) in this file; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/r2c/train_v2_0_prefix_completion.yaml
================================================
# Hydra defaults list: compose the `hydra: default` group, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Extra config search path for Hydra.
hydra:
searchpath:
- file://conf/
# Path prefixes; output_path_prefix is interpolated into output_dir and the aligner's extra_file.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Dataset identifier; dev/test reuse the train value through interpolation.
train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}
# Interpolated into the service_processor api_url / model entries.
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class to build.
sampling_params:
_target_: vllm.SamplingParams
n: 5
temperature: 1.0
# "<|eot_id|>" and "<|EOT|>" must be separate list entries; a single
# "<|eot_id|>, <|EOT|>" string would only match that literal comma-joined text.
# NOTE(review): the first entry is an empty string - confirm it is intended;
# vLLM may reject empty stop strings.
stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both appear in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# Sharding knobs (presumably this run processes shard `split_id` of `split_size` - confirm).
split_size: 64
split_id: 0
max_num_seqs: 32
# Shard tag embedded in the output file name, e.g. "0-of-64".
suffix: ${split_id}-of-${split_size}
# The "tem1.0.n10.prefix.upper0.8.r0.3" part is a fixed literal; only the later tem/n/suffix are interpolated.
output_file: ${output_dir}/apps/${eval_sub_path}/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem${tem}.n${n}.${suffix}.v2.0.json
# Same path with a trailing "l" appended (".json" -> ".jsonl").
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Prompt pieces; chat_prefix / chat_connect are referenced by the template units below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt text is loaded from a file through data.input_utils.read_text.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Data loading
# Dataset spec; entries carrying `_target_` name the Python callables to build
# (presumably via Hydra instantiate - confirm against the caller).
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# Reader for the APPs train split.
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
use_starter_code: True
# Two aligners combined by concat_aligner: the first pulls the listed fields from
# extra_file and renames four of them with an "orig_" prefix; the second works on
# prefix / prefix_id (exact semantics live in data.input_aligner - not shown here).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ response, pred, prefix, prefix_id, full_res, res ]
extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.0shot.tem1.0.n10.prefix.upper0.8.r0.3.json
renamed_fields:
response: orig_response
pred: orig_pred
full_res: orig_full_res
res: orig_res
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ prefix, prefix_id ]
mode: multi
# Prompt template: same units as the plain config plus a trailing "{prefix}" slot.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
prefix: "{prefix}"
composition: "{chat_prefix}{prompt}{chat_connect}{prefix}"
instruction:
# Items are indexed by prefix_id here rather than problem_id.
index_field: prefix_id
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for a local vLLM completions endpoint; stop/temperature/n
# mirror sampling_params through interpolation.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# Left empty here although output_dir interpolates it - presumably supplied as an override; confirm.
exp_name:
save_best: False
# Left empty here; interpolated into output_file.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing: code extraction writing to output_file, with an APPs evaluator.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
name: tag
index_field: prefix_id
test_case_field: "input_output"
evaluator:
_target_: post_processors.code.code.APPsEvaluator
# Includes the "orig_*" fields produced by the aligner's renamed_fields mapping.
saved_keys: [ "difficulty", "orig_response", "orig_pred", "prefix_id", "prefix", "problem_id", "orig_full_res", "orig_res" ]
# Uses the same string as chat_connect ("\n### Response:\n").
completion_separator: ${chat_connect}
resume: True
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The four keys below are left empty (null) in this file; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/r2c/xcode_train_v2_0.yaml
================================================
# Hydra defaults list: compose the `hydra: default` group, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Extra config search path for Hydra.
hydra:
searchpath:
- file://conf/
# Path prefixes; output_path_prefix is interpolated into output_dir further down.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Active input is the xCodeEval JSON; the magicoder path is kept only as a commented-out alternative.
#train_file: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
train_file: /mnt/fangkai_blob/share/xCodeEval/xcode_train_4o_test_inputs_v1.json
dev_file: ${train_file}
test_file: ${train_file}
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class to build.
sampling_params:
_target_: vllm.SamplingParams
n: 10
temperature: 1.0
# "<|eot_id|>" and "<|EOT|>" must be separate list entries; a single
# "<|eot_id|>, <|EOT|>" string would only match that literal comma-joined text.
# NOTE(review): the first entry is an empty string - confirm it is intended;
# vLLM may reject empty stop strings.
stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both appear in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# Sharding knobs (presumably this run processes shard `split_id` of `split_size` - confirm).
split_size: 16
split_id: 0
max_num_seqs: 64
global_batch_size: 512
# Shard tag embedded in the output file name, e.g. "0-of-16".
suffix: ${split_id}-of-${split_size}
# Active output goes under xcode-train/; the oss-instruct path is a commented-out alternative.
#output_file: ${output_dir}/oss-instruct-apps-train/${eval_sub_path}/train.0shot.tem${tem}.n${n}.${suffix}.v2.0.json
output_file: ${output_dir}/xcode-train/${eval_sub_path}/train.0shot.tem${tem}.n${n}.${suffix}.v2.0.json
# Same path with a trailing "l" appended (".json" -> ".jsonl").
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Prompt pieces; chat_prefix / chat_connect are referenced by the template units below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt text is loaded from a file through data.input_utils.read_text.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Data loading
# Dataset spec; entries carrying `_target_` name the Python callables to build
# (presumably via Hydra instantiate - confirm against the caller).
# Unlike the APPs configs, no service_processor entry is present in this file.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
# Prompt template: the named units are joined according to `composition`.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
composition: "{chat_prefix}{prompt}{chat_connect}"
instruction:
index_field: "problem_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
max_data_num: -1
flush_file: ${flush_file}
# Left empty here although output_dir interpolates it - presumably supplied as an override; confirm.
exp_name:
save_best: False
# Left empty here; interpolated into output_file.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing: code extraction writing to output_file, with an APPs evaluator.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
# name: standard
name: tag
index_field: problem_id
resume: True
# Left empty (null) in this config, unlike the APPs configs which set "input_output".
test_case_field:
evaluator:
_target_: post_processors.code.code.APPsEvaluator
saved_keys: [ "problem_id", "input_output", ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The four keys below are left empty (null) in this file; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/sub_dev_v1_1.yaml
================================================
# Hydra defaults list: compose the `hydra: default` group, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Extra config search path for Hydra.
hydra:
searchpath:
- file://conf/
# Path prefixes; output_path_prefix is interpolated into output_dir further down.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Dataset identifier; dev/test reuse the train value through interpolation.
train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}
# Interpolated into the service_processor api_url / model entries.
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings (n=1, temperature 0.0); `_target_` names vllm.SamplingParams as the class to build.
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
# "<|eot_id|>" and "<|EOT|>" must be separate list entries; a single
# "<|eot_id|>, <|EOT|>" string would only match that literal comma-joined text.
# NOTE(review): the first entry is an empty string - confirm it is intended;
# vLLM may reject empty stop strings.
stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both appear in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# Sharding knobs (presumably this run processes shard `split_id` of `split_size` - confirm).
split_size: 8
split_id: 0
max_num_seqs: 64
# Shard tag embedded in the output file name, e.g. "0-of-8".
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/apps/${eval_sub_path}/sub_dev.0shot.tem${tem}.n${n}.${suffix}.v1.1.json # v1.1 for extraction
# Same path with a trailing "l" appended (".json" -> ".jsonl").
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Prompt pieces; the template below concatenates chat_prefix, prompt and chat_connect.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Inline prompt with a "{question}" placeholder.
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Data loading
# Dataset spec; entries carrying `_target_` name the Python callables to build
# (presumably via Hydra instantiate - confirm against the caller).
# NOTE(review): no `flush_file: ${flush_file}` entry appears in this section, unlike
# the sibling configs - confirm whether that omission is intentional.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# Reader for the "val" sub-split of the APPs train split.
read_fn:
_target_: data.apps.APPsReader
split: train
train_sub_split: val
# Template is plain string interpolation: chat_prefix + prompt + chat_connect.
template: ${chat_prefix}${prompt}${chat_connect}
instruction:
index_field: "problem_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for a local vLLM completions endpoint; stop/temperature/n
# mirror sampling_params through interpolation.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
# api_url: http://0.0.0.0:${port}/v1/chat/completions
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
# system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
# Experiment name; interpolated into output_dir.
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.V100.w8.v1.1
save_best: False
# Left empty here; interpolated into output_file.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing: code extraction writing to output_file, with an APPs evaluator.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
name: standard
index_field: problem_id
test_case_field: "input_output"
evaluator:
_target_: post_processors.code.code.APPsEvaluator
saved_keys: [ "difficulty" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The four keys below are left empty (null) in this file; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/sub_dev_v2_0.yaml
================================================
# Hydra defaults list: compose the `hydra: default` group, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Extra config search path for Hydra.
hydra:
searchpath:
- file://conf/
# Path prefixes; output_path_prefix is interpolated into output_dir further down.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Dataset identifier; dev/test reuse the train value through interpolation.
train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}
# Interpolated into the service_processor api_url / model entries.
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings (n=1, temperature 0.0); `_target_` names vllm.SamplingParams as the class to build.
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
# "<|eot_id|>" and "<|EOT|>" must be separate list entries; a single
# "<|eot_id|>, <|EOT|>" string would only match that literal comma-joined text.
# NOTE(review): the first entry is an empty string - confirm it is intended;
# vLLM may reject empty stop strings.
stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both appear in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# Sharding knobs (presumably this run processes shard `split_id` of `split_size` - confirm).
split_size: 8
split_id: 0
max_num_seqs: 32
# Shard tag embedded in the output file name, e.g. "0-of-8".
suffix: ${split_id}-of-${split_size}
# NOTE(review): the trailing comment says "v1.1" while the file name tag is v2.0 - likely stale; confirm.
output_file: ${output_dir}/apps/${eval_sub_path}/sub_dev.0shot.tem${tem}.n${n}.${suffix}.v2.0.json # v1.1 for extraction
# Same path with a trailing "l" appended (".json" -> ".jsonl").
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Prompt pieces; the template below concatenates chat_prefix, prompt and chat_connect.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Inline prompt with a "{question}" placeholder.
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Data loading
# Dataset spec; entries carrying `_target_` name the Python callables to build
# (presumably via Hydra instantiate - confirm against the caller).
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# Reader for the "val" sub-split of the APPs train split, with starter code enabled.
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: val
use_starter_code: True
# Template is plain string interpolation: chat_prefix + prompt + chat_connect.
template: ${chat_prefix}${prompt}${chat_connect}
instruction:
index_field: "problem_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for a local vLLM completions endpoint; stop/temperature/n
# mirror sampling_params through interpolation.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
# api_url: http://0.0.0.0:${port}/v1/chat/completions
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
# system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# Experiment name; interpolated into output_dir.
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.V100.w8.v1.1
save_best: False
# Left empty here; interpolated into output_file.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing: code extraction writing to output_file, with an APPs evaluator.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
name: standard
index_field: problem_id
test_case_field: "input_output"
evaluator:
_target_: post_processors.code.code.APPsEvaluator
saved_keys: [ "difficulty" ]
resume: True
# A second num_workers value (16) in addition to the dataloader's 32 above.
num_workers: 16
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The four keys below are left empty (null) in this file; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/test_inputs_gen/sub_dev_v1_0.yaml
================================================
# Hydra defaults list: compose the `hydra: default` group, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Extra config search path for Hydra.
hydra:
searchpath:
- file://conf/
# Path prefixes; output_path_prefix is interpolated into output_dir further down.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Dataset identifier; dev/test reuse the train value through interpolation.
train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}
# Interpolated into the service_processor api_url / model entries.
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings (20 samples, temperature 0.8, top_p 0.7); `_target_` names
# vllm.SamplingParams as the class to build.
sampling_params:
_target_: vllm.SamplingParams
n: 20
temperature: 0.8
top_p: 0.7
# "<|eot_id|>" and "<|EOT|>" must be separate list entries; a single
# "<|eot_id|>, <|EOT|>" string would only match that literal comma-joined text.
# NOTE(review): the first entry is an empty string - confirm it is intended;
# vLLM may reject empty stop strings.
stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both appear in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# Sharding knobs (presumably this run processes shard `split_id` of `split_size` - confirm).
split_size: 8
split_id: 0
max_num_seqs: 64
# Shard tag embedded in the output file name, e.g. "0-of-8".
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/apps-test-inputs-gen/${eval_sub_path}/sub_dev.0shot.tem${tem}.n${n}.${suffix}.v1.0.json
# Same path with a trailing "l" appended (".json" -> ".jsonl").
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Prompt pieces; chat_prefix / chat_connect are referenced by the template units below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt text is loaded from a file through data.input_utils.read_text.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/test_input_gen_0shot_v1.0.txt
# Data loading
# Dataset spec; entries carrying `_target_` name the Python callables to build
# (presumably via Hydra instantiate - confirm against the caller).
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# Reader for the "val" sub-split of the APPs train split.
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: val
# Prompt template: the named units are joined according to `composition`.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
composition: "{chat_prefix}{prompt}{chat_connect}"
instruction:
index_field: "problem_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for a local vLLM completions endpoint; stop/temperature/n
# mirror sampling_params through interpolation.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
# api_url: http://0.0.0.0:${port}/v1/chat/completions
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
# system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# Left empty here although output_dir interpolates it - presumably supplied as an override; confirm.
exp_name:
save_best: False
# Left empty here; interpolated into output_file.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing: code extraction writing to output_file.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
name: tag
index_field: problem_id
# Literal value "placeholder", and the evaluator is left empty (null) in this config.
test_case_field: placeholder
evaluator:
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The four keys below are left empty (null) in this file; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/test_inputs_gen/test_v1_0.yaml
================================================
# Hydra defaults list: compose the `hydra: default` group, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Extra config search path for Hydra.
hydra:
searchpath:
- file://conf/
# Path prefixes; output_path_prefix is interpolated into output_dir further down.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Dataset identifier; dev/test reuse the train value through interpolation.
train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}
# Interpolated into the service_processor api_url / model entries.
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings (20 samples, temperature 0.8, top_p 0.7); `_target_` names
# vllm.SamplingParams as the class to build.
sampling_params:
_target_: vllm.SamplingParams
n: 20
temperature: 0.8
top_p: 0.7
# "<|eot_id|>" and "<|EOT|>" must be separate list entries; a single
# "<|eot_id|>, <|EOT|>" string would only match that literal comma-joined text.
# NOTE(review): the first entry is an empty string - confirm it is intended;
# vLLM may reject empty stop strings.
stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both appear in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# Sharding knobs (presumably this run processes shard `split_id` of `split_size` - confirm).
split_size: 8
split_id: 0
max_num_seqs: 64
# Shard tag embedded in the output file name, e.g. "0-of-8".
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/apps-test-inputs-gen/${eval_sub_path}/test.0shot.tem${tem}.n${n}.${suffix}.v1.0.json
# Same path with a trailing "l" appended (".json" -> ".jsonl").
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Prompt pieces; chat_prefix / chat_connect are referenced by the template units below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt text is loaded from a file through data.input_utils.read_text.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/test_input_gen_0shot_v1.0.txt
# Data loading
# Dataset spec; entries carrying `_target_` name the Python callables to build
# (presumably via Hydra instantiate - confirm against the caller).
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# Reader for the APPs test split.
read_fn:
_target_: data.apps.APPsWithFunctionName
split: test
# Prompt template: the named units are joined according to `composition`.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
composition: "{chat_prefix}{prompt}{chat_connect}"
instruction:
index_field: "problem_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for a local vLLM completions endpoint; stop/temperature/n
# mirror sampling_params through interpolation.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
# api_url: http://0.0.0.0:${port}/v1/chat/completions
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
# system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# Left empty here although output_dir interpolates it - presumably supplied as an override; confirm.
exp_name:
save_best: False
# Left empty here; interpolated into output_file.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing: code extraction writing to output_file.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
name: tag
index_field: problem_id
# Literal value "placeholder", and the evaluator is left empty (null) in this config.
test_case_field: placeholder
evaluator:
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The four keys below are left empty (null) in this file; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/train_v1_0.yaml
================================================
# Hydra defaults list: compose the `hydra: default` group, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Extra config search path for Hydra.
hydra:
searchpath:
- file://conf/
# Path prefixes; model_path_prefix is interpolated into output_file and output_dir in this config.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Dataset identifier; dev/test reuse the train value through interpolation.
train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}
# Interpolated into the service_processor api_url / model entries.
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class to build.
sampling_params:
_target_: vllm.SamplingParams
n: 10
temperature: 1.0
# "<|eot_id|>" and "<|EOT|>" must be separate list entries; a single
# "<|eot_id|>, <|EOT|>" string would only match that literal comma-joined text.
# NOTE(review): the first entry is an empty string - confirm it is intended;
# vLLM may reject empty stop strings.
stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both appear in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# Sharding knobs (presumably this run processes shard `split_id` of `split_size` - confirm).
split_size: 8
split_id: 0
max_num_seqs: 64
# Shard tag embedded in the output file name, e.g. "0-of-8".
suffix: ${split_id}-of-${split_size}
# Output is written under the model directory rather than an experiment directory.
output_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem${tem}.n${n}.${suffix}.v1.0.json
# Same path with a trailing "l" appended (".json" -> ".jsonl").
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Prompt pieces; the template below concatenates chat_prefix, prompt and chat_connect.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Inline prompt with a "{question}" placeholder.
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Data loading
# Dataset spec; entries carrying `_target_` name the Python callables to build
# (presumably via Hydra instantiate - confirm against the caller).
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# Reader for the APPs train split.
read_fn:
_target_: data.apps.APPsReader
split: train
# Disabled options kept for reference: an id aligner and a few-shot prompt file.
# aligner:
# _target_: data.input_aligner.add_id_aligner
# id_field: "id"
# few_shot_prompt:
# _target_: data.logiqav2.read_single_file
# file_path: data/prompts/ar_lsat/react/train_200006_1-G_1_1.txt
# Template is plain string interpolation: chat_prefix + prompt + chat_connect.
template: ${chat_prefix}${prompt}${chat_connect}
instruction:
index_field: "problem_id"
service_based: False
# message_compose_fn:
# _target_: data.input_utils.compose_message
# system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
split_size: ${split_size}
split_id: ${split_id}
# Request settings for a local vLLM completions endpoint; stop/temperature/n
# mirror sampling_params through interpolation.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
# api_url: http://0.0.0.0:${port}/v1/chat/completions
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
# system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# No exp_name key in this config; output_dir points at the model directory instead.
save_best: True
eval_sub_path:
output_dir: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing: code extraction writing to output_file, with an APPs evaluator.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
name: standard
index_field: problem_id
test_case_field: "input_output"
evaluator:
_target_: post_processors.code.code.APPsEvaluator
saved_keys: [ "difficulty" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The four keys below are left empty (null) in this file; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/deepseek_coder/train_v2_0.yaml
================================================
# Hydra defaults list: compose the `hydra: default` group, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Extra config search path for Hydra.
hydra:
searchpath:
- file://conf/
# Path prefixes; model_path_prefix is interpolated into output_file and output_dir in this config.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Dataset identifier; dev/test reuse the train value through interpolation.
train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}
# Interpolated into the service_processor api_url / model entries.
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class to build.
sampling_params:
_target_: vllm.SamplingParams
n: 10
temperature: 1.0
# "<|eot_id|>" and "<|EOT|>" must be separate list entries; a single
# "<|eot_id|>, <|EOT|>" string would only match that literal comma-joined text.
# NOTE(review): the first entry is an empty string - confirm it is intended;
# vLLM may reject empty stop strings.
stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both appear in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# Sharding knobs (presumably this run processes shard `split_id` of `split_size` - confirm).
split_size: 8
split_id: 0
max_num_seqs: 64
# Shard tag embedded in the output file name, e.g. "0-of-8".
suffix: ${split_id}-of-${split_size}
# Output is written under the model directory rather than an experiment directory.
output_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem${tem}.n${n}.${suffix}.v2.0.json
# Same path with a trailing "l" appended (".json" -> ".jsonl").
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Prompt pieces; the template below concatenates chat_prefix, prompt and chat_connect.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Inline prompt with a "{question}" placeholder.
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Data loading
# Dataset spec; entries carrying `_target_` name the Python callables to build
# (presumably via Hydra instantiate - confirm against the caller).
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# Reader for the "train" sub-split of the APPs train split, with starter code enabled.
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
# Template is plain string interpolation: chat_prefix + prompt + chat_connect.
template: ${chat_prefix}${prompt}${chat_connect}
instruction:
index_field: "problem_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for a local vLLM completions endpoint; stop/temperature/n
# mirror sampling_params through interpolation.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# No exp_name key in this config; output_dir points at the model directory instead.
save_best: True
eval_sub_path:
output_dir: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing: code extraction writing to output_file, with an APPs evaluator.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
name: standard
index_field: problem_id
test_case_field: "input_output"
evaluator:
_target_: post_processors.code.code.APPsEvaluator
saved_keys: [ "difficulty" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The four keys below are left empty (null) in this file; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/general_eval/dev_v2_0.yaml
================================================
# Hydra defaults list: compose the `hydra: default` group, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Extra config search path for Hydra.
hydra:
searchpath:
- file://conf/
# Path prefixes; output_path_prefix is interpolated into output_dir further down.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Dataset identifier; dev/test reuse the train value through interpolation.
train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}
port: 6000
# Left empty in this general-purpose config although service_processor interpolates it -
# presumably supplied as an override; confirm.
model:
# Decoding settings (n=1, temperature 0.0); `_target_` names vllm.SamplingParams as the class to build.
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
# "<|eot_id|>" and "<|EOT|>" must be separate list entries; a single
# "<|eot_id|>, <|EOT|>" string would only match that literal comma-joined text.
# NOTE(review): the first entry is an empty string - confirm it is intended;
# vLLM may reject empty stop strings.
stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both appear in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# Sharding knobs (presumably this run processes shard `split_id` of `split_size` - confirm).
split_size: 8
split_id: 0
max_num_seqs: 32
# Shard tag embedded in the output file name, e.g. "0-of-8".
suffix: ${split_id}-of-${split_size}
# NOTE(review): the trailing comment says "v1.1" while the file name tag is v2.0 - likely stale; confirm.
output_file: ${output_dir}/apps/${eval_sub_path}/test.0shot.tem${tem}.n${n}.${suffix}.v2.0.json # v1.1 for extraction
# Same path with a trailing "l" appended (".json" -> ".jsonl").
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Inline prompt with a "{question}" placeholder; no chat prefix/connect strings are defined in this config.
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Data loading
# Dataset spec; entries carrying `_target_` name the Python callables to build
# (presumably via Hydra instantiate - confirm against the caller).
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# Reader for the APPs test split, with starter code enabled.
read_fn:
_target_: data.apps.APPsWithFunctionName
split: test
use_starter_code: True
# Template is the bare prompt string.
template: ${prompt}
instruction:
index_field: "problem_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for a local vLLM completions endpoint; stop/temperature/n
# mirror sampling_params through interpolation.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
# api_url: http://0.0.0.0:${port}/v1/chat/completions
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
# system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# Experiment name; interpolated into output_dir.
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.V100.w8.v1.1
save_best: False
# Left empty here; interpolated into output_file.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing: code extraction writing to output_file, with an APPs evaluator.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
name: standard
index_field: problem_id
test_case_field: "input_output"
evaluator:
_target_: post_processors.code.code.APPsEvaluator
saved_keys: [ "difficulty" ]
resume: True
# A second num_workers value (16) in addition to the dataloader's 32 above.
num_workers: 16
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The four keys below are left empty (null) in this file; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/general_eval/dev_v2_1.yaml
================================================
# Hydra defaults list: compose the `hydra: default` group, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Extra config search path for Hydra.
hydra:
searchpath:
- file://conf/
# Path prefixes; output_path_prefix is interpolated into output_dir further down.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Dataset identifier; dev/test reuse the train value through interpolation.
train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}
port: 6000
# Left empty in this general-purpose config although service_processor interpolates it -
# presumably supplied as an override; confirm.
model:
# Decoding settings (n=1, temperature 0.0); `_target_` names vllm.SamplingParams as the class to build.
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
# "<|eot_id|>" and "<|EOT|>" must be separate list entries; a single
# "<|eot_id|>, <|EOT|>" string would only match that literal comma-joined text.
# NOTE(review): the first entry is an empty string - confirm it is intended;
# vLLM may reject empty stop strings.
stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings; both appear in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# Sharding knobs (presumably this run processes shard `split_id` of `split_size` - confirm).
split_size: 8
split_id: 0
max_num_seqs: 8
global_batch_size: 128
# Shard tag embedded in the output file name, e.g. "0-of-8".
suffix: ${split_id}-of-${split_size}
# NOTE(review): the trailing comment says "v1.1" while the file name tag is v2.1 - likely stale; confirm.
output_file: ${output_dir}/apps/${eval_sub_path}/test.0shot.tem${tem}.n${n}.${suffix}.v2.1.json # v1.1 for extraction
# Same path with a trailing "l" appended (".json" -> ".jsonl").
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Inline prompt with a "{question}" placeholder; this variant asks explicitly for a Python program.
prompt: "{question}\n\nPlease write a Python program to solve the above problem under the given time constraints and memory limits."
# Data loading
# Dataset spec; entries carrying `_target_` name the Python callables to build
# (presumably via Hydra instantiate - confirm against the caller).
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# Reader for the APPs test split, with starter code enabled.
read_fn:
_target_: data.apps.APPsWithFunctionName
split: test
use_starter_code: True
# Template is the bare prompt string.
template: ${prompt}
instruction:
index_field: "problem_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for a local vLLM completions endpoint; stop/temperature/n
# mirror sampling_params through interpolation.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
# api_url: http://0.0.0.0:${port}/v1/chat/completions
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
# system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.V100.w8.v1.1
save_best: False
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
name: standard
index_field: problem_id
test_case_field: "input_output"
evaluator:
_target_: post_processors.code.code.APPsEvaluator
saved_keys: [ "difficulty" ]
resume: True
num_workers: 16
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/apps/general_eval/dev_v2_2.yaml
================================================
# Hydra config: zero-shot code generation on the APPs test split through a vLLM
# completions endpoint; this variant asks for the answer in a ```python block
# and uses the "standard_default" answer cleaner.
# NOTE(review): nesting below was reconstructed from key order because the
# reviewed copy carried no indentation — confirm against the consuming code.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

train_file: "codeparrot/apps"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
# Null here; presumably supplied as a command-line override — TODO confirm.
model:

sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two stop strings; fused into a
  # single entry they would only match the literal text "<|eot_id|>, <|EOT|>".
  # NOTE(review): the empty first entry looks like a token lost in
  # transcription — confirm the intended value.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used to build the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: 8
split_id: 0
max_num_seqs: 8
global_batch_size: 128

suffix: ${split_id}-of-${split_size}
# The trailing version tag (v2.2) matches this config's file name.
output_file: ${output_dir}/apps/${eval_sub_path}/test.0shot.tem${tem}.n${n}.${suffix}.v2.2.json
# Appending "l" turns the ".json" path into a ".jsonl" path.
flush_file: ${output_file}l

apply_chat_template: False
add_generation_prompt: True

prompt: "{question}\n\nPlease write a Python program to solve the above problem under the given time constraints and memory limits. Your code should be put in the code block\n```python\n...\n```\n"

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: test
    use_starter_code: True
  template: ${prompt}
  instruction:
  index_field: "problem_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    # api_url: http://0.0.0.0:${port}/v1/chat/completions
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    # system_prompt: "You are a helpful assistant to help solve the complex reasoning problem."
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1
  flush_file: ${flush_file}

exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.V100.w8.v1.1
save_best: False
# Null renders as "None" inside ${output_file} unless overridden.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    name: standard_default
  index_field: problem_id
  test_case_field: "input_output"
  evaluator:
    _target_: post_processors.code.code.APPsEvaluator
  # NOTE(review): the three keys below are placed on post_process rather than
  # on evaluator by inference — confirm.
  saved_keys: [ "difficulty" ]
  resume: True
  num_workers: 16

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/human_eval/ds_coder/r2c/test_v1_0.yaml
================================================
# Hydra config: zero-shot HumanEval generation through a vLLM completions
# endpoint. The chat wrapper is spelled out by hand (apply_chat_template is
# False) and the task prompt is read from a text file.
# NOTE(review): nesting below was reconstructed from key order because the
# reviewed copy carried no indentation — confirm against the consuming code.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share
model_path_prefix: ${mount_dir}/share/models
output_path_prefix: ${mount_dir}/reward_modeling/

train_file: "openai_humaneval"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two stop strings; fused into a
  # single entry they would only match the literal text "<|eot_id|>, <|EOT|>".
  # NOTE(review): the empty first entry looks like a token lost in
  # transcription — confirm the intended value.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used to build the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32

output_file: ${output_dir}/human_eval/${eval_sub_path}/test.0shot.tem${tem}.n${n}.v1.0.json
# Appending "l" turns the ".json" path into a ".jsonl" path.
flush_file: ${output_file}l

apply_chat_template: False
add_generation_prompt: True

# Hand-written chat wrapper pieces; chat_prefix and chat_connect are composed
# around the prompt in read_tensor.template below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

prompt:
  _target_: data.input_utils.read_text
  # file_path: prompts/human_eval/r2c_prompt_0shot_v1.1.txt
  file_path: prompts/human_eval/r2c_prompt_0shot_v1.0.txt

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.human_eval.HumanEvalReader
  template:
    _target_: data.input_utils.compose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
    composition: "{chat_prefix}{prompt}{chat_connect}"
  instruction:
  index_field: "task_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1
  flush_file: ${flush_file}

# Null here, so ${output_dir} renders "experiments/None/" unless overridden;
# presumably set on the command line — TODO confirm.
exp_name:
save_best: False
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    # name: standard
    name: tag
  index_field: task_id
  test_case_field: "test"
  resume: True
  evaluator:
    _target_: post_processors.code.evaluator.HumanEvaluator
  # NOTE(review): the two keys below are placed on post_process rather than on
  # evaluator by inference — confirm.
  saved_keys: [ "prompt", "entry_point" ]
  num_workers: 16

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/human_eval/ds_coder/r2c/test_v1_0_local.yaml
================================================
# Hydra config: zero-shot HumanEval generation through a vLLM completions
# endpoint, writing results under a relative ../pretrained-models directory.
# The chat wrapper is spelled out by hand (apply_chat_template is False).
# NOTE(review): nesting below was reconstructed from key order because the
# reviewed copy carried no indentation — confirm against the consuming code.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

train_file: "openai_humaneval"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two stop strings; fused into a
  # single entry they would only match the literal text "<|eot_id|>, <|EOT|>".
  # NOTE(review): the empty first entry looks like a token lost in
  # transcription — confirm the intended value.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used to build the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32

output_file: ${output_dir}/human_eval/${eval_sub_path}/test.0shot.tem${tem}.n${n}.v1.0.json
# Appending "l" turns the ".json" path into a ".jsonl" path.
flush_file: ${output_file}l

apply_chat_template: False
add_generation_prompt: True

# Hand-written chat wrapper pieces; chat_prefix and chat_connect are composed
# around the prompt in read_tensor.template below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

prompt:
  _target_: data.input_utils.read_text
  # file_path: prompts/human_eval/r2c_prompt_0shot_v1.1.txt
  file_path: prompts/human_eval/r2c_prompt_0shot_v1.0.txt
#prompt: "Here is a programming problem as uncompleted function with docstring:\n\n{prompt}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.human_eval.HumanEvalReader
  template:
    _target_: data.input_utils.compose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
    composition: "{chat_prefix}{prompt}{chat_connect}"
  instruction:
  index_field: "task_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1

save_best: False
# Null renders as "None" inside ${output_file} unless overridden.
eval_sub_path:
output_dir: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    # name: standard
    name: tag
  index_field: task_id
  test_case_field: "test"
  evaluator:
    _target_: post_processors.code.evaluator.HumanEvaluator
  # NOTE(review): the two keys below are placed on post_process rather than on
  # evaluator by inference — confirm.
  saved_keys: [ "prompt", "entry_point" ]
  num_workers: 16

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/human_eval/ds_coder/r2c/test_v2_0_local.yaml
================================================
# Hydra config: zero-shot HumanEval generation through a vLLM completions
# endpoint, using the ds_coder_prompt_v1_0 prompt file and writing results
# under a relative ../pretrained-models directory.
# NOTE(review): nesting below was reconstructed from key order because the
# reviewed copy carried no indentation — confirm against the consuming code.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

train_file: "openai_humaneval"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two stop strings; fused into a
  # single entry they would only match the literal text "<|eot_id|>, <|EOT|>".
  # NOTE(review): the empty first entry looks like a token lost in
  # transcription — confirm the intended value.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used to build the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32

output_file: ${output_dir}/human_eval/${eval_sub_path}/test.0shot.tem${tem}.n${n}.v2.0.json
# Appending "l" turns the ".json" path into a ".jsonl" path.
flush_file: ${output_file}l

apply_chat_template: False
add_generation_prompt: True

# Hand-written chat wrapper pieces; chat_prefix and chat_connect are composed
# around the prompt in read_tensor.template below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

prompt:
  _target_: data.input_utils.read_text
  # file_path: prompts/human_eval/r2c_prompt_0shot_v1.1.txt
  # file_path: prompts/human_eval/r2c_prompt_0shot_v1.0.txt
  file_path: prompts/human_eval/ds_coder_prompt_v1_0.txt
#prompt: "Here is a programming problem as uncompleted function with docstring:\n\n{prompt}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.human_eval.HumanEvalReader
  template:
    _target_: data.input_utils.compose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
    composition: "{chat_prefix}{prompt}{chat_connect}"
  instruction:
  index_field: "task_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1

save_best: False
eval_sub_path: ""
output_dir: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    # name: standard
    name: tag
  index_field: task_id
  test_case_field: "test"
  evaluator:
    _target_: post_processors.code.evaluator.HumanEvaluator
  # NOTE(review): the two keys below are placed on post_process rather than on
  # evaluator by inference — confirm.
  saved_keys: [ "prompt", "entry_point" ]
  num_workers: 16

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/human_eval/ds_coder/r2c/test_v2_1_local.yaml
================================================
# Hydra config: zero-shot HumanEval generation through a vLLM completions
# endpoint, using the r2c_prompt_0shot_v1.2 prompt file and writing results
# under a relative ../pretrained-models directory.
# NOTE(review): nesting below was reconstructed from key order because the
# reviewed copy carried no indentation — confirm against the consuming code.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

train_file: "openai_humaneval"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two stop strings; fused into a
  # single entry they would only match the literal text "<|eot_id|>, <|EOT|>".
  # NOTE(review): the empty first entry looks like a token lost in
  # transcription — confirm the intended value.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used to build the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32

# Tagged v2.1 to match this config's file name. With the previous "v2.0" tag
# this resolved to the same path as test_v2_0_local.yaml (same output_dir and
# eval_sub_path), so the two runs overwrote each other's results.
output_file: ${output_dir}/human_eval/${eval_sub_path}/test.0shot.tem${tem}.n${n}.v2.1.json
# Appending "l" turns the ".json" path into a ".jsonl" path.
flush_file: ${output_file}l

apply_chat_template: False
add_generation_prompt: True

# Hand-written chat wrapper pieces; chat_prefix and chat_connect are composed
# around the prompt in read_tensor.template below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

prompt:
  _target_: data.input_utils.read_text
  # file_path: prompts/human_eval/r2c_prompt_0shot_v1.1.txt
  # file_path: prompts/human_eval/r2c_prompt_0shot_v1.0.txt
  file_path: prompts/human_eval/r2c_prompt_0shot_v1.2.txt
#prompt: "Here is a programming problem as uncompleted function with docstring:\n\n{prompt}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.human_eval.HumanEvalReader
  template:
    _target_: data.input_utils.compose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
    composition: "{chat_prefix}{prompt}{chat_connect}"
  instruction:
  index_field: "task_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1

save_best: False
eval_sub_path: ""
output_dir: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    # name: standard
    name: tag
  index_field: task_id
  test_case_field: "test"
  evaluator:
    _target_: post_processors.code.evaluator.HumanEvaluator
  # NOTE(review): the two keys below are placed on post_process rather than on
  # evaluator by inference — confirm.
  saved_keys: [ "prompt", "entry_point" ]
  num_workers: 16

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/human_eval/ds_coder/r2c/test_v2_2_local.yaml
================================================
# Hydra config: zero-shot HumanEval generation through a vLLM completions
# endpoint, using the r2c_prompt_0shot_v1.3 prompt file and writing results
# under a relative ../pretrained-models directory.
# NOTE(review): nesting below was reconstructed from key order because the
# reviewed copy carried no indentation — confirm against the consuming code.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

train_file: "openai_humaneval"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two stop strings; fused into a
  # single entry they would only match the literal text "<|eot_id|>, <|EOT|>".
  # NOTE(review): the empty first entry looks like a token lost in
  # transcription — confirm the intended value.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used to build the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32

# Tagged v2.2 to match this config's file name. With the previous "v2.0" tag
# this resolved to the same path as test_v2_0_local.yaml (same output_dir and
# eval_sub_path), so the runs overwrote each other's results.
output_file: ${output_dir}/human_eval/${eval_sub_path}/test.0shot.tem${tem}.n${n}.v2.2.json
# Appending "l" turns the ".json" path into a ".jsonl" path.
flush_file: ${output_file}l

apply_chat_template: False
add_generation_prompt: True

# Hand-written chat wrapper pieces; chat_prefix and chat_connect are composed
# around the prompt in read_tensor.template below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/human_eval/r2c_prompt_0shot_v1.3.txt
#prompt: "Here is a programming problem as uncompleted function with docstring:\n\n{prompt}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.human_eval.HumanEvalReader
  template:
    _target_: data.input_utils.compose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
    composition: "{chat_prefix}{prompt}{chat_connect}"
  instruction:
  index_field: "task_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1

save_best: False
eval_sub_path: ""
output_dir: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    # name: standard
    name: tag
  index_field: task_id
  test_case_field: "test"
  evaluator:
    _target_: post_processors.code.evaluator.HumanEvaluator
  # NOTE(review): the two keys below are placed on post_process rather than on
  # evaluator by inference — confirm.
  saved_keys: [ "prompt", "entry_point" ]
  num_workers: 16

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/human_eval/ds_coder/test_v1_0_local.yaml
================================================
# Hydra config: zero-shot HumanEval generation through a vLLM completions
# endpoint with an inline prompt string wrapped by a hand-written chat
# prefix/connector (apply_chat_template is False).
# NOTE(review): nesting below was reconstructed from key order because the
# reviewed copy carried no indentation — confirm against the consuming code.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

train_file: "openai_humaneval"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two stop strings; fused into a
  # single entry they would only match the literal text "<|eot_id|>, <|EOT|>".
  # NOTE(review): the empty first entry looks like a token lost in
  # transcription — confirm the intended value.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used to build the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32

# NOTE(review): with the defaults in this file, this path is identical to the
# one produced by r2c/test_v1_0_local.yaml — presumably eval_sub_path is
# overridden per run; confirm.
output_file: ${output_dir}/human_eval/${eval_sub_path}/test.0shot.tem${tem}.n${n}.v1.0.json
# Appending "l" turns the ".json" path into a ".jsonl" path.
flush_file: ${output_file}l

apply_chat_template: False
add_generation_prompt: True

# Hand-written chat wrapper pieces; chat_prefix and chat_connect are
# concatenated around the prompt in read_tensor.template below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

prompt: "Here is a programming problem as uncompleted function with docstring:\n\n{prompt}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.human_eval.HumanEvalReader
  template: ${chat_prefix}${prompt}${chat_connect}
  instruction:
  index_field: "task_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1

save_best: False
# Null renders as "None" inside ${output_file} unless overridden.
eval_sub_path:
output_dir: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    name: standard
  index_field: task_id
  test_case_field: "test"
  evaluator:
    _target_: post_processors.code.evaluator.HumanEvaluator
  # NOTE(review): the two keys below are placed on post_process rather than on
  # evaluator by inference — confirm.
  saved_keys: [ "prompt", "entry_point" ]
  num_workers: 16

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/human_eval/ds_coder/test_v2_0.yaml
================================================
# Hydra config: zero-shot HumanEval generation through a vLLM completions
# endpoint with apply_chat_template enabled; the hand-written chat wrapper
# strings are kept only as commented-out reference.
# NOTE(review): nesting below was reconstructed from key order because the
# reviewed copy carried no indentation — confirm against the consuming code.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

train_file: "openai_humaneval"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two stop strings; fused into a
  # single entry they would only match the literal text "<|eot_id|>, <|EOT|>".
  # NOTE(review): the empty first entry looks like a token lost in
  # transcription — confirm the intended value.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used to build the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32

output_file: ${output_dir}/human_eval/${eval_sub_path}/test.0shot.tem${tem}.n${n}.v2.0.json
# Appending "l" turns the ".json" path into a ".jsonl" path.
flush_file: ${output_file}l

apply_chat_template: True
add_generation_prompt: True

#chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
#chat_connect: "\n### Response:\n"
#chat_suffix: "\n<|EOT|>"

prompt: "Complete the following Python function according to the docstring:\n\n{prompt}"

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.human_eval.HumanEvalReader
  template: ${prompt}
  instruction:
  index_field: "task_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1

save_best: False
# Null renders as "None" inside ${output_file} unless overridden.
eval_sub_path:
output_dir: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    name: standard
  index_field: task_id
  test_case_field: "test"
  evaluator:
    _target_: post_processors.code.evaluator.HumanEvaluator
  # NOTE(review): the two keys below are placed on post_process rather than on
  # evaluator by inference — confirm.
  saved_keys: [ "prompt", "entry_point" ]
  num_workers: 16

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/human_eval/test_v2_1.yaml
================================================
# Hydra config: zero-shot HumanEval generation through a vLLM completions
# endpoint with apply_chat_template enabled and a short "complete the
# function" prompt.
# NOTE(review): nesting below was reconstructed from key order because the
# reviewed copy carried no indentation — confirm against the consuming code.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

train_file: "openai_humaneval"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two stop strings; fused into a
  # single entry they would only match the literal text "<|eot_id|>, <|EOT|>".
  # NOTE(review): the empty first entry looks like a token lost in
  # transcription — confirm the intended value.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used to build the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32

output_file: ${output_dir}/human_eval/${eval_sub_path}/test.0shot.tem${tem}.n${n}.v2.1.json
# Appending "l" turns the ".json" path into a ".jsonl" path.
flush_file: ${output_file}l

apply_chat_template: True
add_generation_prompt: True

#chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
#chat_connect: "\n### Response:\n"
#chat_suffix: "\n<|EOT|>"

prompt: "Complete the following Python function:\n\n{prompt}"

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.human_eval.HumanEvalReader
  template: ${prompt}
  instruction:
  index_field: "task_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1

save_best: False
# Null renders as "None" inside ${output_file} unless overridden.
eval_sub_path:
output_dir: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    name: standard
  index_field: task_id
  test_case_field: "test"
  evaluator:
    _target_: post_processors.code.evaluator.HumanEvaluator
  # NOTE(review): the two keys below are placed on post_process rather than on
  # evaluator by inference — confirm.
  saved_keys: [ "prompt", "entry_point" ]
  num_workers: 16

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/human_eval/test_v2_2.yaml
================================================
# Hydra config: zero-shot HumanEval generation through a vLLM completions
# endpoint with apply_chat_template enabled; the prompt asks for a ```python
# block and the "standard_default" answer cleaner is used.
# NOTE(review): nesting below was reconstructed from key order because the
# reviewed copy carried no indentation — confirm against the consuming code.
defaults:
  - hydra: default
  - _self_

hydra:
  searchpath:
    - file://conf/

train_file: "openai_humaneval"
dev_file: ${train_file}
test_file: ${train_file}

port: 6000
model: ds-coder-v1.5-chat

sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # "<|eot_id|>" and "<|EOT|>" are listed as two stop strings; fused into a
  # single entry they would only match the literal text "<|eot_id|>, <|EOT|>".
  # NOTE(review): the empty first entry looks like a token lost in
  # transcription — confirm the intended value.
  stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096

# Short aliases used to build the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32

output_file: ${output_dir}/human_eval/${eval_sub_path}/test.0shot.tem${tem}.n${n}.v2.2.json
# Appending "l" turns the ".json" path into a ".jsonl" path.
flush_file: ${output_file}l

apply_chat_template: True
add_generation_prompt: True

#chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
#chat_connect: "\n### Response:\n"
#chat_suffix: "\n<|EOT|>"

prompt: "Complete the following Python function:\n\n{prompt}\n\nPlease put your code in code block\n```python\n...\n```\nDo not change any code in the function head and do completion only."

# Data loading
read_tensor:
  _target_: data.combine_dataset.ResponseAlignDataset
  read_fn:
    _target_: data.human_eval.HumanEvalReader
  template: ${prompt}
  instruction:
  index_field: "task_id"
  service_based: False
  split_size: ${split_size}
  split_id: ${split_id}
  service_processor:
    _target_: data.vllm.VLLMRequestGenerator
    api_url: http://0.0.0.0:${port}/v1/completions
    max_tokens: 4096
    model: ${model}
    stop: ${sampling_params.stop}
    temperature: ${sampling_params.temperature}
    n: ${sampling_params.n}
  max_data_num: -1

save_best: False
# Null renders as "None" inside ${output_file} unless overridden.
eval_sub_path:
output_dir: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/

# Dataloader
num_workers: 32
prefetch_factor: 2

dp_size:
tp_size: 1
pp_size: 1

post_process:
  _target_: post_processors.code.code.CodeExtractor
  output_file: ${output_file}
  answer_clean:
    _target_: post_processors.code.clean.get
    name: standard_default
  index_field: task_id
  test_case_field: "test"
  evaluator:
    _target_: post_processors.code.evaluator.HumanEvaluator
  # NOTE(review): the two keys below are placed on post_process rather than on
  # evaluator by inference — confirm.
  saved_keys: [ "prompt", "entry_point" ]
  num_workers: 16

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1

ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1

# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/magicoder/llama3/test_case_input_gen_v1_0.yaml
================================================
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: ${data_path_prefix}/dataset/magicoder/data-oss_instruct-decontaminated-python.json
dev_file: ${train_file}
test_file: ${train_file}
port: 6000
model:
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
top_p: 1.0
stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>, <|EOT|>" ]
max_tokens: 1024
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: 2
split_id: 0
max_num_seqs: 64
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/apps-test-inputs-gen/${eval_sub_path}/sub_dev.0shot.tem${tem}.n${n}.${suffix}.v1.0.json
flush_file: ${output_file}l
apply_chat_template: True
add_generation_prompt: True
chat_prefix:
chat_connect:
chat_suffix:
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/test_input_gen_2shot_v2.1.txt
# Data loading
# Dataset is built by data.combine_dataset.ResponseAlignDataset; the template
# consists of the prompt text alone ("{prompt}").
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
template:
_target_: data.input_utils.compose_template
units:
prompt: ${prompt}
composition: "{prompt}"
instruction:
# Pairs the "[[Question]]" placeholder with the "problem" field — presumably
# substituted per example; confirm in the dataset code.
replacement:
"[[Question]]": "problem"
index_field: index
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request generator aimed at a completions endpoint on the configured port.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
# NOTE(review): hard-coded, while sampling_params sets max_tokens: 1024 —
# confirm which limit is meant to apply.
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# Run bookkeeping and output location.
exp_name:
save_best: False
eval_sub_path: ""
# NOTE(review): the doubled "/" after ${model_path_prefix} looks unintentional —
# confirm; it is left untouched here.
output_dir: ${model_path_prefix}//Meta-Llama-3.1-70B-Instruct/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Callback configured with the same output_file and index field used above.
post_process:
_target_: post_processors.openai_api_callback.SaveOnlyCallBack
output_file: ${output_file}
answer_clean:
index_field: index
resume: True
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# Left empty in the file; presumably populated at runtime — confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/magicoder/mistral/func_head_extract_v1_0.yaml
================================================
# Generation run over the Magicoder OSS-Instruct (python) data file; the
# prompt file configured further down is the "has function head" prompt.
# Hydra defaults list; `_self_` places this file's own values after the
# `hydra: default` entry.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Root locations; other paths are built from these through ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: ${data_path_prefix}/dataset/magicoder/data-oss_instruct-decontaminated-python.json
# dev and test alias the train file, so all three names resolve to the same data.
dev_file: ${train_file}
test_file: ${train_file}
# Interpolated into the completions api_url further down.
port: 6000
# Left empty here and referenced as ${model}; presumably supplied at launch — confirm.
model:
# Decoding settings instantiated as vllm.SamplingParams (one sample per
# prompt at temperature 0.0, at most 512 new tokens).
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
top_p: 1.0
# Every stop marker is its own list element; a comma-joined string such as
# "<|eot_id|>, <|EOT|>" would only ever match that literal text.
# NOTE(review): the leading "" entry is kept unchanged — vLLM is believed to
# reject empty stop strings, so confirm whether a real token was intended.
stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 512
# Short aliases of the sampling settings, used in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# Sharding knobs; both are referenced again in the data-loading section
# (presumably split_id picks one of split_size shards — confirm in the dataset).
split_size: 2
split_id: 0
max_num_seqs: 64
max_model_len: 4096
suffix: ${split_id}-of-${split_size}
# File name encodes temperature, n and the shard suffix.
output_file: ${output_dir}/apps-test-inputs-gen/${eval_sub_path}/oss_instruct_python.func_head_extract.tem${tem}.n${n}.${suffix}.v1.0.json
# Kept distinct from output_file (".jsonl" next to the ".json" result) so the
# incremental flush file and the final output never share one path.
flush_file: ${output_file}l
apply_chat_template: True
add_generation_prompt: True
chat_prefix:
chat_connect:
chat_suffix:
# Prompt text is loaded from a file through data.input_utils.read_text.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/magicoder/oss_has_function_head_v1_0.txt
# Data loading
# Dataset is built by data.combine_dataset.ResponseAlignDataset; the template
# consists of the prompt text alone ("{prompt}").
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
template:
_target_: data.input_utils.compose_template
units:
prompt: ${prompt}
composition: "{prompt}"
instruction:
# Pairs the "[[Question]]" placeholder with the "problem" field — presumably
# substituted per example; confirm in the dataset code.
replacement:
"[[Question]]": "problem"
index_field: index
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request generator aimed at a completions endpoint on the configured port.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
# NOTE(review): hard-coded, while sampling_params sets max_tokens: 512 —
# confirm which limit is meant to apply.
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# Run bookkeeping and output location.
exp_name:
save_best: False
eval_sub_path: ""
output_dir: ${model_path_prefix}/Mistral-Large-Instruct-2407/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Callback configured with the same output_file and index field used above.
post_process:
_target_: post_processors.openai_api_callback.SaveOnlyCallBack
output_file: ${output_file}
answer_clean:
index_field: index
resume: True
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# Left empty in the file; presumably populated at runtime — confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/magicoder/mistral/test_case_input_gen_v1_0.yaml
================================================
# Generation run over the Magicoder OSS-Instruct (python) data file, using
# the test-input-generation prompt configured further down.
# Hydra defaults list; `_self_` places this file's own values after the
# `hydra: default` entry.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Root locations; other paths are built from these through ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: ${data_path_prefix}/dataset/magicoder/data-oss_instruct-decontaminated-python.json
# dev and test alias the train file, so all three names resolve to the same data.
dev_file: ${train_file}
test_file: ${train_file}
# Interpolated into the completions api_url further down.
port: 6000
# Left empty here and referenced as ${model}; presumably supplied at launch — confirm.
model:
# Decoding settings instantiated as vllm.SamplingParams (one sample per
# prompt at temperature 0.0, at most 4096 new tokens).
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
top_p: 1.0
# Every stop marker is its own list element; a comma-joined string such as
# "<|eot_id|>, <|EOT|>" would only ever match that literal text.
# NOTE(review): the leading "" entry is kept unchanged — vLLM is believed to
# reject empty stop strings, so confirm whether a real token was intended.
stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Short aliases of the sampling settings, used in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
# Sharding knobs; both are referenced again in the data-loading section
# (presumably split_id picks one of split_size shards — confirm in the dataset).
split_size: 2
split_id: 0
max_num_seqs: 64
max_model_len: 4096
suffix: ${split_id}-of-${split_size}
# File name encodes temperature, n and the shard suffix.
output_file: ${output_dir}/apps-test-inputs-gen/${eval_sub_path}/sub_dev.0shot.tem${tem}.n${n}.${suffix}.v1.0.json
# Kept distinct from output_file (".jsonl" next to the ".json" result) so the
# incremental flush file and the final output never share one path.
flush_file: ${output_file}l
apply_chat_template: True
add_generation_prompt: True
chat_prefix:
chat_connect:
chat_suffix:
# Prompt text is loaded from a file through data.input_utils.read_text.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/test_input_gen_2shot_v2.1.txt
# Data loading
# Dataset is built by data.combine_dataset.ResponseAlignDataset; the template
# consists of the prompt text alone ("{prompt}").
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
template:
_target_: data.input_utils.compose_template
units:
prompt: ${prompt}
composition: "{prompt}"
instruction:
# Pairs the "[[Question]]" placeholder with the "problem" field — presumably
# substituted per example; confirm in the dataset code.
replacement:
"[[Question]]": "problem"
index_field: index
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request generator aimed at a completions endpoint on the configured port.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
# Hard-coded here rather than interpolated from sampling_params.max_tokens
# (both are 4096 in this file).
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# Run bookkeeping and output location.
exp_name:
save_best: False
eval_sub_path: ""
output_dir: ${model_path_prefix}/Mistral-Large-Instruct-2407/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Callback configured with the same output_file and index field used above.
post_process:
_target_: post_processors.openai_api_callback.SaveOnlyCallBack
output_file: ${output_file}
answer_clean:
index_field: index
resume: True
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# Left empty in the file; presumably populated at runtime — confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/4o_mathstral_train_0shot_v1_0.yaml
================================================
# Sampling run: n = 10 completions per question (temperature 1.0, top_p 0.9)
# over one "-of-20" slice of the MathScale-4o 500k de-contaminated questions.
# Hydra defaults list; `_self_` places this file's own values after the
# `hydra: default` entry.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Root locations; other paths are built from these through ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Selects which pre-split input file ("<id>-of-20") is read and names the output folder.
global_split_id: 0
# Only test_file is populated; train/dev are left empty.
train_file:
dev_file:
#test_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/train.330k.v1.0.boxed.${global_split_id}-of-11.json
test_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/train.500k.de_con.v1.0.boxed.${global_split_id}-of-20.json
# Interpolated into the completions api_url further down.
port: 6000
# Left empty here and referenced as ${model}; presumably supplied at launch — confirm.
model:
# Decoding settings instantiated as vllm.SamplingParams.
sampling_params:
_target_: vllm.SamplingParams
n: 10
temperature: 1.0
max_tokens: 4096
# NOTE(review): several entries are empty strings — confirm they are intended;
# vLLM is believed to reject an empty stop string.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
top_p: 0.9
# Short aliases of the sampling settings, used in the output file name and the request generator.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Second-level sharding of the selected input file (presumably split_id picks
# one of split_size shards — confirm in the dataset).
split_size: 64
split_id: 0
max_num_seqs: 64
max_model_len: 4096
global_batch_size: 512
suffix: ${split_id}-of-${split_size}
#output_file: ${output_dir}/${eval_sub_path}/mathscale4o/split-${global_split_id}-of-11/train.330k.boxed.v1.0.${global_split_id}-of-11.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
# File name encodes n, temperature, top_p, the shard suffix and the seed.
output_file: ${output_dir}/${eval_sub_path}/mathscale4o/500k-split-${global_split_id}-of-20/train.500k.de_con.boxed.v1.0.${global_split_id}-of-20.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.s${seed}.json
# output_file with a trailing "l", i.e. a ".jsonl" sibling of the ".json" output.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset built by data.combine_dataset.ResponseAlignDataset. The prompt is
# the question followed by a request to put the final answer in \boxed{}.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
template: "{question}\n\nPlease put your final answer within {instruction}."
# Supplies the literal "\boxed{}" text for the {instruction} slot of the template.
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator aimed at a completions endpoint on the configured port;
# its limits and sampling values are all interpolated from the settings above.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
# index_field: uuid
index_field: id
flush_file: ${flush_file}
# Run bookkeeping; step, exp_name and exp_notes are left empty in the file.
save_best: False
step:
exp_name:
exp_notes:
#output_dir: ${model_path_prefix}/mathstral-7B-v0.1/
# Outputs go under an experiment directory; exp_name is empty above, so it
# presumably has to be supplied at launch — confirm.
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Callback configured with the same output_file and "id" index field used above.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
# index_field: "uuid"
index_field: "id"
label_field: "label"
# saved_keys: [ "question", "uuid", "solution", "box_solution" ]
saved_keys: ["question", "solution", "box_solution"]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
# Also interpolated into output_file as the ".s${seed}" suffix.
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# Left empty in the file; presumably populated at runtime — confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/4o_mathstral_train_0shot_v1_0_completion.yaml
================================================
# Prefix-completion run: n = 3 continuations (temperature 1.0, top_p 0.9) for
# each prefix in a file stored under the SFT checkpoint directory.
# Hydra defaults list; `_self_` places this file's own values after the
# `hydra: default` entry.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Root locations; other paths are built from these through ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Appears in the input and output file names as "sample${sample_over_p}".
sample_over_p: 10
sft_model_dir: ${output_path_prefix}/experiments/mathstral.mathscale4o.sft.V100.tp2dp8.v2.0.s42/checkpoint-800/
# Only referenced by the commented-out paths in this file.
global_split_id: 0
# Only test_file is populated; train/dev are left empty.
train_file:
dev_file:
#test_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/split-0-of-11/train.330k.boxed.v1.0.0-of-11.0shot.n20.tem1.0.p0.9.upper0.7.r0.3.inter_step.json
#test_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/split-0-of-11/train.330k.boxed.v1.0.0-of-11.0shot.n20.tem1.0.p0.9.upper0.7.r0.3.filter_same.json
#test_file: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample${sample_over_p}.filter_same.${global_split_id}-of-4.json
test_file: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample${sample_over_p}.filter_same.json
# Interpolated into the completions api_url further down.
port: 6000
# Left empty here and referenced as ${model}; presumably supplied at launch — confirm.
model:
# Decoding settings instantiated as vllm.SamplingParams.
sampling_params:
_target_: vllm.SamplingParams
n: 3
temperature: 1.0
max_tokens: 2048
# NOTE(review): several entries are empty strings — confirm they are intended;
# vLLM is believed to reject an empty stop string.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
top_p: 0.9
# Short aliases of the sampling settings, used in the output file name and the request generator.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding knobs (presumably split_id picks one of split_size shards — confirm in the dataset).
split_size: 256
split_id: 0
max_num_seqs: 64
max_model_len: 2048
global_batch_size: 512
suffix: ${split_id}-of-${split_size}
#output_file: ${output_dir}/${eval_sub_path}/mathscale4o/split-${global_split_id}-of-11/train.330k.boxed.v1.0.${global_split_id}-of-11.n20.tem1.0.p0.9.upper0.7.r0.3.prefix_completion.n${n}.tem${tem}.p${top_p}.${suffix}.json
#output_file: ${output_dir}/${eval_sub_path}/mathscale4o/split-${global_split_id}-of-11/train.330k.boxed.v1.0.${global_split_id}-of-11.n20.tem1.0.p0.9.upper0.7.r0.3.filter_same.prefix_completion.n${n}.tem${tem}.p${top_p}.${suffix}.json
#output_file: ${sft_model_dir}/mathscale4o/split-${split_size}/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample${sample_over_p}.filter_same.${global_split_id}-of-4.prefix_completion.n${n}.tem${tem}.p${top_p}.${suffix}.json
# Written next to the input, under a "split-${split_size}" folder.
output_file: ${sft_model_dir}/mathscale4o/split-${split_size}/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample${sample_over_p}.filter_same.prefix_completion.n${n}.tem${tem}.p${top_p}.${suffix}.json
# output_file with a trailing "l", i.e. a ".jsonl" sibling of the ".json" output.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset built by data.combine_dataset.ResponseAlignDataset, with a
# flat_aligner configured for the "prefix" / "prefix_id" fields keyed by "id"
# (presumably one example per prefix — confirm in data.input_aligner).
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.flat_aligner
input_index_field: id
extract_field: [ "prefix", "prefix_id" ]
mode: "multi"
# The prefix is appended directly after the instruction sentence.
template: "{question}\n\nPlease put your final answer within {instruction}.{prefix}"
# Supplies the literal "\boxed{}" text for the {instruction} slot of the template.
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator aimed at a completions endpoint on the configured port;
# its limits and sampling values are all interpolated from the settings above.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: prefix_id
flush_file: ${flush_file}
# Run bookkeeping; step, exp_name and exp_notes are left empty in the file.
save_best: False
step:
exp_name:
exp_notes:
#output_dir: ${model_path_prefix}/mathstral-7B-v0.1/
# Outputs live in the SFT checkpoint directory.
output_dir: ${sft_model_dir}
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Callback configured with the same output_file and "prefix_id" index field used above.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "prefix_id"
label_field: "label"
# saved_keys: [ 'question', 'uuid', 'solution', 'box_solution', 'prefix', 'prefix_id']
saved_keys: [ 'question', 'id', 'prefix', 'prefix_id']
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# Left empty in the file; presumably populated at runtime — confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/4o_mathstral_train_0shot_v1_1.yaml
================================================
# Sampling run: n = 10 completions per question (temperature 1.0, top_p 1.0)
# over the mscale_300k_boxed.json question file.
# Hydra defaults list; `_self_` places this file's own values after the
# `hydra: default` entry.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Root locations; other paths are built from these through ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# NOTE(review): not referenced by any path in this file — confirm it is still needed.
global_split_id: 0
# Only test_file is populated; train/dev are left empty.
train_file:
dev_file:
test_file: ${data_path_prefix}/dataset/mathscale4o/mscale_300k_boxed.json
# Interpolated into the completions api_url further down.
port: 6000
# Left empty here and referenced as ${model}; presumably supplied at launch — confirm.
model:
# Decoding settings instantiated as vllm.SamplingParams.
sampling_params:
_target_: vllm.SamplingParams
n: 10
temperature: 1.0
max_tokens: 4096
# NOTE(review): several entries are empty strings — confirm they are intended;
# vLLM is believed to reject an empty stop string.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
top_p: 1.0
# Short aliases of the sampling settings, used in the output file name and the request generator.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding knobs (presumably split_id picks one of split_size shards — confirm in the dataset).
split_size: 512
split_id: 0
max_num_seqs: 64
max_model_len: 4096
global_batch_size: 256
suffix: ${split_id}-of-${split_size}
# File name encodes n, temperature, top_p, the shard suffix and the seed.
output_file: ${output_dir}/${eval_sub_path}/mathscale4o/mscale-v0.1-300k/mscale.v0.1.300k.v1.0.n${n}.tem${tem}.p${top_p}.${suffix}.s${seed}.json
# output_file with a trailing "l", i.e. a ".jsonl" sibling of the ".json" output.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset built by data.combine_dataset.ResponseAlignDataset. The prompt is
# the question followed by a request to put the final answer in \boxed{}.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
template: "{question}\n\nPlease put your final answer within {instruction}."
# Supplies the literal "\boxed{}" text for the {instruction} slot of the template.
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator aimed at a completions endpoint on the configured port;
# its limits and sampling values are all interpolated from the settings above.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
# index_field: uuid
index_field: id
flush_file: ${flush_file}
# Run bookkeeping; step, exp_name and exp_notes are left empty in the file.
save_best: False
step:
exp_name:
exp_notes:
#output_dir: ${model_path_prefix}/mathstral-7B-v0.1/
# Outputs go under an experiment directory; exp_name is empty above, so it
# presumably has to be supplied at launch — confirm.
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Callback configured with the same output_file and "id" index field used above.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "id"
label_field: "label"
saved_keys: ["question", "solution", "box_solution"]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
# Also interpolated into output_file as the ".s${seed}" suffix.
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# Left empty in the file; presumably populated at runtime — confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/4o_mathstral_train_0shot_v1_1_completion.yaml
================================================
# Prefix-completion run: n = 3 continuations (temperature 1.0, top_p 0.9) for
# each prefix in the mscale-v0.1-300k file stored under the SFT checkpoint directory.
# Hydra defaults list; `_self_` places this file's own values after the
# `hydra: default` entry.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Root locations; other paths are built from these through ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# NOTE(review): only the commented-out paths reference this value; the active
# test_file/output_file hard-code "sample10" — confirm 16 is intended.
sample_over_p: 16
sft_model_dir: ${output_path_prefix}/experiments/mathstral.mathscale4o.sft.V100.tp2dp8.v2.0.s42/checkpoint-800/
# Only referenced by the commented-out paths in this file.
global_split_id: 0
# Only test_file is populated; train/dev are left empty.
train_file:
dev_file:
#test_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/split-0-of-11/train.330k.boxed.v1.0.0-of-11.0shot.n20.tem1.0.p0.9.upper0.7.r0.3.inter_step.json
#test_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/split-0-of-11/train.330k.boxed.v1.0.0-of-11.0shot.n20.tem1.0.p0.9.upper0.7.r0.3.filter_same.json
#test_file: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample${sample_over_p}.filter_same.${global_split_id}-of-4.json
test_file: ${sft_model_dir}/mathscale4o/mscale-v0.1-300k/mscale.v0.1.300k.v1.0.n10.tem1.0.p1.0.upper0.7.r0.3.sample10.filter_same.json
# Interpolated into the completions api_url further down.
port: 6000
# Left empty here and referenced as ${model}; presumably supplied at launch — confirm.
model:
# Decoding settings instantiated as vllm.SamplingParams.
sampling_params:
_target_: vllm.SamplingParams
n: 3
temperature: 1.0
max_tokens: 2048
# NOTE(review): several entries are empty strings — confirm they are intended;
# vLLM is believed to reject an empty stop string.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
top_p: 0.9
# Short aliases of the sampling settings, used in the output file name and the request generator.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding knobs (presumably split_id picks one of split_size shards — confirm in the dataset).
split_size: 256
split_id: 0
max_num_seqs: 64
max_model_len: 2048
global_batch_size: 512
suffix: ${split_id}-of-${split_size}
#output_file: ${output_dir}/${eval_sub_path}/mathscale4o/split-${global_split_id}-of-11/train.330k.boxed.v1.0.${global_split_id}-of-11.n20.tem1.0.p0.9.upper0.7.r0.3.prefix_completion.n${n}.tem${tem}.p${top_p}.${suffix}.json
#output_file: ${output_dir}/${eval_sub_path}/mathscale4o/split-${global_split_id}-of-11/train.330k.boxed.v1.0.${global_split_id}-of-11.n20.tem1.0.p0.9.upper0.7.r0.3.filter_same.prefix_completion.n${n}.tem${tem}.p${top_p}.${suffix}.json
#output_file: ${sft_model_dir}/mathscale4o/split-${split_size}/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample${sample_over_p}.filter_same.${global_split_id}-of-4.prefix_completion.n${n}.tem${tem}.p${top_p}.${suffix}.json
# Written next to the input, under a "split-${split_size}" folder.
output_file: ${sft_model_dir}/mathscale4o/mscale-v0.1-300k/split-${split_size}/mscale.v0.1.300k.v1.0.n10.tem1.0.p1.0.upper0.7.r0.3.sample10.filter_same.prefix_completion.n${n}.tem${tem}.p${top_p}.${suffix}.json
# output_file with a trailing "l", i.e. a ".jsonl" sibling of the ".json" output.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset built by data.combine_dataset.ResponseAlignDataset, with a
# flat_aligner configured for the "prefix" / "prefix_id" fields keyed by "id"
# (presumably one example per prefix — confirm in data.input_aligner).
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.flat_aligner
input_index_field: id
extract_field: [ "prefix", "prefix_id" ]
mode: "multi"
# The prefix is appended directly after the instruction sentence.
template: "{question}\n\nPlease put your final answer within {instruction}.{prefix}"
# Supplies the literal "\boxed{}" text for the {instruction} slot of the template.
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator aimed at a completions endpoint on the configured port;
# its limits and sampling values are all interpolated from the settings above.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: prefix_id
flush_file: ${flush_file}
# Run bookkeeping; step, exp_name and exp_notes are left empty in the file.
save_best: False
step:
exp_name:
exp_notes:
#output_dir: ${model_path_prefix}/mathstral-7B-v0.1/
# Outputs live in the SFT checkpoint directory.
output_dir: ${sft_model_dir}
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Callback configured with the same output_file and "prefix_id" index field used above.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "prefix_id"
label_field: "label"
# saved_keys: [ 'question', 'uuid', 'solution', 'box_solution', 'prefix', 'prefix_id']
saved_keys: [ 'question', 'id', 'prefix', 'prefix_id']
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# Left empty in the file; presumably populated at runtime — confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/4o_mathstral_train_half_0shot_v1_0.yaml
================================================
# Sampling run: n = 10 completions per question (temperature 1.0, top_p 0.9)
# over one "-of-2" half of the MathScale-4o 500k de-contaminated questions.
# Hydra defaults list; `_self_` places this file's own values after the
# `hydra: default` entry.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Root locations; other paths are built from these through ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Selects which half ("<id>-of-2") is read and names the output folder.
global_split_id: 0
# Only test_file is populated; train/dev are left empty.
train_file:
dev_file:
test_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/train.500k-${global_split_id}-of-2.de_con.v1.0.boxed.json
# Interpolated into the completions api_url further down.
port: 6000
# Left empty here and referenced as ${model}; presumably supplied at launch — confirm.
model:
# Decoding settings instantiated as vllm.SamplingParams.
sampling_params:
_target_: vllm.SamplingParams
n: 10
temperature: 1.0
max_tokens: 4096
# NOTE(review): several entries are empty strings — confirm they are intended;
# vLLM is believed to reject an empty stop string.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
top_p: 0.9
# Short aliases of the sampling settings, used in the output file name and the request generator.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Second-level sharding of the selected half (presumably split_id picks one
# of split_size shards — confirm in the dataset).
split_size: 256
split_id: 0
max_num_seqs: 64
max_model_len: 4096
global_batch_size: 256
suffix: ${split_id}-of-${split_size}
# File name encodes n, temperature, top_p, the shard suffix and the seed.
output_file: ${output_dir}/${eval_sub_path}/mathscale4o/500k-half-${global_split_id}-of-2/train.500k-${global_split_id}-of-2.de_con.boxed.v1.0.n${n}.tem${tem}.p${top_p}.${suffix}.s${seed}.json
# output_file with a trailing "l", i.e. a ".jsonl" sibling of the ".json" output.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset built by data.combine_dataset.ResponseAlignDataset. The prompt is
# the question followed by a request to put the final answer in \boxed{}.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
template: "{question}\n\nPlease put your final answer within {instruction}."
# Supplies the literal "\boxed{}" text for the {instruction} slot of the template.
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator aimed at a completions endpoint on the configured port;
# its limits and sampling values are all interpolated from the settings above.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: id
flush_file: ${flush_file}
# Run bookkeeping; step, exp_name and exp_notes are left empty in the file.
save_best: False
step:
exp_name:
exp_notes:
#output_dir: ${model_path_prefix}/mathstral-7B-v0.1/
# Outputs go under an experiment directory; exp_name is empty above, so it
# presumably has to be supplied at launch — confirm.
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Callback configured with the same output_file and "id" index field used above.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
# index_field: "uuid"
index_field: "id"
label_field: "label"
# saved_keys: [ "question", "uuid", "solution", "box_solution" ]
saved_keys: ["question", "solution", "box_solution"]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
# Also interpolated into output_file as the ".s${seed}" suffix.
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# Left empty in the file; presumably populated at runtime — confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/4o_mathstral_train_half_0shot_v1_0_completion.yaml
================================================
# Prefix-completion run: n = 3 continuations (temperature 1.0, top_p 0.9) for
# each prefix in one "-of-2" half stored under the SFT checkpoint directory.
# Hydra defaults list; `_self_` places this file's own values after the
# `hydra: default` entry.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Root locations; other paths are built from these through ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# NOTE(review): not referenced anywhere in this file; the active paths
# hard-code "sample10" — confirm 16 is intended.
sample_over_p: 16
sft_model_dir: ${output_path_prefix}/experiments/mathstral.mathscale4o.sft.V100.tp2dp8.v2.0.s42/checkpoint-800/
# Selects which half ("<id>-of-2") is read and names the output folder.
global_split_id: 0
# Only test_file is populated; train/dev are left empty.
train_file:
dev_file:
#test_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/split-0-of-11/train.330k.boxed.v1.0.0-of-11.0shot.n20.tem1.0.p0.9.upper0.7.r0.3.inter_step.json
#test_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/split-0-of-11/train.330k.boxed.v1.0.0-of-11.0shot.n20.tem1.0.p0.9.upper0.7.r0.3.filter_same.json
test_file: ${sft_model_dir}/mathscale4o/500k-half-${global_split_id}-of-2/train.500k-${global_split_id}-of-2.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample10.filter_same.json
# Interpolated into the completions api_url further down.
port: 6000
# Left empty here and referenced as ${model}; presumably supplied at launch — confirm.
model:
# Decoding settings instantiated as vllm.SamplingParams.
sampling_params:
_target_: vllm.SamplingParams
n: 3
temperature: 1.0
max_tokens: 2048
# NOTE(review): several entries are empty strings — confirm they are intended;
# vLLM is believed to reject an empty stop string.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
top_p: 0.9
# Short aliases of the sampling settings, used in the output file name and the request generator.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding knobs (presumably split_id picks one of split_size shards — confirm in the dataset).
split_size: 512
split_id: 0
max_num_seqs: 64
max_model_len: 2048
suffix: ${split_id}-of-${split_size}
#output_file: ${output_dir}/${eval_sub_path}/mathscale4o/split-${global_split_id}-of-11/train.330k.boxed.v1.0.${global_split_id}-of-11.n20.tem1.0.p0.9.upper0.7.r0.3.prefix_completion.n${n}.tem${tem}.p${top_p}.${suffix}.json
#output_file: ${output_dir}/${eval_sub_path}/mathscale4o/split-${global_split_id}-of-11/train.330k.boxed.v1.0.${global_split_id}-of-11.n20.tem1.0.p0.9.upper0.7.r0.3.filter_same.prefix_completion.n${n}.tem${tem}.p${top_p}.${suffix}.json
# Written next to the input, under a "split-${split_size}" folder.
output_file: ${sft_model_dir}/mathscale4o/500k-half-${global_split_id}-of-2/split-${split_size}/train.500k-${global_split_id}-of-2.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample10.filter_same.prefix_completion.n${n}.tem${tem}.p${top_p}.${suffix}.json
# output_file with a trailing "l", i.e. a ".jsonl" sibling of the ".json" output.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset built by data.combine_dataset.ResponseAlignDataset, with a
# flat_aligner configured for the "prefix" / "prefix_id" fields keyed by "id"
# (presumably one example per prefix — confirm in data.input_aligner).
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.flat_aligner
input_index_field: id
extract_field: [ "prefix", "prefix_id" ]
mode: "multi"
# The prefix is appended directly after the instruction sentence.
template: "{question}\n\nPlease put your final answer within {instruction}.{prefix}"
# Supplies the literal "\boxed{}" text for the {instruction} slot of the template.
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator aimed at a completions endpoint on the configured port;
# its limits and sampling values are all interpolated from the settings above.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: prefix_id
flush_file: ${flush_file}
# Run bookkeeping; step, exp_name and exp_notes are left empty in the file.
save_best: False
step:
exp_name:
exp_notes:
#output_dir: ${model_path_prefix}/mathstral-7B-v0.1/
# Outputs live in the SFT checkpoint directory.
output_dir: ${sft_model_dir}
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Callback configured with the same output_file and "prefix_id" index field used above.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "prefix_id"
label_field: "label"
# saved_keys: [ 'question', 'uuid', 'solution', 'box_solution', 'prefix', 'prefix_id']
saved_keys: [ 'question', 'id', 'prefix', 'prefix_id']
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# Left empty in the file; presumably populated at runtime — confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/mathstral/deepseek_test_0shot_tem_v1_1.yaml
================================================
# Evaluation run on MATH_test.json: a single completion per question at
# temperature 0.0 / top_p 1.0, with no sharding (split_size: 1).
# Hydra defaults list; `_self_` places this file's own values after the
# `hydra: default` entry.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# All three path prefixes hang off a single mount point.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
# Only test_file is populated; train/dev are left empty.
train_file:
dev_file:
test_file: ${data_path_prefix}/MetaMath/data/test/MATH_test.json
# Interpolated into the completions api_url further down.
port: 6000
# Left empty here and referenced as ${model}; presumably supplied at launch — confirm.
model:
# Decoding settings instantiated as vllm.SamplingParams.
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): several entries are empty strings — confirm they are intended;
# vLLM is believed to reject an empty stop string.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
top_p: 1.0
# Short aliases of the sampling settings, used in the output file name and the request generator.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
split_size: 1
split_id: 0
max_num_seqs: 128
suffix: ${split_id}-of-${split_size}
# File name encodes n, temperature, top_p and the shard suffix.
output_file: ${output_dir}/${eval_sub_path}/math/math.test.v1.1.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
# output_file with a trailing "l" (".jsonl").
# NOTE(review): no later key in this file references ${flush_file} — confirm it is still used.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset built by data.combine_dataset.ResponseAlignDataset with the
# data.math.math_gold_answer_extractor_deepseek aligner. The prompt uses a
# "User: ... Assistant:" layout asking for step-by-step reasoning and a
# \boxed{} final answer.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
aligner:
_target_: data.math.math_gold_answer_extractor_deepseek
# presumably renames the dataset's "instruction" field to "question" — confirm in the aligner.
kv_mapping:
instruction: question
template: "User: {question}\nPlease reason step by step, and put your final answer within {instruction}.\n\nAssistant:"
instruction: "\\boxed{}" # Hack here! because {} will report error.
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator aimed at a completions endpoint on the configured port;
# its limits and sampling values are all interpolated from the settings above.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: "idx"
# Run bookkeeping; step, exp_name and exp_notes are left empty in the file.
save_best: False
step:
exp_name:
exp_notes:
# Outputs are written under the base model directory.
output_dir: ${model_path_prefix}/mathstral-7B-v0.1/
eval_sub_path: ""
# Dataloader
num_workers: 48
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Callback configured with the same output_file and "idx" index field used
# above; resume is disabled for this run.
post_process:
# _target_: post_processors.openai_api_callback.OpenAIMATHCallBack
_target_: post_processors.openai_api_callback.DeepSeekMathCallBack
output_file: ${output_file}
# answer_clean:
# _target_: data.math.math_boxed_answer_cleaner_proxy
# Both are given by name ("math") instead of a _target_ — presumably resolved
# inside the callback; confirm.
eval_fn: math
answer_clean: math
resume: False
index_field: "idx"
label_field: "label"
saved_keys: [ "question", "output" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# Left empty in the file; presumably populated at runtime — confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/mathstral/mistral_mathscale4o_labeling.yaml
================================================
# Labeling run over the MathScale-4o 500k de-contaminated file: one short
# completion (max_tokens: 128) per item at temperature 0.0.
# Hydra defaults list; `_self_` places this file's own values after the
# `hydra: default` entry.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# All three path prefixes hang off a single mount point.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
# Only test_file is populated; train/dev are left empty.
train_file:
dev_file:
#test_file: ${data_path_prefix}/dataset/mathscale4o/concept2prompts.4o.t1.0.extract_qa.330k.json
test_file: ${data_path_prefix}/dataset/mathscale4o/mathscale4o.500k.de_con.json
# Interpolated into the completions api_url further down.
port: 6000
# Left empty here and referenced as ${model}; presumably supplied at launch — confirm.
model:
# Decoding settings instantiated as vllm.SamplingParams.
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 128
# NOTE(review): several entries are empty strings — confirm they are intended;
# vLLM is believed to reject an empty stop string.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
top_p: 1.0
# Short aliases of the sampling settings, used in the output file name and the request generator.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding knobs (presumably split_id picks one of split_size shards — confirm in the dataset).
split_size: 100
split_id: 0
max_num_seqs: 64
suffix: ${split_id}-of-${split_size}
# File name encodes n, temperature, top_p and the shard suffix.
output_file: ${output_dir}/mathscale4o/${eval_sub_path}/labeling/4o.500k.de_con.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
# output_file with a trailing "l" (".jsonl").
# NOTE(review): no later key in this file references ${flush_file} — confirm it is still used.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset built by data.combine_dataset.ResponseAlignDataset. The aligner
# reads the "solution" field; the template then continues from
# {solution_wo_suffix} (presumably produced by that aligner — confirm in
# data.mathscale.util).
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.mathscale.util.mathscale_extract_answer_fn_v3
completion_field: solution
template: "{question}\n\nPlease put your final answer within {instruction}.\n\n{solution_wo_suffix}"
# Supplies the literal "\boxed{}" text for the {instruction} slot of the template.
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator aimed at a completions endpoint on the configured port;
# its limits and sampling values are all interpolated from the settings above.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
# index_field: uuid
index_field: id
# Run bookkeeping; step, exp_name and exp_notes are left empty in the file.
save_best: False
step:
exp_name:
exp_notes:
# Outputs are written under the base model directory.
output_dir: ${model_path_prefix}/mathstral-7B-v0.1
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Callback configured with the same output_file and "id" index field used
# above; resume is disabled for this run.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: False
# index_field: "uuid"
index_field: "id"
label_field: "label"
# saved_keys: [ "question", "completion", "prompt", "solution", "uuid", "solution_wo_suffix" ]
saved_keys: [ "question", "solution", "solution_wo_suffix" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# Left empty in the file; presumably populated at runtime — confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/mathstral/mistral_train_0shot_iter0_v1_0.yaml
================================================
# Sampling run: n = 10 completions per question (temperature 1.0, top_p 0.9)
# over the full MathScale train.v60.300k file.
# Hydra defaults list; `_self_` places this file's own values after the
# `hydra: default` entry.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Root locations; other paths are built from these through ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Only test_file is populated; train/dev are left empty.
train_file:
dev_file:
#test_file: ${data_path_prefix}/dataset/mathscale/train.v60.300k.1-of-30.json
test_file: ${data_path_prefix}/dataset/mathscale/train.v60.300k.all.json
# Interpolated into the completions api_url further down.
port: 6000
# Left empty here and referenced as ${model}; presumably supplied at launch — confirm.
model:
# Decoding settings instantiated as vllm.SamplingParams.
sampling_params:
_target_: vllm.SamplingParams
n: 10
temperature: 1.0
max_tokens: 4096
# NOTE(review): several entries are empty strings — confirm they are intended;
# vLLM is believed to reject an empty stop string.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
top_p: 0.9
# Short aliases of the sampling settings, used in the output file name and the request generator.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding knobs (presumably split_id picks one of split_size shards — confirm in the dataset).
split_size: 16
split_id: 0
max_num_seqs: 32
suffix: ${split_id}-of-${split_size}
#output_file: ${output_dir}/mathscale/${eval_sub_path}/train.v60.300k.1-of-30.v1.2.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
# File name encodes n, temperature, top_p and the shard suffix.
output_file: ${output_dir}/mathscale/${eval_sub_path}/all_splits/train.v60.300k.all.v1.2.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
# output_file with a trailing "l" (".jsonl").
# NOTE(review): no later key in this file references ${flush_file} — confirm it is still used.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.mathscale.util.mathscale_extract_answer_fn_v3
completion_field: completion
- _target_: data.mathscale.util.extract_pure_prompt_aligner
template: "{question}\n\nPlease put your final answer within {instruction}."
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: id
save_best: False
step:
exp_name:
exp_notes:
#output_dir: ${output_path_prefix}/experiments/${exp_name}/
output_dir: ${model_path_prefix}/mathstral-7B-v0.1
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: False
index_field: "id"
label_field: "label"
saved_keys: [ "question", "completion", "prompt" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/mathstral/test_0shot_tem_v1_1.yaml
================================================
# Hydra config: 0-shot evaluation on MetaMath's MATH_test.json with a single
# completion per prompt at temperature 0.0 / top_p 1.0. Outputs land under the
# mathstral-7B-v0.1 model directory (see output_dir).
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Storage roots, all derived from mount_dir via ${...} interpolation.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
# Only test_file is populated; train_file and dev_file stay null.
train_file:
dev_file:
test_file: ${data_path_prefix}/MetaMath/data/test/MATH_test.json
# port is used in the service_processor api_url; model is null here and is
# presumably supplied as a command-line override (confirm with the launcher).
port: 6000
model:
# Arguments for vllm.SamplingParams (instantiated through _target_).
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the "" entries are empty stop strings. They may be special
# tokens that were lost, and vLLM is believed to reject empty stop strings;
# confirm the intended list.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
top_p: 1.0
# Short aliases of the sampling values, reused in output_file and in the
# service_processor settings.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding controls; with split_size 1 the suffix is always "0-of-1".
split_size: 1
split_id: 0
max_num_seqs: 32
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/${eval_sub_path}/math/math.test.v1.1.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
# output_file ends in ".json", so the appended "l" yields a ".jsonl" path.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset definition: ResponseAlignDataset with the MATH gold-answer extractor.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
aligner:
_target_: data.math.math_gold_answer_extractor
# Presumably maps the dataset's "instruction" field onto "question", the
# placeholder used by the template — confirm the mapping direction.
kv_mapping:
instruction: question
template: "{question}\n\nPlease put your final answer within {instruction}."
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
# service_based is False; service_processor describes requests against the
# /v1/completions endpoint on 0.0.0.0:${port}.
service_based: False
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: "idx"
save_best: False
# Run metadata, left null in this file.
step:
exp_name:
exp_notes:
#output_dir: ${model_path_prefix}/mathscale-mistral/
output_dir: ${model_path_prefix}/mathstral-7B-v0.1/
eval_sub_path: ""
# Dataloader
num_workers: 16
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the same output_file as above.
# Unlike the *_step variants of this config, no resume key is set here.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
index_field: "idx"
label_field: "label"
saved_keys: [ "question", "output" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/mathstral/test_0shot_tem_v1_1_step.yaml
================================================
# Hydra config: 0-shot evaluation on MetaMath's MATH_test.json with a single
# completion per prompt at temperature 0.0 / top_p 1.0, split into 8 shards.
# Outputs go to the experiment directory named by exp_name (see output_dir).
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Storage roots, all derived from mount_dir via ${...} interpolation.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
# Only test_file is populated; train_file and dev_file stay null.
train_file:
dev_file:
test_file: ${data_path_prefix}/MetaMath/data/test/MATH_test.json
# port is used in the service_processor api_url; model is null here and is
# presumably supplied as a command-line override (confirm with the launcher).
port: 6000
model:
# Arguments for vllm.SamplingParams (instantiated through _target_).
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the "" entries are empty stop strings. They may be special
# tokens that were lost, and vLLM is believed to reject empty stop strings;
# confirm the intended list.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>" ]
top_p: 1.0
# Short aliases of the sampling values, reused in output_file and in the
# service_processor settings.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding controls; suffix tags the output file name with "<id>-of-<size>".
split_size: 8
split_id: 0
max_num_seqs: 32
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/${eval_sub_path}/math/math.test.v1.1.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
# output_file ends in ".json", so the appended "l" yields a ".jsonl" path.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset definition: ResponseAlignDataset with the MATH gold-answer extractor.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
aligner:
_target_: data.math.math_gold_answer_extractor
# Presumably maps the dataset's "instruction" field onto "question", the
# placeholder used by the template — confirm the mapping direction.
kv_mapping:
instruction: question
template: "{question}\n\nPlease put your final answer within {instruction}."
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
# service_based is False; service_processor describes requests against the
# /v1/completions endpoint on 0.0.0.0:${port}.
service_based: False
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: "idx"
flush_file: ${flush_file}
save_best: False
# exp_name is null here but output_dir interpolates it, so it needs to be
# overridden for the output location to be meaningful.
step:
exp_name:
exp_notes:
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 16
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the same output_file as above.
# resume: True presumably lets a run continue from existing output — confirm.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
index_field: "idx"
label_field: "label"
saved_keys: [ "question", "output" ]
resume: True
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/mathstral/test_0shot_tem_v1_1_step_seed.yaml
================================================
# Hydra config: 0-shot sampling on MetaMath's MATH_test.json with one
# completion per prompt at temperature 1.0 / top_p 1.0. The output file name
# carries the seed (".s${seed}") so runs with different seeds do not collide.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Storage roots, all derived from mount_dir via ${...} interpolation.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
# Only test_file is populated; train_file and dev_file stay null.
train_file:
dev_file:
test_file: ${data_path_prefix}/MetaMath/data/test/MATH_test.json
# port is used in the service_processor api_url; model is null here and is
# presumably supplied as a command-line override (confirm with the launcher).
port: 6000
model:
# Arguments for vllm.SamplingParams (instantiated through _target_).
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 1.0
max_tokens: 4096
# NOTE(review): the "" entries are empty stop strings. They may be special
# tokens that were lost, and vLLM is believed to reject empty stop strings;
# confirm the intended list.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>" ]
top_p: 1.0
# Short aliases of the sampling values, reused in output_file and in the
# service_processor settings.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding and batching controls; with split_size 1 the suffix is "0-of-1".
split_size: 1
split_id: 0
max_num_seqs: 64
max_model_len: 4096
global_batch_size: 512
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/${eval_sub_path}/math/math.test.v1.1.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.s${seed}.json
# output_file ends in ".json", so the appended "l" yields a ".jsonl" path.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset definition: ResponseAlignDataset with the MATH gold-answer extractor.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
aligner:
_target_: data.math.math_gold_answer_extractor
# Presumably maps the dataset's "instruction" field onto "question", the
# placeholder used by the template — confirm the mapping direction.
kv_mapping:
instruction: question
template: "{question}\n\nPlease put your final answer within {instruction}."
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
# service_based is False; service_processor describes requests against the
# /v1/completions endpoint on 0.0.0.0:${port}.
service_based: False
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: "idx"
flush_file: ${flush_file}
save_best: False
# exp_name is null here but output_dir interpolates it, so it needs to be
# overridden for the output location to be meaningful.
step:
exp_name:
exp_notes:
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 16
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the same output_file as above.
# resume: True presumably lets a run continue from existing output — confirm.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
index_field: "idx"
label_field: "label"
saved_keys: [ "question", "output" ]
resume: True
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
# seed also appears in the output file name (".s${seed}").
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/mathstral/test_0shot_tem_v2_0_step.yaml
================================================
# Hydra config: 0-shot evaluation on the Qwen2.5-Math copy of the MATH test set
# (test.jsonl), one completion per prompt at temperature 0.0 / top_p 1.0,
# split into 8 shards. Records are keyed by "unique_id" and the template reads
# the "problem" field.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Storage roots, all derived from mount_dir via ${...} interpolation.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
# Only test_file is populated; train_file and dev_file stay null.
train_file:
dev_file:
test_file: ${data_path_prefix}/Qwen2.5-Math/evaluation/data/math/test.jsonl
# port is used in the service_processor api_url; model is null here and is
# presumably supplied as a command-line override (confirm with the launcher).
port: 6000
model:
# Arguments for vllm.SamplingParams (instantiated through _target_).
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the "" entries are empty stop strings. They may be special
# tokens that were lost, and vLLM is believed to reject empty stop strings;
# confirm the intended list.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>" ]
top_p: 1.0
# Short aliases of the sampling values, reused in output_file and in the
# service_processor settings.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding controls; suffix tags the output file name with "<id>-of-<size>".
split_size: 8
split_id: 0
max_num_seqs: 32
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/${eval_sub_path}/math/math.test.v2.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
# output_file ends in ".json", so the appended "l" yields a ".jsonl" path.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset definition: ResponseAlignDataset reading the file with jsonl_read_fn;
# no aligner is configured in this variant.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.input_utils.jsonl_read_fn
template: "{problem}\n\nPlease put your final answer within {instruction}."
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
# service_based is False; service_processor describes requests against the
# /v1/completions endpoint on 0.0.0.0:${port}.
service_based: False
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: "unique_id"
flush_file: ${flush_file}
save_best: False
# exp_name is null here but output_dir interpolates it, so it needs to be
# overridden for the output location to be meaningful.
step:
exp_name:
exp_notes:
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 16
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the same output_file as above.
# The gold answer is taken from the "answer" field (label_field).
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
index_field: "unique_id"
label_field: "answer"
saved_keys: [ "problem" ]
resume: True
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/mathstral/test_0shot_tem_v3_0_step.yaml
================================================
# Hydra config: 0-shot evaluation on MetaMath's MATH_test.json, one completion
# per prompt at temperature 0.0 / top_p 1.0, split into 8 shards, scored with
# Qwen25MathCallBack instead of MathScaleCallBack.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Storage roots, all derived from mount_dir via ${...} interpolation.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
# Only test_file is populated; train_file and dev_file stay null.
train_file:
dev_file:
#test_file: ${data_path_prefix}/Qwen2.5-Math/evaluation/data/math/test.jsonl
test_file: ${data_path_prefix}/MetaMath/data/test/MATH_test.json
# port is used in the service_processor api_url; model is null here and is
# presumably supplied as a command-line override (confirm with the launcher).
port: 6000
model:
# Arguments for vllm.SamplingParams (instantiated through _target_).
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the "" entries are empty stop strings. They may be special
# tokens that were lost, and vLLM is believed to reject empty stop strings;
# confirm the intended list.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>" ]
top_p: 1.0
# Short aliases of the sampling values, reused in output_file and in the
# service_processor settings.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding controls; suffix tags the output file name with "<id>-of-<size>".
split_size: 8
split_id: 0
max_num_seqs: 32
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/${eval_sub_path}/math/math.test.v3.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
# output_file ends in ".json", so the appended "l" yields a ".jsonl" path.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset definition: ResponseAlignDataset with the MATH gold-answer extractor
# (the jsonl reader is commented out together with the .jsonl test file above).
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# read_fn:
# _target_: data.input_utils.jsonl_read_fn
aligner:
_target_: data.math.math_gold_answer_extractor
# Presumably maps the dataset's "instruction" field onto "question", the
# placeholder used by the template — confirm the mapping direction.
kv_mapping:
instruction: question
template: "{question}\n\nPlease put your final answer within {instruction}."
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
# service_based is False; service_processor describes requests against the
# /v1/completions endpoint on 0.0.0.0:${port}.
service_based: False
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
# index_field: "unique_id"
index_field: "idx"
flush_file: ${flush_file}
save_best: False
# exp_name is null here but output_dir interpolates it, so it needs to be
# overridden for the output location to be meaningful.
step:
exp_name:
exp_notes:
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 16
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the same output_file as above.
# NOTE(review): label_field is "answer" here, whereas
# test_0shot_tem_v1_1_step.yaml uses "label" with the same test file and
# extractor — confirm which field Qwen25MathCallBack should read.
post_process:
_target_: post_processors.qwen25_math_callback.Qwen25MathCallBack
answer_clean:
output_file: ${output_file}
# index_field: "unique_id"
index_field: "idx"
label_field: "answer"
saved_keys: [ "question" ]
resume: True
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/mistral_train_0shot_v1_0.yaml
================================================
# Hydra config: sampling over MathScale training shard 1-of-30, requesting
# 5 completions per prompt at temperature 1.0 / top_p 0.9, split into 8
# sub-shards. The prompt is the record's "prompt" field as-is; answers are
# parsed around the "he answer is" separator.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Storage roots; the paths below are built from these via ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Only test_file is populated; train_file and dev_file stay null.
train_file:
dev_file:
test_file: ${data_path_prefix}/dataset/mathscale/train.v60.300k.1-of-30.json
# port is used in the service_processor api_url; model is null here and is
# presumably supplied as a command-line override (confirm with the launcher).
port: 6000
model:
# Arguments for vllm.SamplingParams (instantiated through _target_).
sampling_params:
_target_: vllm.SamplingParams
n: 5
temperature: 1.0
max_tokens: 4096
# NOTE(review): the "" entries are empty stop strings. They may be special
# tokens that were lost, and vLLM is believed to reject empty stop strings;
# confirm the intended list.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
top_p: 0.9
# Short aliases of the sampling values, reused in output_file and in the
# service_processor settings.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding controls; suffix tags the output file name with "<id>-of-<size>".
split_size: 8
split_id: 0
max_num_seqs: 32
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/${eval_sub_path}/mathscale/train.v60.300k.1-of-30.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
# output_file ends in ".json", so the appended "l" yields a ".jsonl" path.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset definition: ResponseAlignDataset with a chain of aligners.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.math.number_answer_extractor
# The separator omits the first letter, so it presumably matches both
# "The answer is" and "the answer is".
separator: "he answer is"
completion_field: completion
# kv_mapping:
# instruction: question
- _target_: data.input_aligner.add_id_aligner
id_field: id
# The template is just the record's prompt; instruction is left null.
template: "{prompt}"
instruction:
split_size: ${split_size}
split_id: ${split_id}
# service_based is False; service_processor describes requests against the
# /v1/completions endpoint on 0.0.0.0:${port}.
service_based: False
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: id
save_best: False
# Run metadata, left null in this file.
step:
exp_name:
exp_notes:
output_dir: ${model_path_prefix}/mathscale-mistral/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the same output_file as above.
# answer_clean is configured with the same separator as the extractor.
post_process:
_target_: post_processors.openai_api_callback.OpenAIMATHCallBack
# _target_: post_processors.openai_api_callback.DeepSeekMathCallBack
output_file: ${output_file}
answer_clean:
_target_: data.math.gsk8k_answer_cleaner
separator: "he answer is"
# eval_fn: math
# answer_clean: math
resume: False
index_field: "id"
label_field: "label"
saved_keys: [ "prompt", "completion" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/mistral_train_0shot_v1_1.yaml
================================================
# Hydra config: sampling over MathScale training shard 2-of-30, requesting
# 5 completions per prompt at temperature 1.0 / top_p 0.9, split into 8
# sub-shards. Answers are extracted with mathscale_extract_answer_fn_v2_list
# and scored by MathScaleCallBack.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Storage roots; the paths below are built from these via ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Only test_file is populated; train_file and dev_file stay null.
train_file:
dev_file:
test_file: ${data_path_prefix}/dataset/mathscale/train.v60.300k.2-of-30.json
# port is used in the service_processor api_url; model is null here and is
# presumably supplied as a command-line override (confirm with the launcher).
port: 6000
model:
# Arguments for vllm.SamplingParams (instantiated through _target_).
sampling_params:
_target_: vllm.SamplingParams
n: 5
temperature: 1.0
max_tokens: 4096
# NOTE(review): the "" entries are empty stop strings. They may be special
# tokens that were lost, and vLLM is believed to reject empty stop strings;
# confirm the intended list.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
top_p: 0.9
# Short aliases of the sampling values, reused in output_file and in the
# service_processor settings.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding controls; suffix tags the output file name with "<id>-of-<size>".
split_size: 8
split_id: 0
max_num_seqs: 32
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/mathscale/${eval_sub_path}/train.v60.300k.2-of-30.v1.1.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
# output_file ends in ".json", so the appended "l" yields a ".jsonl" path.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset definition: ResponseAlignDataset with a chain of aligners.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.mathscale.util.mathscale_extract_answer_fn_v2_list
# The separator omits the first letter, so it presumably matches both
# "The answer is" and "the answer is".
separator: "he answer is"
completion_field: completion
# kv_mapping:
# instruction: question
- _target_: data.input_aligner.add_id_aligner
id_field: id
# The template is just the record's prompt; instruction is left null.
template: "{prompt}"
instruction:
split_size: ${split_size}
split_id: ${split_id}
# service_based is False; service_processor describes requests against the
# /v1/completions endpoint on 0.0.0.0:${port}.
service_based: False
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: id
save_best: False
# exp_name is null here but output_dir interpolates it, so it needs to be
# overridden for the output location to be meaningful.
step:
exp_name:
exp_notes:
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the same output_file as above.
# index_field / label_field presumably name the id and gold-answer fields.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: False
index_field: "id"
label_field: "label"
saved_keys: [ "prompt", "completion" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/mistral_train_0shot_v1_2.yaml
================================================
# Hydra config: sampling over MathScale training shard 2-of-30, requesting
# 5 completions per prompt at temperature 1.0 / top_p 0.9, split into 8
# sub-shards. Answers are extracted with mathscale_extract_answer_fn_v3; the
# output file is tagged "v1.2.1".
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Storage roots; the paths below are built from these via ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Only test_file is populated; train_file and dev_file stay null.
train_file:
dev_file:
test_file: ${data_path_prefix}/dataset/mathscale/train.v60.300k.2-of-30.json
# port is used in the service_processor api_url; model is null here and is
# presumably supplied as a command-line override (confirm with the launcher).
port: 6000
model:
# Arguments for vllm.SamplingParams (instantiated through _target_).
sampling_params:
_target_: vllm.SamplingParams
n: 5
temperature: 1.0
max_tokens: 4096
# NOTE(review): the "" entries are empty stop strings. They may be special
# tokens that were lost, and vLLM is believed to reject empty stop strings;
# confirm the intended list.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
top_p: 0.9
# Short aliases of the sampling values, reused in output_file and in the
# service_processor settings.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding controls; suffix tags the output file name with "<id>-of-<size>".
split_size: 8
split_id: 0
max_num_seqs: 32
suffix: ${split_id}-of-${split_size}
# The active output_file uses the "v1.2.1" tag; the earlier "v1.2" name is kept
# commented out.
#output_file: ${output_dir}/mathscale/${eval_sub_path}/train.v60.300k.2-of-30.v1.2.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
output_file: ${output_dir}/mathscale/${eval_sub_path}/train.v60.300k.2-of-30.v1.2.1.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json # Align the implementation with Geyang: https://github.com/XingxingZhang/math_step/blob/iter_dev/dataset/pseudo/utils.py#L282-L328
# output_file ends in ".json", so the appended "l" yields a ".jsonl" path.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset definition: ResponseAlignDataset with a chain of aligners.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.mathscale.util.mathscale_extract_answer_fn_v3
completion_field: completion
# kv_mapping:
# instruction: question
- _target_: data.input_aligner.add_id_aligner
id_field: id
# The template is just the record's prompt; instruction is left null.
template: "{prompt}"
instruction:
split_size: ${split_size}
split_id: ${split_id}
# service_based is False; service_processor describes requests against the
# /v1/completions endpoint on 0.0.0.0:${port}.
service_based: False
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: id
save_best: False
# exp_name is null here but output_dir interpolates it, so it needs to be
# overridden for the output location to be meaningful.
step:
exp_name:
exp_notes:
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the same output_file as above.
# index_field / label_field presumably name the id and gold-answer fields.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: False
index_field: "id"
label_field: "label"
saved_keys: [ "prompt", "completion" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/numina_hard_train_0shot_v1_0_completion.yaml
================================================
# Hydra config: prefix-completion sampling on the Numina aops/aime/olympiad
# subset. Each prompt is the question plus a solution prefix, and 3
# completions are requested per prompt at temperature 1.0 / top_p 1.0.
# Input files live under the Ministral-8B-Instruct-2410 model directory.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Storage roots; the paths below are built from these via ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
data_dir: ${model_path_prefix}/Ministral-8B-Instruct-2410/numina/aops_aime_oly/
# global_split_id is null here but is interpolated into both test_file and
# output_file ("<id>-of-16"), so it needs to be supplied as an override.
global_split_id:
train_file:
dev_file:
test_file: ${data_dir}/cot.de_con.n64.tem1.0.p1.0.manual_step_tag.edit_dis_improve_pair.top1.cmp-step-correction.0shot.n1.tem0.0.p1.0.step-replace.${global_split_id}-of-16.json
# port is used in the service_processor api_url; model is null here and is
# presumably supplied as a command-line override (confirm with the launcher).
port: 6000
model:
# Arguments for vllm.SamplingParams (instantiated through _target_).
sampling_params:
_target_: vllm.SamplingParams
n: 3
temperature: 1.0
max_tokens: 4096
# NOTE(review): the "" entries are empty stop strings. They may be special
# tokens that were lost, and vLLM is believed to reject empty stop strings;
# confirm the intended list.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
top_p: 1.0
# Short aliases of the sampling values, reused in output_file and in the
# service_processor settings.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding and batching controls; with split_size 1 the suffix is "0-of-1".
split_size: 1
split_id: 0
max_num_seqs: 256
max_model_len: 4096
global_batch_size: 512
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/${eval_sub_path}/numina/aops_aime_oly/cot.de_con.n64.tem1.0.p1.0.manual_step_tag.edit_dis_improve_pair.top1.cmp-step-correction.n1.t0.0.p1.0.step-replace.${global_split_id}-of-16.n${n}.t${tem}.p${top_p}.${suffix}.s${seed}.json
# output_file ends in ".json", so the appended "l" yields a ".jsonl" path.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset definition: ResponseAlignDataset with a flat_aligner that extracts
# the "prefix" field in "multi" mode.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.flat_aligner
input_index_field: id
extract_field: [ "prefix" ]
mode: "multi"
# The template ends with {prefix}, so the prompt is followed directly by the
# prefix text — presumably the partial solution the model should continue.
template: "{question}\n\nPlease put your final answer within {instruction}.{prefix}"
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
# service_based is False; service_processor describes requests against the
# /v1/completions endpoint on 0.0.0.0:${port}.
service_based: False
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: id
flush_file: ${flush_file}
save_best: False
# exp_name is null here but output_dir interpolates it, so it needs to be
# overridden for the output location to be meaningful.
step:
exp_name:
exp_notes:
#output_dir: ${model_path_prefix}/mathstral-7B-v0.1/
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the same output_file as above.
# resume: True presumably lets a run continue from existing output — confirm.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "id"
label_field: "label"
saved_keys: [ "question", "completion", "source" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
# seed also appears in the output file name (".s${seed}").
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/numina_hard_train_0shot_v1_0_seed.yaml
================================================
# Hydra config: 0-shot sampling on the Numina aops/aime/olympiad subset
# (numina.aops_aime_oly.de_con.label.160k.json), one completion per prompt at
# temperature 1.0 / top_p 1.0. The output file name carries the seed so runs
# with different seeds do not collide.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Storage roots; the paths below are built from these via ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Only test_file is populated; train_file and dev_file stay null.
train_file:
dev_file:
test_file: ${data_path_prefix}/dataset/NuminaMath/numina.aops_aime_oly.de_con.label.160k.json
# port is used in the service_processor api_url; model is null here and is
# presumably supplied as a command-line override (confirm with the launcher).
port: 6000
model:
# Arguments for vllm.SamplingParams (instantiated through _target_).
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 1.0
max_tokens: 4096
# NOTE(review): the "" entries are empty stop strings. They may be special
# tokens that were lost, and vLLM is believed to reject empty stop strings;
# confirm the intended list.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
top_p: 1.0
# Short aliases of the sampling values, reused in output_file and in the
# service_processor settings.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding and batching controls; with split_size 1 the suffix is "0-of-1".
split_size: 1
split_id: 0
max_num_seqs: 256
max_model_len: 4096
global_batch_size: 512
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/${eval_sub_path}/numina/aops_aime_oly/cot.de_con.n${n}.tem${tem}.p${top_p}.${suffix}.s${seed}.json
# output_file ends in ".json", so the appended "l" yields a ".jsonl" path.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset definition: ResponseAlignDataset with no aligner; the template reads
# the "question" field directly.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
template: "{question}\n\nPlease put your final answer within {instruction}."
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
# service_based is False; service_processor describes requests against the
# /v1/completions endpoint on 0.0.0.0:${port}.
service_based: False
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: id
flush_file: ${flush_file}
save_best: False
# exp_name is null here but output_dir interpolates it, so it needs to be
# overridden for the output location to be meaningful.
step:
exp_name:
exp_notes:
#output_dir: ${model_path_prefix}/mathstral-7B-v0.1/
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the same output_file as above.
# resume: True presumably lets a run continue from existing output — confirm.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "id"
label_field: "label"
saved_keys: [ "question", "completion", "source" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
# seed also appears in the output file name (".s${seed}").
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/numina_rewrite_qwen25_0shot_v1_0.yaml
================================================
# Hydra config: rewrite pass over Numina olympiad solutions. The prompt text is
# loaded from prompts/numina/dynamic_cot/rewrite_0shot_v1.0.txt, the chat
# template is applied, and one completion is requested per prompt at
# temperature 0.0 with a 16384-token budget. Results are stored by
# SaveOnlyCallBack under the Qwen2.5-72B-Instruct directory.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Storage roots; the paths below are built from these via ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# NOTE(review): global_split_id is not referenced anywhere else in this file.
global_split_id: 0
train_file:
dev_file:
test_file: ${data_path_prefix}/dataset/NuminaMath/numina-olympiads-dynamic-cot-4o-box-45675.parse_and_sympy_eval.rm_tags.correct.v1.0.jsonl
# model is null here and is presumably supplied as a command-line override
# (confirm with the launcher).
port: 6000
model:
# Arguments for vllm.SamplingParams (instantiated through _target_).
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 16384
# NOTE(review): the "" entries are empty stop strings. They may be special
# tokens that were lost, and vLLM is believed to reject empty stop strings;
# confirm the intended list.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|endoftext|>", "<|im_end|>" ]
top_p: 1.0
# Short aliases of the sampling values.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding and batching controls; suffix tags the output file name with
# "<id>-of-<size>".
split_size: 8
split_id: 0
max_num_seqs: 64
max_model_len: 16384
global_batch_size: 128
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/${eval_sub_path}/numina/numina-olympiads-dynamic-cot-4o-box-45675.parse_and_sympy_eval.rm_tags.correct.rewrite.v1.0.${suffix}.json
# output_file ends in ".json", so the appended "l" yields a ".jsonl" path.
flush_file: ${output_file}l
apply_chat_template: True
add_generation_prompt: True
# Prompt text read from a file; it is referenced below as ${prompt}.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/numina/dynamic_cot/rewrite_0shot_v1.0.txt
# Dataset definition: ResponseAlignDataset reading the file with jsonl_read_fn;
# the template is composed solely from the prompt text.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.input_utils.jsonl_read_fn
template:
_target_: data.input_utils.compose_template
units:
prompt: ${prompt}
composition: "{prompt}"
instruction:
index_field: "idx"
# Presumably the "{solution}" placeholder in the prompt text is replaced by
# each record's "cleaned_response" field — confirm in ResponseAlignDataset.
replacement:
"{solution}": "cleaned_response"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
max_data_num: -1
flush_file: ${flush_file}
save_best: False
# Run metadata, left null in this file.
step:
exp_name:
exp_notes:
#output_dir: ${model_path_prefix}/mathstral-7B-v0.1/
output_dir: ${model_path_prefix}/Qwen2.5-72B-Instruct
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the same output_file as above.
# saved_keys lists the record fields named for saving.
post_process:
_target_: post_processors.openai_api_callback.SaveOnlyCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "idx"
label_field: "label"
saved_keys: [ 'source', 'problem', 'solution', 'completion', 'label', 'pred', 'res', 'cleaned_response', 'idx' ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/numina_train_0shot_v1_0.yaml
================================================
# Hydra config: 0-shot sampling over one of ten NuminaMath CoT splits (chosen
# by global_split_id), requesting 10 completions per prompt at temperature
# 1.0 / top_p 1.0. Each split is further divided into 8 shards.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Storage roots; the paths below are built from these via ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# global_split_id picks the "<id>-of-10" input file and is repeated in the
# output directory and file name.
global_split_id: 0
train_file:
dev_file:
test_file: ${data_path_prefix}/dataset/NuminaMath/numina-math-cot.de_con.label.${global_split_id}-of-10.json
# port is used in the service_processor api_url; model is null here and is
# presumably supplied as a command-line override (confirm with the launcher).
port: 6000
model:
# Arguments for vllm.SamplingParams (instantiated through _target_).
sampling_params:
_target_: vllm.SamplingParams
n: 10
temperature: 1.0
max_tokens: 4096
# NOTE(review): the "" entries are empty stop strings. They may be special
# tokens that were lost, and vLLM is believed to reject empty stop strings;
# confirm the intended list.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
top_p: 1.0
# Short aliases of the sampling values, reused in output_file and in the
# service_processor settings.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding and batching controls; suffix tags the output file name with
# "<id>-of-<size>".
split_size: 8
split_id: 0
max_num_seqs: 64
max_model_len: 4096
global_batch_size: 8192
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/${eval_sub_path}/numina/830k-split-${global_split_id}-of-10/cot.de_con.${global_split_id}-of-10.n${n}.tem${tem}.p${top_p}.${suffix}.s${seed}.json
# output_file ends in ".json", so the appended "l" yields a ".jsonl" path.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset definition: ResponseAlignDataset with no aligner; the template reads
# the "question" field directly.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
template: "{question}\n\nPlease put your final answer within {instruction}."
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
# service_based is False; service_processor describes requests against the
# /v1/completions endpoint on 0.0.0.0:${port}.
service_based: False
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: id
flush_file: ${flush_file}
save_best: False
# exp_name is null here but output_dir interpolates it, so it needs to be
# overridden for the output location to be meaningful.
step:
exp_name:
exp_notes:
#output_dir: ${model_path_prefix}/mathstral-7B-v0.1/
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the same output_file as above.
# resume: True presumably lets a run continue from existing output — confirm.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "id"
label_field: "label"
saved_keys: ["question", "completion"]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
# seed also appears in the output file name (".s${seed}").
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/numina_train_0shot_v1_0_completion.yaml
================================================
# vLLM sampling config for the Numina prefix-completion run.
# Composition order: the `hydra: default` group first, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Adds the local `conf/` directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Root paths that the interpolated paths below are built from.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Only referenced by commented-out path variants in this file.
sample_over_p: 32
# Checkpoint directory: `test_file` is read from under it and `output_dir` is set to it.
sft_model_dir: ${output_path_prefix}/experiments/mathstral.mathscale4o.process-dpo.iter0.V100.tp8dp48.v2.2.fix.s42/checkpoint-600/
# Earlier input files, kept commented out; the active choice is the uncommented entry.
#test_file_name: cot.de_con.n16.tem1.0.p1.0.split23.upper0.8.r0.3.sample8.filter_same.json
#test_file_name: cot.de_con.n16.tem1.0.p1.0.split01.upper0.8.r0.3.sample8.filter_same.json
#test_file_name: cot.de_con.n16.tem1.0.p1.0.split45.upper0.8.r0.3.sample8.filter_same.json
#test_file_name: cot.de_con.n16.tem1.0.p1.0.split67.upper0.8.r0.3.sample8.filter_same.json
#test_file_name: cot.de_con.n16.tem1.0.p1.0.split89.upper0.8.r0.3.sample8.filter_same.json
test_file_name: cot.de_con.n16.tem1.2.p1.0.split89.upper0.8.r0.3.sample32.filter_same.json
# Only referenced by commented-out path variants in this file.
global_split_id: 0
# Left empty (null) in this config.
train_file:
dev_file:
#test_file: ${data_path_prefix}/dataset/NuminaMath/numina-math-cot.de_con.label.${global_split_id}-of-10.json
#test_file: ${sft_model_dir}/numina/cot.de_con.n8.tem1.0.p1.0.s0.upper0.7.r0.3.sample${sample_over_p}.filter_same.${global_split_id}-of-4.json
#test_file: ${sft_model_dir}/numina/cot.de_con.n8.tem1.0.p1.0.orig_0-of-8.s0.upper0.7.r0.3.sample32.filter_same.json
test_file: ${sft_model_dir}/numina/${test_file_name}
# Port interpolated into the request URL (`api_url`).
port: 6000
# Left empty here; presumably supplied as a command-line override - confirm.
model:
# Decoding settings; `_target_` names vllm.SamplingParams as the class to build.
# Three samples per prompt at temperature 1.0, capped at 2048 new tokens.
sampling_params:
  _target_: vllm.SamplingParams
  n: 3
  temperature: 1.0
  max_tokens: 2048
  # Empty-string entries were removed from this list: vLLM rejects an empty
  # stop string when validating SamplingParams.
  # NOTE(review): if an empty entry stood in for a real stop token, add that
  # token back explicitly.
  stop: [ "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>" ]
  top_p: 1.0
# Top-level aliases of the sampling values; they are interpolated into the
# output file name and the request settings below.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Passed on to the dataset below; presumably selects shard `split_id` out of
# `split_size` - confirm against the dataset class.
split_size: 1024
split_id: 0
max_num_seqs: 64
max_model_len: 4096
global_batch_size: 512
# File-name suffix built from the two split values, e.g. `0-of-1024` with the values above.
suffix: ${split_id}-of-${split_size}
# Earlier output paths, kept commented out; the active one is the uncommented entry.
#output_file: ${output_dir}/${eval_sub_path}/numina/830k-split-${global_split_id}-of-10/cot.de_con.${global_split_id}-of-10.n${n}.tem${tem}.p${top_p}.${suffix}.s${seed}.json
#output_file: ${output_dir}/${eval_sub_path}/numina/completion-split-${split_size}/cot.de_con.n8.tem1.0.p1.0.s0.upper0.7.r0.3.sample${sample_over_p}.filter_same.${global_split_id}-of-4.n${n}.tem${tem}.p${top_p}.${suffix}.s${seed}.json
#output_file: ${output_dir}/${eval_sub_path}/numina/completion-split-${split_size}/cot.de_con.n8.tem1.0.p1.0.orig_0-of-8.s0.upper0.7.r0.3.sample32.filter_same.n${n}.tem${tem}.p${top_p}.${suffix}.s${seed}.json
#output_file: ${output_dir}/${eval_sub_path}/numina/cot.de_con.n16.tem1.0.p1.0.split23.upper0.8.r0.3.sample8.filter_same.n${n}.tem${tem}.p${top_p}.${suffix}.s${seed}.json
#output_file: ${output_dir}/${eval_sub_path}/numina/cot.de_con.n16.tem1.0.p1.0.split01.upper0.8.r0.3.sample8.filter_same.n${n}.tem${tem}.p${top_p}.${suffix}.s${seed}.json
#output_file: ${output_dir}/${eval_sub_path}/numina/cot.de_con.n16.tem1.0.p1.0.split45.upper0.8.r0.3.sample8.filter_same.n${n}.tem${tem}.p${top_p}.${suffix}.s${seed}.json
#output_file: ${output_dir}/${eval_sub_path}/numina/cot.de_con.n16.tem1.0.p1.0.split67.upper0.8.r0.3.sample8.filter_same.n${n}.tem${tem}.p${top_p}.${suffix}.s${seed}.json
#output_file: ${output_dir}/${eval_sub_path}/numina/cot.de_con.n16.tem1.0.p1.0.split89.upper0.8.r0.3.sample8.filter_same.n${n}.tem${tem}.p${top_p}.${suffix}.s${seed}.json
output_file: ${output_dir}/${eval_sub_path}/numina/cot.de_con.n16.tem1.2.p1.0.split89.upper0.8.r0.3.sample32.filter_same.n${n}.tem${tem}.p${top_p}.${suffix}.s${seed}.json
# Same path with an extra `l` appended, i.e. `...json` becomes `...jsonl`.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset construction; each `_target_` names the Python callable to build.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.flat_aligner
input_index_field: id
extract_field: [ "prefix", "prefix_id" ]
mode: "multi"
# `{question}`, `{instruction}` and `{prefix}` are placeholders; how they are
# filled is defined by the dataset class, not in this file.
template: "{question}\n\nPlease put your final answer within {instruction}.{prefix}"
# Double-quoted YAML: the value is the literal text \boxed{}.
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request settings for the completions endpoint on the local `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
# Records are keyed by `prefix_id` here and in `post_process` below.
index_field: prefix_id
flush_file: ${flush_file}
save_best: False
# Left empty (null) in this config.
step:
exp_name:
exp_notes:
#output_dir: ${model_path_prefix}/mathstral-7B-v0.1/
# Outputs are written next to the checkpoint the inputs came from.
output_dir: ${sft_model_dir}
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing stage; `_target_` names the callback to build.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "prefix_id"
label_field: "label"
saved_keys: [ "question", "id", "prefix", "prefix_id" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are left empty; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/test_0shot_tem_v1_1.yaml
================================================
# vLLM zero-shot evaluation config reading the MATH test file.
# Composition order: the `hydra: default` group first, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Adds the local `conf/` directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Single root directory; the three prefixes below are derived from it.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
# Left empty (null) in this config.
train_file:
dev_file:
test_file: ${data_path_prefix}/MetaMath/data/test/MATH_test.json
# Port interpolated into the request URL (`api_url`).
port: 6000
# Left empty here; presumably supplied as a command-line override - confirm.
model:
# Decoding settings; `_target_` names vllm.SamplingParams as the class to build.
# One completion per prompt at temperature 0.0, capped at 4096 new tokens.
sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  max_tokens: 4096
  # Empty-string entries were removed from this list: vLLM rejects an empty
  # stop string when validating SamplingParams.
  # NOTE(review): if an empty entry stood in for a real stop token, add that
  # token back explicitly.
  stop: [ "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>" ]
  top_p: 1.0
# Top-level aliases of the sampling values; they are interpolated into the
# output file name and the request settings below.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Passed on to the dataset below; presumably selects shard `split_id` out of
# `split_size` - confirm against the dataset class.
split_size: 1
split_id: 0
max_num_seqs: 32
# File-name suffix built from the two split values, e.g. `0-of-1` with the values above.
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/${eval_sub_path}/math/math.test.v1.1.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
# Same path with an extra `l` appended, i.e. `...json` becomes `...jsonl`.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset construction; each `_target_` names the Python callable to build.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
aligner:
_target_: data.math.math_gold_answer_extractor
kv_mapping:
instruction: question
# `{instruction}` and `{question}` are placeholders; the instruction text is
# the fixed preamble on the next key.
template: "{instruction}{question}\n\n### Response:"
instruction: "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request settings for the completions endpoint on the local `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
# Records are keyed by `idx` here and in `post_process` below.
index_field: "idx"
save_best: False
# Left empty (null) in this config.
step:
exp_name:
exp_notes:
output_dir: ${model_path_prefix}/mathscale-mistral/
eval_sub_path: ""
# Dataloader
num_workers: 16
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing stage; `_target_` names the callback to build.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
index_field: "idx"
label_field: "label"
saved_keys: [ "question", "output" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are left empty; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mathscale/test_0shot_tem_v1_1_step.yaml
================================================
# vLLM zero-shot evaluation config reading the MATH test file; outputs go
# under an experiment directory (see `output_dir` further down).
# Composition order: the `hydra: default` group first, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Adds the local `conf/` directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Root paths that the interpolated paths below are built from.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Left empty (null) in this config.
train_file:
dev_file:
test_file: ${data_path_prefix}/MetaMath/data/test/MATH_test.json
# Port interpolated into the request URL (`api_url`).
port: 6000
# Left empty here; presumably supplied as a command-line override - confirm.
model:
# Decoding settings; `_target_` names vllm.SamplingParams as the class to build.
# One completion per prompt at temperature 0.0, capped at 4096 new tokens.
sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  max_tokens: 4096
  # Empty-string entries were removed from this list: vLLM rejects an empty
  # stop string when validating SamplingParams.
  # NOTE(review): if an empty entry stood in for a real stop token, add that
  # token back explicitly.
  stop: [ "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>" ]
  top_p: 1.0
# Top-level aliases of the sampling values; they are interpolated into the
# output file name and the request settings below.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Passed on to the dataset below; presumably selects shard `split_id` out of
# `split_size` - confirm against the dataset class.
split_size: 8
split_id: 0
max_num_seqs: 32
# File-name suffix built from the two split values, e.g. `0-of-8` with the values above.
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/${eval_sub_path}/math/math.test.v1.1.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
# Same path with an extra `l` appended, i.e. `...json` becomes `...jsonl`.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset construction; each `_target_` names the Python callable to build.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
aligner:
_target_: data.math.math_gold_answer_extractor
kv_mapping:
instruction: question
# `{instruction}` and `{question}` are placeholders; the instruction text is
# the fixed preamble on the next key.
template: "{instruction}{question}\n\n### Response:"
instruction: "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request settings for the completions endpoint on the local `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
# Records are keyed by `idx` here and in `post_process` below.
index_field: "idx"
save_best: False
step:
# `exp_name` is left empty but is part of `output_dir`; presumably supplied
# as a command-line override - confirm.
exp_name:
exp_notes:
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 16
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing stage; `_target_` names the callback to build.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
index_field: "idx"
label_field: "label"
saved_keys: [ "question", "output" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are left empty; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mbpp_sanitized/r2c/test_3shot_v2_0.yaml
================================================
# vLLM evaluation config for sanitized MBPP using the 3-shot r2c prompt (v2.0).
# Composition order: the `hydra: default` group first, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Adds the local `conf/` directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Single root directory; the three prefixes below are derived from it.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share
model_path_prefix: ${mount_dir}/share/models
output_path_prefix: ${mount_dir}/reward_modeling/
# A dataset name rather than a path; dev and test reuse the same value.
train_file: "mbpp"
dev_file: ${train_file}
test_file: ${train_file}
# Port interpolated into the request URL (`api_url`).
port: 6000
# Model name sent with each request (see `service_processor.model`).
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class to build.
# One completion per prompt at temperature 0.0, capped at 4096 new tokens.
sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # Each stop marker is its own list entry. "<|eot_id|>" and "<|EOT|>" used to
  # be fused into the single string "<|eot_id|>, <|EOT|>", which only matches
  # that exact text, so neither marker could end generation.
  # The empty-string entry was removed: vLLM rejects an empty stop string.
  # NOTE(review): if the empty entry stood in for a real stop token, add that
  # token back explicitly.
  stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096
# Top-level aliases of the sampling values, used in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32
output_file: ${output_dir}/mbpp_257/${eval_sub_path}/test.3shot.tem${tem}.n${n}.v2.0.json
# Same path with an extra `l` appended, i.e. `...json` becomes `...jsonl`.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Hand-written chat markers; prefix and connector are placed around the prompt
# by `composition` below. `chat_suffix` is not referenced elsewhere in this file.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt text comes from a file; `_target_` names the reader callable.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/mbpp/r2c_prompt_3shot_v2.0.txt
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.human_eval.MBPPReader
sanitized: True
# The template is assembled from named units in the order given by `composition`.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
composition: "{chat_prefix}{prompt}{chat_connect}"
instruction:
index_field: "task_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for the completions endpoint on the local `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# Left empty but part of `output_dir`; presumably supplied as a command-line
# override - confirm.
exp_name:
save_best: False
# Left empty but part of `output_file`; presumably supplied as a command-line
# override - confirm.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing stage; each `_target_` names the callable to build.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
# name: standard
name: tag
index_field: task_id
test_case_field: "test_list"
resume: True
evaluator:
_target_: post_processors.code.evaluator.MBPPEvaluator
saved_keys: [ "prompt" ]
num_workers: 16
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are left empty; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mbpp_sanitized/r2c/test_v1_0.yaml
================================================
# vLLM evaluation config for sanitized MBPP using the 0-shot r2c prompt (v1.0).
# Composition order: the `hydra: default` group first, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Adds the local `conf/` directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Single root directory; the three prefixes below are derived from it.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share
model_path_prefix: ${mount_dir}/share/models
output_path_prefix: ${mount_dir}/reward_modeling/
# A dataset name rather than a path; dev and test reuse the same value.
train_file: "mbpp"
dev_file: ${train_file}
test_file: ${train_file}
# Port interpolated into the request URL (`api_url`).
port: 6000
# Model name sent with each request (see `service_processor.model`).
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class to build.
# One completion per prompt at temperature 0.0, capped at 4096 new tokens.
sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # Each stop marker is its own list entry. "<|eot_id|>" and "<|EOT|>" used to
  # be fused into the single string "<|eot_id|>, <|EOT|>", which only matches
  # that exact text, so neither marker could end generation.
  # The empty-string entry was removed: vLLM rejects an empty stop string.
  # NOTE(review): if the empty entry stood in for a real stop token, add that
  # token back explicitly.
  stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096
# Top-level aliases of the sampling values, used in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32
output_file: ${output_dir}/mbpp_257/${eval_sub_path}/test.0shot.tem${tem}.n${n}.v1.0.json
# Same path with an extra `l` appended, i.e. `...json` becomes `...jsonl`.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Hand-written chat markers; prefix and connector are placed around the prompt
# by `composition` below. `chat_suffix` is not referenced elsewhere in this file.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt text comes from a file; `_target_` names the reader callable.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/mbpp/r2c_prompt_0shot_v1.0.txt
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.human_eval.MBPPReader
sanitized: True
# The template is assembled from named units in the order given by `composition`.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
composition: "{chat_prefix}{prompt}{chat_connect}"
instruction:
index_field: "task_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for the completions endpoint on the local `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# Left empty but part of `output_dir`; presumably supplied as a command-line
# override - confirm.
exp_name:
save_best: False
# Left empty but part of `output_file`; presumably supplied as a command-line
# override - confirm.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing stage; each `_target_` names the callable to build.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
# name: standard
name: tag
index_field: task_id
test_case_field: "test_list"
resume: True
evaluator:
_target_: post_processors.code.evaluator.MBPPEvaluator
saved_keys: [ "prompt" ]
num_workers: 16
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are left empty; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mbpp_sanitized/r2c/test_v1_0_local.yaml
================================================
# Local variant of the 0-shot r2c (v1.0) sanitized-MBPP config: no mount
# prefixes, outputs go to a relative model directory (see `output_dir`).
# Composition order: the `hydra: default` group first, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Adds the local `conf/` directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# A dataset name rather than a path; dev and test reuse the same value.
train_file: "mbpp"
dev_file: ${train_file}
test_file: ${train_file}
# Port interpolated into the request URL (`api_url`).
port: 6000
# Model name sent with each request (see `service_processor.model`).
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class to build.
# One completion per prompt at temperature 0.0, capped at 4096 new tokens.
sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # Each stop marker is its own list entry. "<|eot_id|>" and "<|EOT|>" used to
  # be fused into the single string "<|eot_id|>, <|EOT|>", which only matches
  # that exact text, so neither marker could end generation.
  # The empty-string entry was removed: vLLM rejects an empty stop string.
  # NOTE(review): if the empty entry stood in for a real stop token, add that
  # token back explicitly.
  stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096
# Top-level aliases of the sampling values, used in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32
output_file: ${output_dir}/mbpp_257/${eval_sub_path}/test.0shot.tem${tem}.n${n}.v1.0.json
# Same path with an extra `l` appended, i.e. `...json` becomes `...jsonl`.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Hand-written chat markers; prefix and connector are placed around the prompt
# by `composition` below. `chat_suffix` is not referenced elsewhere in this file.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt text comes from a file; `_target_` names the reader callable.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/mbpp/r2c_prompt_0shot_v1.0.txt
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.human_eval.MBPPReader
sanitized: True
# The template is assembled from named units in the order given by `composition`.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
composition: "{chat_prefix}{prompt}{chat_connect}"
instruction:
index_field: "task_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for the completions endpoint on the local `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
save_best: False
# NOTE(review): left empty (null) yet interpolated into `output_file`; an
# empty string may be what is intended - confirm.
eval_sub_path:
# Relative path; resolves against the working directory of the run.
output_dir: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing stage; each `_target_` names the callable to build.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
# name: standard
name: tag
index_field: task_id
test_case_field: "test_list"
evaluator:
_target_: post_processors.code.evaluator.MBPPEvaluator
saved_keys: [ "prompt" ]
num_workers: 16
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are left empty; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mbpp_sanitized/test_3shot_v1_0.yaml
================================================
# vLLM evaluation config for sanitized MBPP using the 3-shot prompt (v1.0).
# Composition order: the `hydra: default` group first, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Adds the local `conf/` directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Single root directory; the three prefixes below are derived from it.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share
model_path_prefix: ${mount_dir}/share/models
output_path_prefix: ${mount_dir}/reward_modeling/
# A dataset name rather than a path; dev and test reuse the same value.
train_file: "mbpp"
dev_file: ${train_file}
test_file: ${train_file}
# Port interpolated into the request URL (`api_url`).
port: 6000
# Model name sent with each request (see `service_processor.model`).
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class to build.
# One completion per prompt at temperature 0.0, capped at 4096 new tokens.
sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # Each stop marker is its own list entry. "<|eot_id|>" and "<|EOT|>" used to
  # be fused into the single string "<|eot_id|>, <|EOT|>", which only matches
  # that exact text, so neither marker could end generation.
  # The empty-string entry was removed: vLLM rejects an empty stop string.
  # NOTE(review): if the empty entry stood in for a real stop token, add that
  # token back explicitly.
  stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096
# Top-level aliases of the sampling values, used in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32
output_file: ${output_dir}/mbpp_257/${eval_sub_path}/test.3shot.tem${tem}.n${n}.v1.0.json
# Same path with an extra `l` appended, i.e. `...json` becomes `...jsonl`.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Hand-written chat markers; prefix and connector are placed around the prompt
# by `composition` below. `chat_suffix` is not referenced elsewhere in this file.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt text comes from a file; `_target_` names the reader callable.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/mbpp/r2c_prompt_3shot_v1.0.txt
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.human_eval.MBPPReader
sanitized: True
# The template is assembled from named units in the order given by `composition`.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
composition: "{chat_prefix}{prompt}{chat_connect}"
instruction:
index_field: "task_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for the completions endpoint on the local `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
flush_file: ${flush_file}
# Left empty but part of `output_dir`; presumably supplied as a command-line
# override - confirm.
exp_name:
save_best: False
# Left empty but part of `output_file`; presumably supplied as a command-line
# override - confirm.
eval_sub_path:
output_dir: ${output_path_prefix}experiments/${exp_name}/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing stage; each `_target_` names the callable to build.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
# name: standard
name: tag
index_field: task_id
test_case_field: "test_list"
resume: True
evaluator:
_target_: post_processors.code.evaluator.MBPPEvaluator
saved_keys: [ "prompt" ]
num_workers: 16
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are left empty; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mbpp_sanitized/test_3shot_v1_0_local.yaml
================================================
# Local variant of the 3-shot (v1.0) sanitized-MBPP config: no mount
# prefixes, outputs go to a relative model directory (see `output_dir`).
# Composition order: the `hydra: default` group first, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Adds the local `conf/` directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# A dataset name rather than a path; dev and test reuse the same value.
train_file: "mbpp"
dev_file: ${train_file}
test_file: ${train_file}
# Port interpolated into the request URL (`api_url`).
port: 6000
# Model name sent with each request (see `service_processor.model`).
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class to build.
# One completion per prompt at temperature 0.0, capped at 4096 new tokens.
sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # Each stop marker is its own list entry. "<|eot_id|>" and "<|EOT|>" used to
  # be fused into the single string "<|eot_id|>, <|EOT|>", which only matches
  # that exact text, so neither marker could end generation.
  # The empty-string entry was removed: vLLM rejects an empty stop string.
  # NOTE(review): if the empty entry stood in for a real stop token, add that
  # token back explicitly.
  stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096
# Top-level aliases of the sampling values, used in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32
output_file: ${output_dir}/mbpp_257/${eval_sub_path}/test.3shot.tem${tem}.n${n}.v1.0.json
# Same path with an extra `l` appended, i.e. `...json` becomes `...jsonl`.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Hand-written chat markers; prefix and connector are placed around the prompt
# by `composition` below. `chat_suffix` is not referenced elsewhere in this file.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt text comes from a file; `_target_` names the reader callable.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/mbpp/r2c_prompt_3shot_v1.0.txt
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.human_eval.MBPPReader
sanitized: True
# The template is assembled from named units in the order given by `composition`.
template:
_target_: data.input_utils.compose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
composition: "{chat_prefix}{prompt}{chat_connect}"
instruction:
index_field: "task_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for the completions endpoint on the local `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
save_best: False
# Explicit empty string, so the `${eval_sub_path}` segment of `output_file` is empty.
eval_sub_path: ""
# Relative path; resolves against the working directory of the run.
output_dir: ../msranlpintern/share/models/deepseek-coder-7b-instruct-v1.5/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing stage; each `_target_` names the callable to build.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
# name: standard
name: tag
index_field: task_id
test_case_field: "test_list"
evaluator:
_target_: post_processors.code.evaluator.MBPPEvaluator
saved_keys: [ "prompt" ]
num_workers: 16
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are left empty; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mbpp_sanitized/test_v1_0_local.yaml
================================================
# Local 0-shot (v1.0) sanitized-MBPP config with the prompt written inline.
# Composition order: the `hydra: default` group first, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Adds the local `conf/` directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# A dataset name rather than a path; dev and test reuse the same value.
train_file: "mbpp"
dev_file: ${train_file}
test_file: ${train_file}
# Port interpolated into the request URL (`api_url`).
port: 6000
# Model name sent with each request (see `service_processor.model`).
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class to build.
# One completion per prompt at temperature 0.0, capped at 4096 new tokens.
sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # Each stop marker is its own list entry. "<|eot_id|>" and "<|EOT|>" used to
  # be fused into the single string "<|eot_id|>, <|EOT|>", which only matches
  # that exact text, so neither marker could end generation.
  # The empty-string entry was removed: vLLM rejects an empty stop string.
  # NOTE(review): if the empty entry stood in for a real stop token, add that
  # token back explicitly.
  stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096
# Top-level aliases of the sampling values, used in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32
output_file: ${output_dir}/mbpp_257/${eval_sub_path}/test.0shot.tem${tem}.n${n}.v1.0.json
# Same path with an extra `l` appended, i.e. `...json` becomes `...jsonl`.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Hand-written chat markers; prefix and connector are concatenated around the
# prompt in `template` below. `chat_suffix` is not referenced elsewhere in this file.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Inline prompt with `{prompt}` and `{test_list}` placeholders.
# NOTE(review): "pass test test cases" looks like a typo; left untouched
# because editing it changes the prompt text sent to the model.
prompt: "You are an expert Python programmer, and here is your task: {prompt}\n\nYour code should pass test test cases:\n\n{test_list}"
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.human_eval.MBPPReader
sanitized: True
# Plain string concatenation of the three values defined above.
template: ${chat_prefix}${prompt}${chat_connect}
instruction:
index_field: "task_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for the completions endpoint on the local `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
save_best: False
# NOTE(review): left empty (null) yet interpolated into `output_file`; an
# empty string may be what is intended - confirm.
eval_sub_path:
# Relative path; resolves against the working directory of the run.
output_dir: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing stage; each `_target_` names the callable to build.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
name: standard
index_field: task_id
test_case_field: "test_list"
evaluator:
_target_: post_processors.code.evaluator.MBPPEvaluator
saved_keys: [ "prompt" ]
num_workers: 16
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are left empty; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mbpp_sanitized/test_v1_1_local.yaml
================================================
# Local 0-shot (v1.1) sanitized-MBPP config with the prompt written inline.
# Composition order: the `hydra: default` group first, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Adds the local `conf/` directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# A dataset name rather than a path; dev and test reuse the same value.
train_file: "mbpp"
dev_file: ${train_file}
test_file: ${train_file}
# Port interpolated into the request URL (`api_url`).
port: 6000
# Model name sent with each request (see `service_processor.model`).
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class to build.
# One completion per prompt at temperature 0.0, capped at 4096 new tokens.
sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # Each stop marker is its own list entry. "<|eot_id|>" and "<|EOT|>" used to
  # be fused into the single string "<|eot_id|>, <|EOT|>", which only matches
  # that exact text, so neither marker could end generation.
  # The empty-string entry was removed: vLLM rejects an empty stop string.
  # NOTE(review): if the empty entry stood in for a real stop token, add that
  # token back explicitly.
  stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096
# Top-level aliases of the sampling values, used in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32
output_file: ${output_dir}/mbpp_257/${eval_sub_path}/test.0shot.tem${tem}.n${n}.v1.1.json
# Same path with an extra `l` appended, i.e. `...json` becomes `...jsonl`.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Hand-written chat markers; prefix and connector are concatenated around the
# prompt in `template` below. `chat_suffix` is not referenced elsewhere in this file.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Inline prompt with `{prompt}` and `{test_list}` placeholders.
prompt: "{prompt}\nYour code should pass the following test cases:\n\n{test_list}"
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.human_eval.MBPPReader
sanitized: True
# Plain string concatenation of the three values defined above.
template: ${chat_prefix}${prompt}${chat_connect}
instruction:
index_field: "task_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for the completions endpoint on the local `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
save_best: False
# NOTE(review): left empty (null) yet interpolated into `output_file`; an
# empty string may be what is intended - confirm.
eval_sub_path:
# Relative path; resolves against the working directory of the run.
output_dir: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing stage; each `_target_` names the callable to build.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
name: standard
index_field: task_id
test_case_field: "test_list"
evaluator:
_target_: post_processors.code.evaluator.MBPPEvaluator
saved_keys: [ "prompt" ]
num_workers: 16
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are left empty; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mbpp_sanitized/test_v2_0_local.yaml
================================================
# Local 0-shot (v2.0) sanitized-MBPP config; sets `apply_chat_template: True`
# and leaves the hand-written chat markers commented out.
# Composition order: the `hydra: default` group first, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Adds the local `conf/` directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# A dataset name rather than a path; dev and test reuse the same value.
train_file: "mbpp"
dev_file: ${train_file}
test_file: ${train_file}
# Port interpolated into the request URL (`api_url`).
port: 6000
# Model name sent with each request (see `service_processor.model`).
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class to build.
# One completion per prompt at temperature 0.0, capped at 4096 new tokens.
sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # Each stop marker is its own list entry. "<|eot_id|>" and "<|EOT|>" used to
  # be fused into the single string "<|eot_id|>, <|EOT|>", which only matches
  # that exact text, so neither marker could end generation.
  # The empty-string entry was removed: vLLM rejects an empty stop string.
  # NOTE(review): if the empty entry stood in for a real stop token, add that
  # token back explicitly.
  stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096
# Top-level aliases of the sampling values, used in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32
output_file: ${output_dir}/mbpp_257/${eval_sub_path}/test.0shot.tem${tem}.n${n}.v2.0.json
# Same path with an extra `l` appended, i.e. `...json` becomes `...jsonl`.
flush_file: ${output_file}l
# Chat templating is switched on here; the hand-written chat markers are
# commented out and `template` below is the bare prompt.
apply_chat_template: True
add_generation_prompt: True
#chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
#chat_connect: "\n### Response:\n"
#chat_suffix: "\n<|EOT|>"
# Inline prompt with `{prompt}` and `{test_list}` placeholders.
prompt: "{prompt}\nYour code should pass the following test cases:\n\n{test_list}"
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.human_eval.MBPPReader
sanitized: True
template: ${prompt}
instruction:
index_field: "task_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for the completions endpoint on the local `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
save_best: False
# NOTE(review): left empty (null) yet interpolated into `output_file`; an
# empty string may be what is intended - confirm.
eval_sub_path:
# Relative path; resolves against the working directory of the run.
output_dir: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing stage; each `_target_` names the callable to build.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
name: standard
index_field: task_id
test_case_field: "test_list"
evaluator:
_target_: post_processors.code.evaluator.MBPPEvaluator
saved_keys: [ "prompt" ]
num_workers: 16
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are left empty; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mbpp_sanitized/test_v2_1_local.yaml
================================================
# Local 0-shot (v2.1) sanitized-MBPP config; sets `apply_chat_template: True`
# and leaves the hand-written chat markers commented out.
# Composition order: the `hydra: default` group first, then this file (`_self_`).
defaults:
- hydra: default
- _self_
# Adds the local `conf/` directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# A dataset name rather than a path; dev and test reuse the same value.
train_file: "mbpp"
dev_file: ${train_file}
test_file: ${train_file}
# Port interpolated into the request URL (`api_url`).
port: 6000
# Model name sent with each request (see `service_processor.model`).
model: ds-coder-v1.5-chat
# Decoding settings; `_target_` names vllm.SamplingParams as the class to build.
# One completion per prompt at temperature 0.0, capped at 4096 new tokens.
sampling_params:
  _target_: vllm.SamplingParams
  n: 1
  temperature: 0.0
  # Each stop marker is its own list entry. "<|eot_id|>" and "<|EOT|>" used to
  # be fused into the single string "<|eot_id|>, <|EOT|>", which only matches
  # that exact text, so neither marker could end generation.
  # The empty-string entry was removed: vLLM rejects an empty stop string.
  # NOTE(review): if the empty entry stood in for a real stop token, add that
  # token back explicitly.
  stop: [ "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
  max_tokens: 4096
# Top-level aliases of the sampling values, used in the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32
output_file: ${output_dir}/mbpp_257/${eval_sub_path}/test.0shot.tem${tem}.n${n}.v2.1.json
# Same path with an extra `l` appended, i.e. `...json` becomes `...jsonl`.
flush_file: ${output_file}l
# Chat templating is switched on here; the hand-written chat markers are
# commented out and `template` below is the bare prompt.
apply_chat_template: True
add_generation_prompt: True
#chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
#chat_connect: "\n### Response:\n"
#chat_suffix: "\n<|EOT|>"
# Inline prompt with `{prompt}` and `{test_list}` placeholders.
prompt: "You are an expert Python programmer, and here is your task:\n\n{prompt}\n\nYour code should pass the following test cases:\n\n{test_list}\n\nPlease finish this Python function:"
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.human_eval.MBPPReader
sanitized: True
template: ${prompt}
instruction:
index_field: "task_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request settings for the completions endpoint on the local `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
save_best: False
# NOTE(review): left empty (null) yet interpolated into `output_file`; an
# empty string may be what is intended - confirm.
eval_sub_path:
# Relative path; resolves against the working directory of the run.
output_dir: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing stage; each `_target_` names the callable to build.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
name: standard
index_field: task_id
test_case_field: "test_list"
evaluator:
_target_: post_processors.code.evaluator.MBPPEvaluator
saved_keys: [ "prompt" ]
num_workers: 16
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
fp16: True
fp16_bfloat16: True
n_gpu: 1
# The remaining keys are left empty; presumably filled in at runtime - confirm.
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mbpp_sanitized/test_v2_2_local.yaml
================================================
# Hydra config for an MBPP test run; composes the default hydra group with this file (_self_).
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# dev_file and test_file both resolve to train_file through interpolation.
train_file: "mbpp"
dev_file: ${train_file}
test_file: ${train_file}
# `port` is interpolated into service_processor.api_url; `model` into service_processor.model.
port: 6000
model: ds-coder-v1.5-chat
# Decoding settings; the top-level `tem`/`n` aliases and service_processor interpolate from here.
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
# "<|eot_id|>" and "<|EOT|>" are listed as two separate stop strings. A single fused literal
# "<|eot_id|>, <|EOT|>" would only match that exact comma-joined text, so neither token
# would ever stop generation.
# NOTE(review): the leading "" entry looks like a stop token lost during text extraction
# (possibly "</s>") — confirm against the upstream file; an empty stop string is not useful.
stop: [ "", "\n\n\n\n", "Context:\n", "Thought 42:", "<|end_of_text|>", "<|eot_id|>", "<|EOT|>" ]
max_tokens: 4096
# Top-level aliases of the sampling values; used below to build the output file name.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
split_size: -1
split_id: 0
max_num_seqs: 32
output_file: ${output_dir}/mbpp_257/${eval_sub_path}/test.0shot.tem${tem}.n${n}.v2.2.json
# Appends "l" to output_file, so the "*.json" path above becomes "*.jsonl".
flush_file: ${output_file}l
apply_chat_template: True
add_generation_prompt: True
# The three chat_* keys below are disabled (commented out) in this config.
#chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
#chat_connect: "\n### Response:\n"
#chat_suffix: "\n<|EOT|>"
# Prompt template, passed to read_tensor as `template`. This variant asks for the answer
# inside a ```python fenced block. {prompt} and {test_list} are placeholders — presumably
# filled per example by the dataset/reader; verify in MBPPReader.
prompt: "You are an expert Python programmer, and here is your task:\n\n{prompt}\n\nYour code should pass the following test cases:\n\n{test_list}\n\nPlease put your code in code block\n```python\n...\n```\n"
# Data loading
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.human_eval.MBPPReader
sanitized: True
template: ${prompt}
instruction:
index_field: "task_id"
service_based: False
split_size: ${split_size}
split_id: ${split_id}
# Request generator pointed at the completions endpoint on the configured port.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: 4096
model: ${model}
stop: ${sampling_params.stop}
temperature: ${sampling_params.temperature}
n: ${sampling_params.n}
max_data_num: -1
save_best: False
# eval_sub_path has no value here (null); it is interpolated into output_file.
eval_sub_path:
output_dir: ../pretrained-models/deepseek-coder-7b-instruct-v1.5/
# Dataloader
num_workers: 32
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing: code extraction followed by MBPP evaluation of the extracted code.
post_process:
_target_: post_processors.code.code.CodeExtractor
output_file: ${output_file}
answer_clean:
_target_: post_processors.code.clean.get
# Cleaner name differs from the sibling v2.1 config, which uses "standard".
name: standard_default
index_field: task_id
test_case_field: "test_list"
evaluator:
_target_: post_processors.code.evaluator.MBPPEvaluator
saved_keys: [ "prompt" ]
num_workers: 16
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
# (the value-less keys below are null in this file; presumably populated at runtime — confirm)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mwp-bench/deepseek_test_0shot_v1_1.yaml
================================================
# Hydra config: zero-shot MWPBench generation through a vLLM completions endpoint,
# using a plain "User: ... Assistant:" prompt (no chat template).
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Path roots; the data/model/output prefixes are all derived from mount_dir.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
train_file:
dev_file:
test_file: ${data_path_prefix}/MWPBench/data/train_wo_gsm_2k.json
port: 6000
# `model` has no value here (null) and must be supplied before service_processor.model is usable.
model:
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the "" entry looks like a stop token lost during text extraction — confirm upstream.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>" ]
top_p: 1.0
# Top-level aliases of the sampling values; used in output_file and in service_processor.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
split_size: 8
split_id: 0
max_num_seqs: 32
suffix: ${split_id}-of-${split_size}
# NOTE(review): the output name carries "v1.0" while this config file is named *_v1_1 — confirm intent.
output_file: ${output_dir}/mwpbench/${eval_sub_path}/train_wo_gsm.2k.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
# Appends "l" to output_file, so the "*.json" path above becomes "*.jsonl".
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
template: "User: {question}\nPlease reason step by step, and put your final answer within {instruction}.\n\nAssistant:"
instruction: "\\boxed{}" # Hack: "\boxed{}" is injected via {instruction} because literal {} in the template would raise an error.
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator pointed at the completions endpoint on the configured port.
# Unlike sibling mwp-bench configs, no flush_file is passed here.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: question_number
save_best: False
step:
exp_name:
exp_notes:
# exp_name is null in this file, so output_dir only resolves usefully once it is overridden.
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 48
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: False
index_field: "question_number"
label_field: "answer"
saved_keys: [ "question", "question_number", "data_source", "answer", "data_topic"]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
# (the value-less keys below are null in this file; presumably populated at runtime — confirm)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mwp-bench/llama_base/college_math_test_4shot_v1_0.yaml
================================================
# Hydra config: few-shot MWPBench generation restricted to the "college_math" topic,
# with the few-shot prefix read from prompts/math/college_math_4shot.txt.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Path roots; the data/model/output prefixes are all derived from mount_dir.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
train_file:
dev_file:
test_file: ${data_path_prefix}/MWPBench/data/full_test.json
port: 6000
# `model` has no value here (null) and must be supplied before service_processor.model is usable.
model:
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 8192
# "## Question" is a stop string here — presumably the heading that starts each few-shot example
# (verify against the prompt file).
# NOTE(review): the "" entries look like stop tokens lost during text extraction — confirm upstream.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>", "<|eot_id|>", "## Question" ]
top_p: 1.0
# Top-level aliases of the sampling values; used in output_file and in service_processor.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
split_size: 1
split_id: 0
max_num_seqs: 8
global_batch_size: 256
max_model_len: 8192
suffix: ${split_id}-of-${split_size}
# NOTE(review): the name says "0shot" although this config composes a 4-shot prompt — confirm intent.
output_file: ${output_dir}/mwpbench/${eval_sub_path}/college_math.test.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
# Appends "l" to output_file, so the "*.json" path above becomes "*.jsonl".
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Few-shot prefix text, loaded from a prompt file and reused in the template below.
few_shot_prompt:
_target_: data.input_utils.read_text
file_path: prompts/math/college_math_4shot.txt
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.input_utils.jsonl_read_fn
# Filter on data_topic with value "college_math" (starts_with_filter).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.starts_with_filter
key: data_topic
value: "college_math"
# Template = few-shot prefix followed by the question and a "## Response" header.
template:
_target_: data.input_utils.compose_template
units:
few_shot_prompt: ${few_shot_prompt}
prompt: "{question}\n\n## Response"
composition: "{few_shot_prompt}{prompt}"
replacement:
"{question}": "question"
instruction:
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator pointed at the completions endpoint on the configured port.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: question_number
flush_file: ${flush_file}
save_best: False
step:
exp_name:
exp_notes:
# exp_name is null in this file, so output_dir only resolves usefully once it is overridden.
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "question_number"
label_field: "answer"
saved_keys: [ "question", "question_number", "data_source", "answer", "data_topic" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
# (the value-less keys below are null in this file; presumably populated at runtime — confirm)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mwp-bench/llama_chat/dev_0shot_v1_0.yaml
================================================
# Hydra config: zero-shot MWPBench generation on train_wo_gsm_2k.json; the chat header
# tokens are written directly into the template (apply_chat_template is False).
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Path roots; the data/model/output prefixes are all derived from mount_dir.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
train_file:
dev_file:
test_file: ${data_path_prefix}/MWPBench/data/train_wo_gsm_2k.json
port: 6000
# `model` has no value here (null) and must be supplied before service_processor.model is usable.
model:
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the "" entries look like stop tokens lost during text extraction — confirm upstream.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>", "<|eot_id|>" ]
top_p: 1.0
# Top-level aliases of the sampling values; used in output_file and in service_processor.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
split_size: 8
split_id: 0
max_num_seqs: 32
global_batch_size: 256
suffix: ${split_id}-of-${split_size}
#output_file: ${output_dir}/mathscale/${eval_sub_path}/train.v60.300k.2-of-30.v1.2.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
output_file: ${output_dir}/mwpbench/${eval_sub_path}/train_wo_gsm.2k.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json # Align the implementation with Geyang: https://github.com/XingxingZhang/math_step/blob/iter_dev/dataset/pseudo/utils.py#L282-L328
# Appends "l" to output_file, so the "*.json" path above becomes "*.jsonl".
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
template: "<|start_header_id|>user<|end_header_id|>\n\n{question}\n\nPlease put your final answer within {instruction}.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
# "\boxed{}" is supplied through {instruction} rather than written inline in the template.
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator pointed at the completions endpoint on the configured port.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: question_number
flush_file: ${flush_file}
save_best: False
step:
exp_name:
exp_notes:
# exp_name is null in this file, so output_dir only resolves usefully once it is overridden.
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "question_number"
label_field: "answer"
saved_keys: [ "question", "question_number", "data_source", "answer", "data_topic"]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
# (the value-less keys below are null in this file; presumably populated at runtime — confirm)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mwp-bench/llama_chat/math_test_0shot_v1_0.yaml
================================================
# Hydra config: zero-shot generation on MWPBench full_test.json, filtered to the "MATH" topic;
# the chat header tokens are written directly into the template (apply_chat_template is False).
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Path roots; the data/model/output prefixes are all derived from mount_dir.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
train_file:
dev_file:
test_file: ${data_path_prefix}/MWPBench/data/full_test.json
port: 6000
# `model` has no value here (null) and must be supplied before service_processor.model is usable.
model:
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the "" entries look like stop tokens lost during text extraction — confirm upstream.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>", "<|eot_id|>" ]
top_p: 1.0
# Top-level aliases of the sampling values; used in output_file and in service_processor.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
split_size: 8
split_id: 0
max_num_seqs: 32
global_batch_size: 256
max_model_len: 4096
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/mwpbench/${eval_sub_path}/math.test.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json # Align the implementation with Geyang: https://github.com/XingxingZhang/math_step/blob/iter_dev/dataset/pseudo/utils.py#L282-L328
# Appends "l" to output_file, so the "*.json" path above becomes "*.jsonl".
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.input_utils.jsonl_read_fn
# Filter on data_topic with value "MATH" (starts_with_filter).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.starts_with_filter
key: data_topic
value: "MATH"
template: "<|start_header_id|>user<|end_header_id|>\n\n{question}\n\nPlease put your final answer within {instruction}.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
# "\boxed{}" is supplied through {instruction} rather than written inline in the template.
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator pointed at the completions endpoint on the configured port.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: question_number
flush_file: ${flush_file}
save_best: False
step:
exp_name:
exp_notes:
# exp_name is null in this file, so output_dir only resolves usefully once it is overridden.
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "question_number"
label_field: "answer"
saved_keys: [ "question", "question_number", "data_source", "answer", "data_topic" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
# (the value-less keys below are null in this file; presumably populated at runtime — confirm)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mwp-bench/llama_chat/math_test_0shot_v3_0.yaml
================================================
# Hydra config: zero-shot generation on MWPBench full_test.json, filtered to the "MATH" topic,
# with a system prompt read from prompts/math/long_cot_sys_prompt_v1.0.txt and a 12288-token budget.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Path roots; the data/model/output prefixes are all derived from mount_dir.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
train_file:
dev_file:
test_file: ${data_path_prefix}/MWPBench/data/full_test.json
port: 6000
# `model` has no value here (null) and must be supplied before service_processor.model is usable.
model:
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 12288
# NOTE(review): the "" entries look like stop tokens lost during text extraction — confirm upstream.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>", "<|eot_id|>" ]
top_p: 1.0
# Top-level aliases of the sampling values; used in output_file and in service_processor.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
split_size: 8
split_id: 0
max_num_seqs: 64
global_batch_size: 256
max_model_len: 12288
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/mwpbench/${eval_sub_path}/math.test.v3.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json # Align the implementation with Geyang: https://github.com/XingxingZhang/math_step/blob/iter_dev/dataset/pseudo/utils.py#L282-L328
# Appends "l" to output_file, so the "*.json" path above becomes "*.jsonl".
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# System prompt text, loaded from a prompt file and reused in the template below.
system_prompt:
_target_: data.input_utils.read_text
file_path: prompts/math/long_cot_sys_prompt_v1.0.txt
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.input_utils.jsonl_read_fn
# Filter on data_topic with value "MATH" (starts_with_filter).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.starts_with_filter
key: data_topic
value: "MATH"
# Template = system header + system prompt + user turn + assistant header, in that order.
template:
_target_: data.input_utils.compose_template
units:
system_prompt: ${system_prompt}
system_prompt_prefix: "<|start_header_id|>system<|end_header_id|>\n\n"
prompt: "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{question}\n\nPlease put your final answer within {instruction}.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
composition: "{system_prompt_prefix}{system_prompt}{prompt}"
# "\boxed{}" is supplied through {instruction} rather than written inline in the template.
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator pointed at the completions endpoint on the configured port.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: question_number
flush_file: ${flush_file}
save_best: False
step:
exp_name:
exp_notes:
# exp_name is null in this file, so output_dir only resolves usefully once it is overridden.
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "question_number"
label_field: "answer"
saved_keys: [ "question", "question_number", "data_source", "answer", "data_topic" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
# (the value-less keys below are null in this file; presumably populated at runtime — confirm)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mwp-bench/llama_chat/test_0shot_v1_0.yaml
================================================
# Hydra config: zero-shot generation on MWPBench full_test.json with no topic filter configured;
# the chat header tokens are written directly into the template (apply_chat_template is False).
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Path roots; the data/model/output prefixes are all derived from mount_dir.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
train_file:
dev_file:
test_file: ${data_path_prefix}/MWPBench/data/full_test.json
port: 6000
# `model` has no value here (null) and must be supplied before service_processor.model is usable.
model:
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the "" entries look like stop tokens lost during text extraction — confirm upstream.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>", "<|eot_id|>" ]
top_p: 1.0
# Top-level aliases of the sampling values; used in output_file and in service_processor.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
split_size: 8
split_id: 0
max_num_seqs: 32
global_batch_size: 256
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/mwpbench/${eval_sub_path}/full_test.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json # Align the implementation with Geyang: https://github.com/XingxingZhang/math_step/blob/iter_dev/dataset/pseudo/utils.py#L282-L328
# Appends "l" to output_file, so the "*.json" path above becomes "*.jsonl".
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.input_utils.jsonl_read_fn
template: "<|start_header_id|>user<|end_header_id|>\n\n{question}\n\nPlease put your final answer within {instruction}.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
# "\boxed{}" is supplied through {instruction} rather than written inline in the template.
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator pointed at the completions endpoint on the configured port.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: question_number
flush_file: ${flush_file}
save_best: False
step:
exp_name:
exp_notes:
# exp_name is null in this file, so output_dir only resolves usefully once it is overridden.
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "question_number"
label_field: "answer"
saved_keys: [ "question", "question_number", "data_source", "answer", "data_topic" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
# (the value-less keys below are null in this file; presumably populated at runtime — confirm)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mwp-bench/mathstral_dev_0shot_self_correct_v1_0.yaml
================================================
# Hydra config: self-correction pass. The input is a previous prediction file (test_file is
# left empty here); the earlier response is renamed to prev_response and fed back into the
# prompt together with a request to correct it.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Path roots; the data/model/output prefixes are all derived from mount_dir.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
train_file:
dev_file:
test_file: # The prediction from last iteration
port: 6000
# `model` has no value here (null) and must be supplied before service_processor.model is usable.
model:
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the "" entries look like stop tokens lost during text extraction — confirm upstream.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>" ]
top_p: 1.0
# Top-level aliases of the sampling values; used in file_name and in service_processor.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
split_size: 8
split_id: 0
max_num_seqs: 32
global_batch_size: 256
suffix: ${split_id}-of-${split_size}
# `prefix` is an extra path segment between output_dir and "mwpbench"; empty by default.
prefix: ""
file_name: train_wo_gsm.2k.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
output_file: ${output_dir}/${prefix}/mwpbench/${eval_sub_path}/${file_name}
# Appends "l" to output_file, so the "*.json" path above becomes "*.jsonl".
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# Renames the previous round's fields (response/res/pred) to prev_* names.
aligner:
_target_: data.input_aligner.rename_field_aligner
kv_pair:
response: prev_response
res: prev_res
pred: prev_pred
template: "{text}{response}There might be an error in the solution above because of lack of understanding of the question. Please correct the error, if any, and rewrite the solution."
instruction:
# Maps template placeholders to record fields: {response} is taken from prev_response.
replacement:
"{text}": text
"{response}": prev_response
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator pointed at the completions endpoint on the configured port.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: question_number
flush_file: ${flush_file}
save_best: False
step:
exp_name:
exp_notes:
# exp_name is null in this file, so output_dir only resolves usefully once it is overridden.
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "question_number"
label_field: "answer"
# The prev_* fields produced by the aligner are kept alongside the usual keys.
saved_keys: [ "question", "question_number", "data_source", "answer", "data_topic", "prev_response", "prev_pred", "prev_res" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
# (the value-less keys below are null in this file; presumably populated at runtime — confirm)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mwp-bench/mathstral_dev_0shot_v1_0.yaml
================================================
# Hydra config: zero-shot MWPBench generation on train_wo_gsm_2k.json with a bare prompt
# (question plus a request to box the final answer; no chat markup in the template).
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Path roots; the data/model/output prefixes are all derived from mount_dir.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
train_file:
dev_file:
test_file: ${data_path_prefix}/MWPBench/data/train_wo_gsm_2k.json
port: 6000
# `model` has no value here (null) and must be supplied before service_processor.model is usable.
model:
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the "" entries look like stop tokens lost during text extraction — confirm upstream.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>" ]
top_p: 1.0
# Top-level aliases of the sampling values; used in output_file and in service_processor.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
split_size: 8
split_id: 0
max_num_seqs: 32
global_batch_size: 256
suffix: ${split_id}-of-${split_size}
#output_file: ${output_dir}/mathscale/${eval_sub_path}/train.v60.300k.2-of-30.v1.2.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
output_file: ${output_dir}/mwpbench/${eval_sub_path}/train_wo_gsm.2k.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json # Align the implementation with Geyang: https://github.com/XingxingZhang/math_step/blob/iter_dev/dataset/pseudo/utils.py#L282-L328
# Appends "l" to output_file, so the "*.json" path above becomes "*.jsonl".
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
template: "{question}\n\nPlease put your final answer within {instruction}."
# "\boxed{}" is supplied through {instruction} rather than written inline in the template.
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator pointed at the completions endpoint on the configured port.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: question_number
flush_file: ${flush_file}
save_best: False
step:
exp_name:
exp_notes:
# exp_name is null in this file, so output_dir only resolves usefully once it is overridden.
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "question_number"
label_field: "answer"
saved_keys: [ "question", "question_number", "data_source", "answer", "data_topic"]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
# (the value-less keys below are null in this file; presumably populated at runtime — confirm)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mwp-bench/mathstral_test_0shot_self_correct_v1_0.yaml
================================================
# Hydra config: self-correction pass writing full_test.* outputs. The input is a previous
# prediction file (test_file is left empty here); the earlier response is renamed to
# prev_response and fed back into the prompt together with a request to correct it.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Path roots; the data/model/output prefixes are all derived from mount_dir.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
train_file:
dev_file:
test_file: # The prediction from last iteration
port: 6000
# `model` has no value here (null) and must be supplied before service_processor.model is usable.
model:
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the "" entries look like stop tokens lost during text extraction — confirm upstream.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>" ]
top_p: 1.0
# Top-level aliases of the sampling values; used in file_name and in service_processor.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
split_size: 8
split_id: 0
max_num_seqs: 32
global_batch_size: 256
suffix: ${split_id}-of-${split_size}
# `prefix` is an extra path segment between output_dir and "mwpbench"; empty by default.
prefix: ""
file_name: full_test.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
output_file: ${output_dir}/${prefix}/mwpbench/${eval_sub_path}/${file_name}
# Appends "l" to output_file, so the "*.json" path above becomes "*.jsonl".
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# Renames the previous round's fields (response/res/pred) to prev_* names.
aligner:
_target_: data.input_aligner.rename_field_aligner
kv_pair:
response: prev_response
res: prev_res
pred: prev_pred
template: "{text}{response}There might be an error in the solution above because of lack of understanding of the question. Please correct the error, if any, and rewrite the solution."
instruction:
# Maps template placeholders to record fields: {response} is taken from prev_response.
replacement:
"{text}": text
"{response}": prev_response
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator pointed at the completions endpoint on the configured port.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: question_number
flush_file: ${flush_file}
save_best: False
step:
exp_name:
exp_notes:
# exp_name is null in this file, so output_dir only resolves usefully once it is overridden.
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "question_number"
label_field: "answer"
# The prev_* fields produced by the aligner are kept alongside the usual keys.
saved_keys: [ "question", "question_number", "data_source", "answer", "data_topic", "prev_response", "prev_pred", "prev_res" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
# (the value-less keys below are null in this file; presumably populated at runtime — confirm)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mwp-bench/mathstral_test_0shot_v1_0.yaml
================================================
# Hydra config: zero-shot generation on MWPBench full_test.json, filtered to the "MATH" topic,
# with a bare prompt (question plus a request to box the final answer).
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Path roots; the data/model/output prefixes are all derived from mount_dir.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
train_file:
dev_file:
test_file: ${data_path_prefix}/MWPBench/data/full_test.json
port: 6000
# `model` has no value here (null) and must be supplied before service_processor.model is usable.
model:
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the "" entries look like stop tokens lost during text extraction — confirm upstream.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>" ]
top_p: 1.0
# Top-level aliases of the sampling values; used in output_file and in service_processor.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
split_size: 8
split_id: 0
max_num_seqs: 32
global_batch_size: 256
suffix: ${split_id}-of-${split_size}
# The active output name is "math.*", matching the MATH-only filter below; the earlier
# "full_test.*" name is kept commented out.
#output_file: ${output_dir}/mwpbench/${eval_sub_path}/full_test.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json # Align the implementation with Geyang: https://github.com/XingxingZhang/math_step/blob/iter_dev/dataset/pseudo/utils.py#L282-L328
output_file: ${output_dir}/mwpbench/${eval_sub_path}/math.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
# Appends "l" to output_file, so the "*.json" path above becomes "*.jsonl".
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.input_utils.jsonl_read_fn
# Filter on data_topic with value "MATH" (starts_with_filter).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.starts_with_filter
key: data_topic
value: "MATH"
template: "{question}\n\nPlease put your final answer within {instruction}."
# "\boxed{}" is supplied through {instruction} rather than written inline in the template.
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator pointed at the completions endpoint on the configured port.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: question_number
flush_file: ${flush_file}
save_best: False
step:
exp_name:
exp_notes:
# exp_name is null in this file, so output_dir only resolves usefully once it is overridden.
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "question_number"
label_field: "answer"
saved_keys: [ "question", "question_number", "data_source", "answer", "data_topic" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
# (the value-less keys below are null in this file; presumably populated at runtime — confirm)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mwp-bench/mathstral_test_gaokao_2023_0shot_v1_0.yaml
================================================
# Hydra config (see `defaults` / `hydra.searchpath`) for 0-shot generation over
# the MWPBench `fresh_gaokao_math_2023.json` file using vLLM sampling parameters.
# Keys with no value (e.g. `model`, `exp_name`) are null in this file;
# presumably they are supplied as overrides at launch time — confirm.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Path roots; `test_file` and `output_dir` below interpolate from these.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
train_file:
dev_file:
test_file: ${data_path_prefix}/MWPBench/data/fresh_gaokao_math_2023.json
port: 6000
model:
# Decoding settings: one sample per prompt (n: 1) at temperature 0.0.
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the empty-string entries in `stop` look like tokens that were
# lost somewhere (possibly markup-like special tokens) — confirm the intended stop set.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>" ]
top_p: 1.0
# Short aliases of the sampling settings; reused in `output_file` and in the
# request-generator settings further down.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding controls: a single shard here (split 0 of 1); both values are
# forwarded to the dataset settings below and embedded in `suffix`.
split_size: 1
split_id: 0
max_num_seqs: 32
global_batch_size: 256
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/mwpbench/${eval_sub_path}/fresh_gaokao_math_2023.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json # Align the implementation with Geyang: https://github.com/XingxingZhang/math_step/blob/iter_dev/dataset/pseudo/utils.py#L282-L328
# `output_file` ends in `.json`, so appending `l` yields a `.jsonl` path.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset construction: a reader and a prompt template.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.input_utils.jsonl_read_fn
# `{instruction}` in the template is presumably filled with the `instruction`
# value below — confirm in the dataset class.
template: "{question}\n\nPlease put your final answer within {instruction}."
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator pointed at a completions endpoint on `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: question_number
flush_file: ${flush_file}
save_best: False
step:
exp_name:
exp_notes:
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the output path and the names of the
# index / label fields. `resume: True` presumably continues from existing
# output — confirm in `MathScaleCallBack`.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "question_number"
label_field: "answer"
saved_keys: [ "question", "question_number", "data_source", "answer", "data_topic" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
# (the valueless keys below are null here; presumably populated at runtime — confirm)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mwp-bench/mathstral_test_gsm8k_0shot_v1_0.yaml
================================================
# Hydra config (see `defaults` / `hydra.searchpath`) for 0-shot generation over
# the MWPBench `gsm8k-sub.jsonl` file using vLLM sampling parameters.
# Keys with no value (e.g. `model`, `exp_name`) are null in this file;
# presumably they are supplied as overrides at launch time — confirm.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Path roots; `test_file` and `output_dir` below interpolate from these.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
train_file:
dev_file:
test_file: ${data_path_prefix}/MWPBench/data/gsm8k-sub.jsonl
port: 6000
model:
# Decoding settings: one sample per prompt (n: 1) at temperature 0.0.
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the empty-string entries in `stop` look like tokens that were
# lost somewhere (possibly markup-like special tokens) — confirm the intended stop set.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>" ]
top_p: 1.0
# Short aliases of the sampling settings; reused in `output_file` and in the
# request-generator settings further down.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding controls: a single shard here (split 0 of 1); both values are
# forwarded to the dataset settings below and embedded in `suffix`.
split_size: 1
split_id: 0
max_num_seqs: 64
global_batch_size: 256
suffix: ${split_id}-of-${split_size}
# The output name also embeds `${seed}` (the `.s${seed}` component).
output_file: ${output_dir}/mwpbench/${eval_sub_path}/gsm8k.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.s${seed}.json # Align the implementation with Geyang: https://github.com/XingxingZhang/math_step/blob/iter_dev/dataset/pseudo/utils.py#L282-L328
# `output_file` ends in `.json`, so appending `l` yields a `.jsonl` path.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset construction: a reader and a prompt template.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.input_utils.jsonl_read_fn
# `{instruction}` in the template is presumably filled with the `instruction`
# value below — confirm in the dataset class.
template: "{question}\n\nPlease put your final answer within {instruction}."
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator pointed at a completions endpoint on `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: question_number
flush_file: ${flush_file}
save_best: False
step:
exp_name:
exp_notes:
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the output path and the names of the
# index / label fields. `resume: True` presumably continues from existing
# output — confirm in `MathScaleCallBack`.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "question_number"
label_field: "answer"
saved_keys: [ "question", "question_number", "data_source", "answer", "data_topic" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
# (the valueless keys below are null here; presumably populated at runtime — confirm)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mwp-bench/mistral/dev_0shot_v1_0.yaml
================================================
# Hydra config (see `defaults` / `hydra.searchpath`) for 0-shot generation over
# the MWPBench `train_wo_gsm_2k.json` file, with the prompt wrapped in
# `[INST]...[/INST]` markers (see `template`).
# Keys with no value (e.g. `model`, `exp_name`) are null in this file;
# presumably they are supplied as overrides at launch time — confirm.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Path roots; `test_file` and `output_dir` below interpolate from these.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
train_file:
dev_file:
test_file: ${data_path_prefix}/MWPBench/data/train_wo_gsm_2k.json
port: 6000
model:
# Decoding settings: one sample per prompt (n: 1) at temperature 0.0.
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the empty-string entries in `stop` look like tokens that were
# lost somewhere (possibly markup-like special tokens) — confirm the intended stop set.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>" ]
top_p: 1.0
# Short aliases of the sampling settings; reused in `output_file` and in the
# request-generator settings further down.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding controls: both values are forwarded to the dataset settings below
# and embedded in `suffix`, which in turn is part of the output file name.
split_size: 8
split_id: 0
max_num_seqs: 32
global_batch_size: 256
suffix: ${split_id}-of-${split_size}
#output_file: ${output_dir}/mathscale/${eval_sub_path}/train.v60.300k.2-of-30.v1.2.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
output_file: ${output_dir}/mwpbench/${eval_sub_path}/train_wo_gsm.2k.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json # Align the implementation with Geyang: https://github.com/XingxingZhang/math_step/blob/iter_dev/dataset/pseudo/utils.py#L282-L328
# `output_file` ends in `.json`, so appending `l` yields a `.jsonl` path.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset construction and prompt template.
# NOTE(review): no `read_fn` is configured here, so whatever default reader the
# dataset class has (if any) applies — confirm it can read `test_file`.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# `{instruction}` in the template is presumably filled with the `instruction`
# value below — confirm in the dataset class.
template: "[INST]{question}\n\nPlease put your final answer within {instruction}.[/INST]"
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator pointed at a completions endpoint on `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: question_number
flush_file: ${flush_file}
save_best: False
step:
exp_name:
exp_notes:
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the output path and the names of the
# index / label fields. `resume: True` presumably continues from existing
# output — confirm in `MathScaleCallBack`.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "question_number"
label_field: "answer"
saved_keys: [ "question", "question_number", "data_source", "answer", "data_topic"]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
# (the valueless keys below are null here; presumably populated at runtime — confirm)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mwp-bench/mistral/test_0shot_v1_0.yaml
================================================
# Hydra config (see `defaults` / `hydra.searchpath`) for 0-shot generation over
# the MWPBench `full_test.json` file, with the prompt wrapped in
# `[INST]...[/INST]` markers (see `template`).
# Keys with no value (e.g. `model`, `exp_name`) are null in this file;
# presumably they are supplied as overrides at launch time — confirm.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Path roots; `test_file` and `output_dir` below interpolate from these.
mount_dir: /mnt/fangkai_blob/
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
train_file:
dev_file:
test_file: ${data_path_prefix}/MWPBench/data/full_test.json
port: 6000
model:
# Decoding settings: one sample per prompt (n: 1) at temperature 0.0.
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the empty-string entries in `stop` look like tokens that were
# lost somewhere (possibly markup-like special tokens) — confirm the intended stop set.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "", "<|end_of_text|>" ]
top_p: 1.0
# Short aliases of the sampling settings; reused in `output_file` and in the
# request-generator settings further down.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding controls: both values are forwarded to the dataset settings below
# and embedded in `suffix`, which in turn is part of the output file name.
split_size: 8
split_id: 0
max_num_seqs: 32
global_batch_size: 256
suffix: ${split_id}-of-${split_size}
output_file: ${output_dir}/mwpbench/${eval_sub_path}/full_test.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json # Align the implementation with Geyang: https://github.com/XingxingZhang/math_step/blob/iter_dev/dataset/pseudo/utils.py#L282-L328
# `output_file` ends in `.json`, so appending `l` yields a `.jsonl` path.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset construction: a reader and a prompt template.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
read_fn:
_target_: data.input_utils.jsonl_read_fn
# `{instruction}` in the template is presumably filled with the `instruction`
# value below — confirm in the dataset class.
template: "[INST]{question}\n\nPlease put your final answer within {instruction}.[/INST]"
instruction: "\\boxed{}"
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator pointed at a completions endpoint on `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: question_number
flush_file: ${flush_file}
save_best: False
step:
exp_name:
exp_notes:
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the output path and the names of the
# index / label fields. `resume: True` presumably continues from existing
# output — confirm in `MathScaleCallBack`.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: True
index_field: "question_number"
label_field: "answer"
saved_keys: [ "question", "question_number", "data_source", "answer", "data_topic" ]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
# (the valueless keys below are null here; presumably populated at runtime — confirm)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/mwp-bench/mistral_dev_0shot_v1_0.yaml
================================================
# Hydra config (see `defaults` / `hydra.searchpath`) for 0-shot generation over
# the MWPBench `train_wo_gsm_2k.json` file, using an
# "### Instruction: / ### Response:" style prompt (see `template`).
# Keys with no value (e.g. `model`, `exp_name`) are null in this file;
# presumably they are supplied as overrides at launch time — confirm.
defaults:
- hydra: default
- _self_
hydra:
searchpath:
- file://conf/
# Path roots; `test_file` and `output_dir` below interpolate from these.
mount_dir: /mnt/fangkai_blob
data_path_prefix: ${mount_dir}/share/
model_path_prefix: ${mount_dir}/share/models # ../pretrained-models/
output_path_prefix: ${mount_dir}/reward_modeling/
train_file:
dev_file:
test_file: ${data_path_prefix}/MWPBench/data/train_wo_gsm_2k.json
port: 6000
model:
# Decoding settings: one sample per prompt (n: 1) at temperature 0.0.
sampling_params:
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
max_tokens: 4096
# NOTE(review): the empty-string entries in `stop` look like tokens that were
# lost somewhere (possibly markup-like special tokens) — confirm the intended stop set.
stop: [ "", "\n\n\n\n", "### Instruction", "<|end▁of▁sentence|>", "", "" ]
# NOTE(review): with temperature 0.0, `top_p: 0.9` presumably has no effect on
# decoding; it still appears in the output file name via `${top_p}` — confirm intent.
top_p: 0.9
# Short aliases of the sampling settings; reused in `output_file` and in the
# request-generator settings further down.
tem: ${sampling_params.temperature}
n: ${sampling_params.n}
top_p: ${sampling_params.top_p}
# Sharding controls: both values are forwarded to the dataset settings below
# and embedded in `suffix`, which in turn is part of the output file name.
split_size: 8
split_id: 0
max_num_seqs: 32
suffix: ${split_id}-of-${split_size}
#output_file: ${output_dir}/mathscale/${eval_sub_path}/train.v60.300k.2-of-30.v1.2.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json
output_file: ${output_dir}/mwpbench/${eval_sub_path}/train_wo_gsm.2k.v1.0.0shot.n${n}.tem${tem}.p${top_p}.${suffix}.json # Align the implementation with Geyang: https://github.com/XingxingZhang/math_step/blob/iter_dev/dataset/pseudo/utils.py#L282-L328
# `output_file` ends in `.json`, so appending `l` yields a `.jsonl` path.
flush_file: ${output_file}l
apply_chat_template: False
add_generation_prompt: True
# Dataset construction and prompt template. The commented-out `aligner`
# section below is a disabled alternative kept for reference.
read_tensor:
_target_: data.combine_dataset.ResponseAlignDataset
# aligner:
# _target_: data.input_aligner.concat_aligner
# aligners:
# - _target_: data.mathscale.util.mathscale_extract_answer_fn_v3
# completion_field: completion
## kv_mapping:
## instruction: question
# - _target_: data.input_aligner.add_id_aligner
# id_field: id
template: "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{question}\n\n### Response:"
# `instruction` is left null; the template above has no `{instruction}` placeholder.
instruction:
split_size: ${split_size}
split_id: ${split_id}
service_based: False
# Request generator pointed at a completions endpoint on `port`.
service_processor:
_target_: data.vllm.VLLMRequestGenerator
api_url: http://0.0.0.0:${port}/v1/completions
max_tokens: ${sampling_params.max_tokens}
model: ${model}
stop: ${sampling_params.stop}
n: ${n}
temperature: ${tem}
top_p: ${top_p}
index_field: question_number
save_best: False
step:
exp_name:
exp_notes:
output_dir: ${output_path_prefix}/experiments/${exp_name}/
eval_sub_path: ""
# Dataloader
num_workers: 8
prefetch_factor: 2
dp_size:
tp_size: 1
pp_size: 1
# Post-processing callback; it receives the output path and the names of the
# index / label fields. `resume` is False in this config.
post_process:
_target_: post_processors.openai_api_callback.MathScaleCallBack
answer_clean:
output_file: ${output_file}
resume: False
index_field: "question_number"
label_field: "answer"
saved_keys: [ "question", "question_number", "data_source", "answer", "data_topic"]
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
ddp_eval: False
no_cuda: False
seed: 42
local_rank: -1
# Temporary variables
# (the valueless keys below are null here; presumably populated at runtime — confirm)
fp16: True
fp16_bfloat16: True
n_gpu: 1
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/api/vllm/vllm_params/sampling_param_greedy.yaml
================================================
# vLLM SamplingParams preset: a single sample (n: 1) at temperature 0.0,
# with `max_tokens` capped at 2048.
_target_: vllm.SamplingParams
n: 1
temperature: 0.0
# NOTE(review): the empty-string stop entry looks like a token that was lost
# somewhere (possibly a markup-like special token) — confirm the intended value.
stop: [ "", "\n\n\n\n" ]
max_tokens: 2048
================================================
FILE: PFPO/conf/api/vllm/vllm_params/sampling_param_sample.yaml
================================================
# vLLM SamplingParams preset: five samples per prompt (n: 5) at temperature 1.0,
# with `max_tokens` capped at 2048.
_target_: vllm.SamplingParams
n: 5
temperature: 1.0
# NOTE(review): the empty-string stop entry looks like a token that was lost
# somewhere (possibly a markup-like special token) — confirm the intended value.
stop: [ "", "\n\n\n\n" ]
max_tokens: 2048
================================================
FILE: PFPO/conf/deepspeed/fp16.yaml
================================================
# DeepSpeed precision fragment: bf16 is disabled and fp16 is enabled.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
# Per the DeepSpeed configuration docs, a `loss_scale` of 0 selects dynamic
# loss scaling; the keys that follow tune that dynamic scaler.
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero0.yaml
================================================
# DeepSpeed config: ZeRO stage 0, bf16 enabled, AdamW optimizer with a
# WarmupDecayLR scheduler.
# Keys with no value are null in this file; presumably the training code fills
# them in before DeepSpeed is initialised — confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupDecayLR
params:
total_num_steps:
# `${learning_rate}` is not defined in this file; it is resolved from outside.
warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
optimizer:
type: AdamW
params:
# NOTE(review): `lr` is a literal while `warmup_max_lr` above interpolates
# `${learning_rate}`; confirm which value is authoritative at runtime.
lr: 1e-4
betas: [ 0.9, 0.999 ]
eps: 1e-6
weight_decay: 0.0
bf16:
enabled: True
zero_optimization:
stage: 0
# The commented-out keys below (offload, activation checkpointing, stage-3
# tuning) are disabled options kept for reference.
# offload_optimizer:
# device: cpu
# pin_memory: True
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
# stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
# stage3_max_live_parameters: 1e8 # (3e7, 1e9)
# stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
# memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
# The hybrid-engine section below is disabled; its values simply repeat the key
# names, i.e. they are placeholders rather than real settings.
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero1.yaml
================================================
# DeepSpeed config: ZeRO stage 1, bf16 enabled, AdamW optimizer with a
# WarmupDecayLR scheduler.
# Keys with no value are null in this file; presumably the training code fills
# them in before DeepSpeed is initialised — confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupDecayLR
params:
total_num_steps:
# `${learning_rate}` is not defined in this file; it is resolved from outside.
warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
optimizer:
type: AdamW
params:
# NOTE(review): `lr` is a literal while `warmup_max_lr` above interpolates
# `${learning_rate}`; confirm which value is authoritative at runtime.
lr: 1e-4
betas: [ 0.9, 0.999 ]
eps: 1e-6
weight_decay: 0.0
bf16:
enabled: True
zero_optimization:
stage: 1
# The commented-out keys below (offload, activation checkpointing) are disabled
# options kept for reference.
# offload_optimizer:
# device: cpu
# pin_memory: True
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
# NOTE(review): the `stage3_*` keys are set although `stage` is 1; presumably
# they only matter for ZeRO stage 3 — confirm.
stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
stage3_max_live_parameters: 1e8 # (3e7, 1e9)
stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
# The hybrid-engine section below is disabled; its values simply repeat the key
# names, i.e. they are placeholders rather than real settings.
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero1_cosine.yaml
================================================
# DeepSpeed config: ZeRO stage 1, bf16 enabled, AdamW optimizer with a
# WarmupCosineLR scheduler.
# Keys with no value are null in this file; presumably the training code fills
# them in before DeepSpeed is initialised — confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupCosineLR # requires deepspeed >= 0.12.3
params:
total_num_steps:
# `warmup_max_lr` is deliberately left commented out for this scheduler type.
# warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
optimizer:
type: AdamW
params:
# NOTE(review): `lr` is a literal here; confirm whether the training code
# overrides it with the experiment's learning rate.
lr: 1e-4
betas: [ 0.9, 0.999 ]
eps: 1e-6
weight_decay: 0.0
bf16:
enabled: True
zero_optimization:
stage: 1
# The commented-out keys below (offload, activation checkpointing) are disabled
# options kept for reference.
# offload_optimizer:
# device: cpu
# pin_memory: True
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
# NOTE(review): the `stage3_*` keys are set although `stage` is 1; presumably
# they only matter for ZeRO stage 3 — confirm.
stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
stage3_max_live_parameters: 1e8 # (3e7, 1e9)
stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
# The hybrid-engine section below is disabled; its values simply repeat the key
# names, i.e. they are placeholders rather than real settings.
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero1_lr.yaml
================================================
# DeepSpeed config: ZeRO stage 1, bf16 enabled, AdamW optimizer with a
# WarmupLR scheduler (no `total_num_steps` key is given for this scheduler).
# Keys with no value are null in this file; presumably the training code fills
# them in before DeepSpeed is initialised — confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupLR
params:
# `${learning_rate}` is not defined in this file; it is resolved from outside.
warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
optimizer:
type: AdamW
params:
# NOTE(review): `lr` is a literal while `warmup_max_lr` above interpolates
# `${learning_rate}`; confirm which value is authoritative at runtime.
lr: 1e-4
betas: [ 0.9, 0.999 ]
eps: 1e-6
weight_decay: 0.0
bf16:
enabled: True
zero_optimization:
stage: 1
# The commented-out keys below (offload, activation checkpointing) are disabled
# options kept for reference.
# offload_optimizer:
# device: cpu
# pin_memory: True
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
# NOTE(review): the `stage3_*` keys are set although `stage` is 1; presumably
# they only matter for ZeRO stage 3 — confirm.
stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
stage3_max_live_parameters: 1e8 # (3e7, 1e9)
stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
# The hybrid-engine section below is disabled; its values simply repeat the key
# names, i.e. they are placeholders rather than real settings.
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero1_optim_offload.yaml
================================================
# DeepSpeed config: ZeRO stage 1 with the optimizer offloaded to CPU
# (`offload_optimizer`), bf16 enabled, AdamW with a WarmupDecayLR scheduler.
# Keys with no value are null in this file; presumably the training code fills
# them in before DeepSpeed is initialised — confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupDecayLR
params:
total_num_steps:
# `${learning_rate}` is not defined in this file; it is resolved from outside.
warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
optimizer:
type: AdamW
params:
# NOTE(review): `lr` is a literal while `warmup_max_lr` above interpolates
# `${learning_rate}`; confirm which value is authoritative at runtime.
lr: 1e-4
betas: [ 0.9, 0.999 ]
eps: 1e-6
weight_decay: 0.0
bf16:
enabled: True
zero_optimization:
stage: 1
offload_optimizer:
device: cpu
pin_memory: True
# The commented-out keys below (parameter offload, activation checkpointing)
# are disabled options kept for reference.
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
# NOTE(review): the `stage3_*` keys are set although `stage` is 1; presumably
# they only matter for ZeRO stage 3 — confirm.
stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
stage3_max_live_parameters: 1e8 # (3e7, 1e9)
stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
# The hybrid-engine section below is disabled; its values simply repeat the key
# names, i.e. they are placeholders rather than real settings.
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero1_optim_offload_cosine.yaml
================================================
# DeepSpeed config: ZeRO stage 1 with the optimizer offloaded to CPU
# (`offload_optimizer`), bf16 enabled, AdamW with a WarmupCosineLR scheduler.
# Keys with no value are null in this file; presumably the training code fills
# them in before DeepSpeed is initialised — confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupCosineLR # requires deepspeed >= 0.12.3
params:
total_num_steps:
# `warmup_max_lr` is deliberately left commented out for this scheduler type.
# warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
optimizer:
type: AdamW
params:
# NOTE(review): `lr` is a literal here; confirm whether the training code
# overrides it with the experiment's learning rate.
lr: 1e-4
betas: [ 0.9, 0.999 ]
eps: 1e-6
weight_decay: 0.0
bf16:
enabled: True
zero_optimization:
stage: 1
offload_optimizer:
device: cpu
pin_memory: True
# The commented-out keys below (parameter offload, activation checkpointing)
# are disabled options kept for reference.
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
# NOTE(review): the `stage3_*` keys are set although `stage` is 1; presumably
# they only matter for ZeRO stage 3 — confirm.
stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
stage3_max_live_parameters: 1e8 # (3e7, 1e9)
stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
# The hybrid-engine section below is disabled; its values simply repeat the key
# names, i.e. they are placeholders rather than real settings.
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero1_optim_offload_lr.yaml
================================================
# DeepSpeed config: ZeRO stage 1 with the optimizer offloaded to CPU
# (`offload_optimizer`), bf16 enabled, AdamW with a WarmupLR scheduler
# (no `total_num_steps` key is given for this scheduler).
# Keys with no value are null in this file; presumably the training code fills
# them in before DeepSpeed is initialised — confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupLR
params:
# `${learning_rate}` is not defined in this file; it is resolved from outside.
warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
optimizer:
type: AdamW
params:
# NOTE(review): `lr` is a literal while `warmup_max_lr` above interpolates
# `${learning_rate}`; confirm which value is authoritative at runtime.
lr: 1e-4
betas: [ 0.9, 0.999 ]
eps: 1e-6
weight_decay: 0.0
bf16:
enabled: True
zero_optimization:
stage: 1
offload_optimizer:
device: cpu
pin_memory: True
# The commented-out keys below (parameter offload, activation checkpointing)
# are disabled options kept for reference.
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
# NOTE(review): the `stage3_*` keys are set although `stage` is 1; presumably
# they only matter for ZeRO stage 3 — confirm.
stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
stage3_max_live_parameters: 1e8 # (3e7, 1e9)
stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
# The hybrid-engine section below is disabled; its values simply repeat the key
# names, i.e. they are placeholders rather than real settings.
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero1_wo_optim.yaml
================================================
# DeepSpeed config: ZeRO stage 1, bf16 enabled, WarmupDecayLR scheduler, and
# no `optimizer` section. `zero_allow_untested_optimizer: True` is set instead;
# presumably the optimizer is constructed by the training code — confirm.
# Keys with no value are null in this file; presumably the training code fills
# them in before DeepSpeed is initialised — confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupDecayLR
params:
total_num_steps:
# `${learning_rate}` is not defined in this file; it is resolved from outside.
warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
bf16:
enabled: True
zero_optimization:
stage: 1
# The commented-out keys below (offload, activation checkpointing) are disabled
# options kept for reference.
# offload_optimizer:
# device: cpu
# pin_memory: True
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
# NOTE(review): the `stage3_*` keys are set although `stage` is 1; presumably
# they only matter for ZeRO stage 3 — confirm.
stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
stage3_max_live_parameters: 1e8 # (3e7, 1e9)
stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
zero_allow_untested_optimizer: True
# The hybrid-engine section below is disabled; its values simply repeat the key
# names, i.e. they are placeholders rather than real settings.
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero2.yaml
================================================
# DeepSpeed config: ZeRO stage 2, bf16 enabled, AdamW optimizer with a
# WarmupDecayLR scheduler.
# Keys with no value are null in this file; presumably the training code fills
# them in before DeepSpeed is initialised — confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupDecayLR
params:
total_num_steps:
# `${learning_rate}` is not defined in this file; it is resolved from outside.
warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
optimizer:
type: AdamW
params:
# NOTE(review): `lr` is a literal while `warmup_max_lr` above interpolates
# `${learning_rate}`; confirm which value is authoritative at runtime.
lr: 1e-4
betas: [ 0.9, 0.999 ]
eps: 1e-6
weight_decay: 0.0
bf16:
enabled: True
zero_optimization:
stage: 2
# The commented-out keys below (offload, activation checkpointing) are disabled
# options kept for reference.
# offload_optimizer:
# device: cpu
# pin_memory: True
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
# NOTE(review): the `stage3_*` keys are set although `stage` is 2; presumably
# they only matter for ZeRO stage 3 — confirm.
stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
stage3_max_live_parameters: 1e8 # (3e7, 1e9)
stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
# The hybrid-engine section below is disabled; its values simply repeat the key
# names, i.e. they are placeholders rather than real settings.
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero2_cosine.yaml
================================================
# DeepSpeed config: ZeRO stage 2, bf16 enabled, AdamW optimizer with a
# WarmupCosineLR scheduler.
# Keys with no value are null in this file; presumably the training code fills
# them in before DeepSpeed is initialised — confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupCosineLR # requires deepspeed >= 0.12.3
params:
total_num_steps:
# `warmup_max_lr` is deliberately left commented out for this scheduler type.
# warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
optimizer:
type: AdamW
params:
# NOTE(review): `lr` is a literal here; confirm whether the training code
# overrides it with the experiment's learning rate.
lr: 1e-4
betas: [ 0.9, 0.999 ]
eps: 1e-6
weight_decay: 0.0
bf16:
enabled: True
zero_optimization:
stage: 2
# The commented-out keys below (offload, activation checkpointing) are disabled
# options kept for reference.
# offload_optimizer:
# device: cpu
# pin_memory: True
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
# NOTE(review): the `stage3_*` keys are set although `stage` is 2; presumably
# they only matter for ZeRO stage 3 — confirm.
stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
stage3_max_live_parameters: 1e8 # (3e7, 1e9)
stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
# The hybrid-engine section below is disabled; its values simply repeat the key
# names, i.e. they are placeholders rather than real settings.
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero2_lr.yaml
================================================
# DeepSpeed config: ZeRO stage 2, bf16 enabled, AdamW optimizer with a
# WarmupLR scheduler (`total_num_steps` is commented out for this scheduler).
# Keys with no value are null in this file; presumably the training code fills
# them in before DeepSpeed is initialised — confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupLR
params:
# total_num_steps:
# `${learning_rate}` is not defined in this file; it is resolved from outside.
warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
optimizer:
type: AdamW
params:
# NOTE(review): `lr` is a literal while `warmup_max_lr` above interpolates
# `${learning_rate}`; confirm which value is authoritative at runtime.
lr: 1e-4
betas: [ 0.9, 0.999 ]
eps: 1e-6
weight_decay: 0.0
bf16:
enabled: True
zero_optimization:
stage: 2
# The commented-out keys below (offload, activation checkpointing) are disabled
# options kept for reference.
# offload_optimizer:
# device: cpu
# pin_memory: True
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
# NOTE(review): the `stage3_*` keys are set although `stage` is 2; presumably
# they only matter for ZeRO stage 3 — confirm.
stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
stage3_max_live_parameters: 1e8 # (3e7, 1e9)
stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
# The hybrid-engine section below is disabled; its values simply repeat the key
# names, i.e. they are placeholders rather than real settings.
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero2_optim_offload.yaml
================================================
# DeepSpeed config: ZeRO stage 2 with the optimizer offloaded to CPU
# (`offload_optimizer`), bf16 enabled, AdamW with a WarmupDecayLR scheduler.
# Keys with no value are null in this file; presumably the training code fills
# them in before DeepSpeed is initialised — confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupDecayLR
params:
total_num_steps:
# `${learning_rate}` is not defined in this file; it is resolved from outside.
warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
optimizer:
type: AdamW
params:
# NOTE(review): `lr` is a literal while `warmup_max_lr` above interpolates
# `${learning_rate}`; confirm which value is authoritative at runtime.
lr: 1e-4
betas: [ 0.9, 0.999 ]
eps: 1e-6
weight_decay: 0.0
bf16:
enabled: True
zero_optimization:
stage: 2
offload_optimizer:
device: cpu
pin_memory: True
# The commented-out keys below (parameter offload, activation checkpointing)
# are disabled options kept for reference.
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
# NOTE(review): the `stage3_*` keys are set although `stage` is 2; presumably
# they only matter for ZeRO stage 3 — confirm.
stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
stage3_max_live_parameters: 1e8 # (3e7, 1e9)
stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
# The hybrid-engine section below is disabled; its values simply repeat the key
# names, i.e. they are placeholders rather than real settings.
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero2_optim_offload_cosine.yaml
================================================
# DeepSpeed settings: ZeRO stage 2, optimizer offloaded to CPU, WarmupCosineLR schedule.
# Keys written with an empty value are YAML nulls in this file.
# NOTE(review): presumably the nulls are filled in by the experiment config or the trainer before DeepSpeed starts -- confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupCosineLR # requires deepspeed >= 0.12.3
params:
total_num_steps:
# `warmup_max_lr` is deliberately left commented out in the cosine variants of these configs.
# warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
optimizer:
type: AdamW
params:
lr: 1e-4
betas: [ 0.9, 0.999 ]
eps: 1e-6
weight_decay: 0.0
bf16:
enabled: True
zero_optimization:
stage: 2
offload_optimizer:
device: cpu
pin_memory: True
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
# NOTE(review): the stage3_* keys below are present although `stage` is 2 in this file -- confirm they are simply ignored.
stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
stage3_max_live_parameters: 1e8 # (3e7, 1e9)
stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero3.yaml
================================================
# DeepSpeed settings: ZeRO stage 3 without any CPU offload (both offload sections are commented out), WarmupDecayLR schedule.
# Keys written with an empty value are YAML nulls in this file.
# NOTE(review): presumably the nulls are filled in by the experiment config or the trainer before DeepSpeed starts -- confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupDecayLR
params:
total_num_steps:
# `learning_rate` is not defined in this file; the interpolation has to resolve against the config this file is composed into.
warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
optimizer:
type: AdamW
params:
lr: 1e-4
betas: [ 0.9, 0.999 ]
eps: 1e-6
weight_decay: 0.0
bf16:
enabled: True
zero_optimization:
stage: 3
# offload_optimizer:
# device: cpu
# pin_memory: True
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
# The trailing parenthesised pairs are the author's notes; presumably suggested tuning ranges -- confirm.
stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
stage3_max_live_parameters: 1e8 # (3e7, 1e9)
stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero3_cosine.yaml
================================================
# DeepSpeed settings: ZeRO stage 3 without any CPU offload (both offload sections are commented out), WarmupCosineLR schedule.
# Keys written with an empty value are YAML nulls in this file.
# NOTE(review): presumably the nulls are filled in by the experiment config or the trainer before DeepSpeed starts -- confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupCosineLR # requires deepspeed >= 0.12.3
params:
total_num_steps:
# `warmup_max_lr` is deliberately left commented out in the cosine variants of these configs.
# warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
optimizer:
type: AdamW
params:
lr: 1e-4
betas: [ 0.9, 0.999 ]
eps: 1e-6
weight_decay: 0.0
bf16:
enabled: True
zero_optimization:
stage: 3
# offload_optimizer:
# device: cpu
# pin_memory: True
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
# The trailing parenthesised pairs are the author's notes; presumably suggested tuning ranges -- confirm.
stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
stage3_max_live_parameters: 1e8 # (3e7, 1e9)
stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero3_optim_offload.yaml
================================================
# DeepSpeed settings: ZeRO stage 3, optimizer offloaded to CPU, WarmupDecayLR schedule.
# Keys written with an empty value are YAML nulls in this file.
# NOTE(review): presumably the nulls are filled in by the experiment config or the trainer before DeepSpeed starts -- confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupDecayLR
params:
total_num_steps:
# `learning_rate` is not defined in this file; the interpolation has to resolve against the config this file is composed into.
warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
optimizer:
type: AdamW
params:
lr: 1e-4
betas: [ 0.9, 0.999 ]
eps: 1e-6
weight_decay: 0.0
bf16:
enabled: True
zero_optimization:
stage: 3
offload_optimizer:
device: cpu
pin_memory: True
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
overlap_comm: True
contiguous_gradients: True
# NOTE(review): `auto` is a string, not a number -- confirm that whatever consumes this config resolves it before DeepSpeed validates the value.
reduce_bucket_size: auto
# The trailing parenthesised pairs are the author's notes; presumably suggested tuning ranges -- confirm.
stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
stage3_max_live_parameters: 1e8 # (3e7, 1e9)
stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/deepspeed/train_hybrid_engine_zero3_optim_offload_cosine.yaml
================================================
# DeepSpeed settings: ZeRO stage 3, optimizer offloaded to CPU, WarmupCosineLR schedule.
# Keys written with an empty value are YAML nulls in this file.
# NOTE(review): presumably the nulls are filled in by the experiment config or the trainer before DeepSpeed starts -- confirm.
train_micro_batch_size_per_gpu:
gradient_accumulation_steps:
scheduler:
type: WarmupCosineLR # requires deepspeed >= 0.12.3
params:
total_num_steps:
# `warmup_max_lr` is deliberately left commented out in the cosine variants of these configs.
# warmup_max_lr: ${learning_rate}
warmup_num_steps:
warmup_type: linear
optimizer:
type: AdamW
params:
lr: 1e-4
betas: [ 0.9, 0.999 ]
eps: 1e-6
weight_decay: 0.0
bf16:
enabled: True
zero_optimization:
stage: 3
offload_optimizer:
device: cpu
pin_memory: True
# offload_param:
# device: cpu
# pin_memory: True
# activation_checkpointing:
# partition_activations: True
# cpu_checkpointing: True
# contiguous_memory_optimization: False
# number_checkpoints: False
# synchronize_checkpoint_boundary: False
# profile: False
# zero_quantized_nontrainable_weights: False # If `enable_mixed_precision_lora` is True, this should be True
# The trailing parenthesised pairs are the author's notes; presumably suggested tuning ranges -- confirm.
stage3_param_persistence_threshold: 1e5 # (1e4,1e6)
stage3_max_live_parameters: 1e8 # (3e7, 1e9)
stage3_prefetch_bucket_size: 1e8 # (3e7, 5e8)
memory_efficient_linear: False
steps_per_print: 25
gradient_clipping: 1.0
prescale_gradients: False
#wall_clock_breakdown: False
#hybrid_engine:
# enabled: True
# max_out_tokens: max_out_tokens
# inference_tp_size: inference_tp_size
# release_inference_cache: release_inference_cache
# pin_parameters: pin_parameters
# tp_gather_partition_size: tp_gather_partition_size
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/dpo/orig-pseudo-v1.0-a100.yaml
================================================
# Hydra experiment config: DPO training of deepseek-coder-7b-instruct-v1.5 on APPS
# with preference pairs built from pseudo test cases (see `exp_name` and `extra_file` further down).
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
# Mounts the selected file of the `deepspeed` config group under the `ds_cfg` key.
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
# Path prefixes below are absolute mount points specific to the author's environment.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Shared dtype node, referenced as ${torch_dtype} by both the policy model and the reference model.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model; the reference model is declared inline under `ref_model`.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
torch_dtype: ${torch_dtype}
# NOTE(review): presumably the tokenizer's padding token id -- verify against the tokenizer files.
pad_token_id: 100001
# device_map: ${device_map}
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
device_map: ${device_map}
pad_token_id: 100001
# Chat-template pieces; they are stitched together by the `compositions` entries of the read_tensor template.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training data pipeline: dataset reader, aligners that attach the preference pairs, and the prompt template.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsReader
split: train
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# NOTE(review): presumably joins each APPS problem (`problem_id`) with the record of the same `id` in `extra_file` and copies its pos/neg fields -- confirm in data.input_aligner.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.v1.0.pseudo_test_case.exec.sc.dpo_v1.0.json
- _target_: data.input_aligner.dpo_pair_aligner
pos_field: pos
neg_field: neg
template:
_target_: data.input_utils.recompose_template
# `units` are the named pieces; `compositions` are format strings built from those names.
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Run identity, training hyper-parameters and evaluation hooks.
# NOTE(review): the next comment line looks like a leftover from another experiment -- confirm before relying on it.
# Wiki path pretrain v8.2
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.pseudo.A100.w8.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# per_gpu_train_batch_size, gradient_accumulation_steps, learning_rate and weight_decay are re-used by the ds_cfg section via interpolation.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ] -- confirm which of the two settings the trainer uses.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
# NOTE(review): both an epoch count and a step cap are set -- confirm which one the trainer gives precedence.
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides for the DeepSpeed config mounted at `ds_cfg` by the defaults list.
# Batch size, accumulation steps, learning rate and weight decay are taken from this file's top-level hyper-parameters.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# The precision and offload overrides below are disabled here, so those settings come from the mounted DeepSpeed file.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Metric logging helper (the target class is named WandbWriter).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Empty (YAML null) placeholders; presumably populated by the launcher at runtime -- confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/dpo/orig-v1.0-v100.yaml
================================================
# Hydra experiment config: DPO training of deepseek-coder-7b-instruct-v1.5 on APPS, V100 variant (see `exp_name` further down).
defaults:
- hydra: default
# Mounts the selected file of the `deepspeed` config group under the `ds_cfg` key.
- deepspeed@ds_cfg: train_hybrid_engine_zero3_optim_offload
# - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: ""
# The prefix ends with "/" and the interpolations that use it add another "/", so resolved paths contain "//".
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Shared dtype node, referenced as ${torch_dtype} by both the policy model and the reference model.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model; the reference model is declared inline under `ref_model`.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
# NOTE(review): presumably the tokenizer's padding token id -- verify against the tokenizer files.
pad_token_id: 100001
device_map: ${device_map}
# The reference model is built through init_ds_eval_engine, which receives the inner `model` node and the composed `ds_cfg`.
ref_model:
_target_: models.ds_utils.init_ds_eval_engine
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "eager"
device_map: ${device_map}
pad_token_id: 100001
ds_cfg: ${ds_cfg}
# Chat-template pieces; they are stitched together by the `compositions` entries of the read_tensor template.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training data pipeline: dataset reader, aligners that attach the preference pairs, and the prompt template.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsReader
split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# NOTE(review): presumably joins each APPS problem (`problem_id`) with the record of the same `id` in `extra_file` and copies its pos/neg fields -- confirm in data.input_aligner.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.dpo_v1.0.json
- _target_: data.input_aligner.dpo_pair_aligner
pos_field: pos
neg_field: neg
template:
_target_: data.input_utils.recompose_template
# `units` are the named pieces; `compositions` are format strings built from those names.
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# Run identity, training hyper-parameters and evaluation hooks.
# NOTE(review): the next comment line looks like a leftover from another experiment -- confirm before relying on it.
# Wiki path pretrain v8.2
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.V100.w8.v1.0
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# per_gpu_train_batch_size, gradient_accumulation_steps, learning_rate and weight_decay are re-used by the ds_cfg section via interpolation.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ] -- confirm which of the two settings the trainer uses.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
# NOTE(review): presumably 0 means "no step cap, train for num_train_epochs" -- confirm in the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 200
save_best: False
eval_steps: 200
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides for the DeepSpeed config mounted at `ds_cfg` by the defaults list.
# Batch size, accumulation steps, learning rate and weight decay are taken from this file's top-level hyper-parameters.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# Switch precision from bf16 to fp16 for this run.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
# Key was previously spelled with a stray trailing double quote, so `loss_scale` itself was never set.
# Per the DeepSpeed config docs, 0 selects dynamic loss scaling.
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Metric logging helper (the target class is named WandbWriter).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Empty (YAML null) placeholders; presumably populated by the launcher at runtime -- confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/dpo/orig-v1.1-v100-tp2.yaml
================================================
# Hydra experiment config: DPO training of deepseek-coder-7b-instruct-v1.5 on APPS, V100 variant with tensor parallel size 2 (see `tp_size` and `exp_name` further down).
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
# Mounts the selected file of the `deepspeed` config group under the `ds_cfg` key.
- deepspeed@ds_cfg: train_hybrid_engine_zero2_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: ""
# The prefix ends with "/" and the interpolations that use it add another "/", so resolved paths contain "//".
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Shared dtype node, referenced as ${torch_dtype} by both the policy model and the reference model.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Defined but not referenced by the model nodes in this file (both `device_map` uses are commented out).
device_map:
_target_: models.utils.return_single_device_map
# Policy model from the `models.llama_tp` module; the reference model is declared inline under `ref_model`.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
# NOTE(review): presumably the tokenizer's padding token id -- verify against the tokenizer files.
pad_token_id: 100001
# device_map: ${device_map}
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: 100001
# Chat-template pieces; they are stitched together by the `compositions` entries of the read_tensor template.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training data pipeline: dataset reader, aligners that attach the preference pairs, and the prompt template.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsReader
split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# NOTE(review): presumably joins each APPS problem (`problem_id`) with the record of the same `id` in `extra_file` and copies its pos/neg fields -- confirm in data.input_aligner.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.dpo_v1.0.json
- _target_: data.input_aligner.dpo_pair_aligner
pos_field: pos
neg_field: neg
template:
_target_: data.input_utils.recompose_template
# `units` are the named pieces; `compositions` are format strings built from those names.
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# Run identity, training hyper-parameters and evaluation hooks.
# NOTE(review): the next comment line looks like a leftover from another experiment -- confirm before relying on it.
# Wiki path pretrain v8.2
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
dp_size:
tp_size: 2
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.V100.tp2.dp4.v1.1.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# per_gpu_train_batch_size, gradient_accumulation_steps, learning_rate and weight_decay are re-used by the ds_cfg section via interpolation.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
# NOTE(review): with the "dp4" in exp_name this would give 1 x 32 x 4 = 128 sequences per optimizer step -- confirm the launch uses 4 data-parallel ranks.
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ] -- confirm which of the two settings the trainer uses.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
# NOTE(review): both an epoch count and a step cap are set -- confirm which one the trainer gives precedence.
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides for the DeepSpeed config mounted at `ds_cfg` by the defaults list.
# Batch size, accumulation steps, learning rate and weight decay are taken from this file's top-level hyper-parameters.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# Switch precision from bf16 to fp16 for this run.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
# Key was previously spelled with a stray trailing double quote, so `loss_scale` itself was never set.
# Per the DeepSpeed config docs, 0 selects dynamic loss scaling.
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Metric logging helper (the target class is named WandbWriter).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Empty (YAML null) placeholders; presumably populated by the launcher at runtime -- confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/dpo/orig-v1.1-v100-tp4.yaml
================================================
# Hydra experiment config: DPO training of deepseek-coder-7b-instruct-v1.5 on APPS, V100 variant with tensor parallel size 4 (see `tp_size` and `exp_name` further down).
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
# Mounts the selected file of the `deepspeed` config group under the `ds_cfg` key.
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: ""
# The prefix ends with "/" and the interpolations that use it add another "/", so resolved paths contain "//".
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Shared dtype node, referenced as ${torch_dtype} by both the policy model and the reference model.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Defined but not referenced by the model nodes in this file (both `device_map` uses are commented out).
device_map:
_target_: models.utils.return_single_device_map
# Policy model from the `models.llama_tp` module; the reference model is declared inline under `ref_model`.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
# NOTE(review): presumably the tokenizer's padding token id -- verify against the tokenizer files.
pad_token_id: 100001
# device_map: ${device_map}
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: 100001
# Chat-template pieces; they are stitched together by the `compositions` entries of the read_tensor template.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training data pipeline: dataset reader, aligners that attach the preference pairs, and the prompt template.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsReader
split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# NOTE(review): presumably joins each APPS problem (`problem_id`) with the record of the same `id` in `extra_file` and copies its pos/neg fields -- confirm in data.input_aligner.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.dpo_v1.0.json
- _target_: data.input_aligner.dpo_pair_aligner
pos_field: pos
neg_field: neg
template:
_target_: data.input_utils.recompose_template
# `units` are the named pieces; `compositions` are format strings built from those names.
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# Run identity, training hyper-parameters and evaluation hooks.
# NOTE(review): the next comment line looks like a leftover from another experiment -- confirm before relying on it.
# Wiki path pretrain v8.2
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
dp_size:
tp_size: 4
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.V100.tp4.dp2.v1.1.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# per_gpu_train_batch_size, gradient_accumulation_steps, learning_rate and weight_decay are re-used by the ds_cfg section via interpolation.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
# NOTE(review): with the "dp2" in exp_name this would give 1 x 64 x 2 = 128 sequences per optimizer step -- confirm the launch uses 2 data-parallel ranks.
gradient_accumulation_steps: 64
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ] -- confirm which of the two settings the trainer uses.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
# NOTE(review): both an epoch count and a step cap are set -- confirm which one the trainer gives precedence.
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides for the DeepSpeed config mounted at `ds_cfg` by the defaults list.
# Batch size, accumulation steps, learning rate and weight decay are taken from this file's top-level hyper-parameters.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# Switch precision from bf16 to fp16 for this run.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
# Key was previously spelled with a stray trailing double quote, so `loss_scale` itself was never set.
# Per the DeepSpeed config docs, 0 selects dynamic loss scaling.
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Metric logging helper (the target class is named WandbWriter).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Empty (YAML null) placeholders; presumably populated by the launcher at runtime -- confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/dpo/orig-v1.1-v100.yaml
================================================
# Hydra experiment config: DPO training of deepseek-coder-7b-instruct-v1.5 on APPS, V100 variant v1.1 (see `exp_name` further down).
defaults:
- hydra: default
# Mounts the selected file of the `deepspeed` config group under the `ds_cfg` key.
- deepspeed@ds_cfg: train_hybrid_engine_zero3
# - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: ""
# The prefix ends with "/" and the interpolations that use it add another "/", so resolved paths contain "//".
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Shared dtype node, referenced as ${torch_dtype} by both the policy model and the reference model.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model; the reference model is declared inline under `ref_model`.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
# NOTE(review): presumably the tokenizer's padding token id -- verify against the tokenizer files.
pad_token_id: 100001
device_map: ${device_map}
# The reference model is built through init_ds_eval_engine, which receives the inner `model` node and the composed `ds_cfg`.
ref_model:
_target_: models.ds_utils.init_ds_eval_engine
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "eager"
device_map: ${device_map}
pad_token_id: 100001
ds_cfg: ${ds_cfg}
# Chat-template pieces; they are stitched together by the `compositions` entries of the read_tensor template.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training data pipeline: dataset reader, aligners that attach the preference pairs, and the prompt template.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsReader
split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# NOTE(review): presumably joins each APPS problem (`problem_id`) with the record of the same `id` in `extra_file` and copies its pos/neg fields -- confirm in data.input_aligner.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.dpo_v1.0.json
- _target_: data.input_aligner.dpo_pair_aligner
pos_field: pos
neg_field: neg
template:
_target_: data.input_utils.recompose_template
# `units` are the named pieces; `compositions` are format strings built from those names.
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# Run identity, training hyper-parameters and evaluation hooks.
# NOTE(review): the next comment line looks like a leftover from another experiment -- confirm before relying on it.
# Wiki path pretrain v8.2
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.V100.w8.v1.1
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# per_gpu_train_batch_size and gradient_accumulation_steps are re-used by the ds_cfg section via interpolation.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
# NOTE(review): presumably 0 means "no step cap, train for num_train_epochs" -- confirm in the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings. Micro-batch size and accumulation steps are interpolated from the
# top-level training hyper-parameters so the two places cannot drift apart.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  # DeepSpeed-side optimizer section, left disabled in this experiment:
  # optimizer:
  #   type: AdamW
  #   params:
  #     lr: ${learning_rate}
  #     betas: [ 0.9, 0.95 ]
  #     weight_decay: ${weight_decay}
  bf16:
    enabled: False
  fp16:
    enabled: True
    auto_cast: False
    # Fixed: this key was spelled `loss_scale"` (stray double quote), which YAML parses as a
    # differently named key, so the intended fp16 option was never set. Per the DeepSpeed
    # config docs, 0 selects dynamic loss scaling.
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: False
    min_loss_scale: 1
  # Optimizer-state CPU offload, left disabled:
  # zero_optimization:
  #   offload_optimizer:
  #     device: cpu
  #     pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/dpo/orig-v1.2-v100-tp4.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model plus a nested reference model; both are loaded from `model_name_or_path`'s
# family of settings and share dtype, attention implementation and pad token id.
model:
  _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.1  # NOTE(review): presumably the DPO beta coefficient -- confirm in models.llama_tp
  gradient_checkpointing: true
  attn_implementation: eager
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  # Explicit device placement is switched off for this config:
  # device_map: ${device_map}
  ref_model:
    _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: eager
    # device_map: ${device_map}
    pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training dataset description. Each `_target_` names the callable/class Hydra instantiates
# with the sibling keys as keyword arguments.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  # Source of the raw problems.
  read_fn:
    _target_: data.apps.APPsReader
    split: train
  # Two aligners wrapped by `concat_aligner`.
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Pulls the `pos` / `neg` fields out of `extra_file`.
      # NOTE(review): presumably joins `problem_id` against the file's `id` -- confirm in data.input_aligner.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: id
        extract_fields:
          - pos
          - neg
        extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.v1.0.dpo_v1.1.json
      - _target_: data.input_aligner.dpo_pair_aligner
        pos_field: pos
        neg_field: neg
  # Prompt / chosen / reject strings are composed from the named units below.
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Base model checkpoint to load (legacy note: "Wiki path pretrain v8.2")
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume: latest
dp_size:
tp_size: 4
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.V100.tp4.dp8.v1.2.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered on the `deepspeed@ds_cfg` entry chosen in `defaults`
# (`_self_` is listed last there, so the values in this file take precedence).
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  # Learning rate and weight decay follow the top-level hyper-parameters.
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  # Half precision through fp16; bf16 is explicitly off.
  bf16:
    enabled: false
  fp16:
    enabled: true
    auto_cast: false
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: false
    min_loss_scale: 1
  # Optimizer-state CPU offload override, left disabled:
  # zero_optimization:
  #   offload_optimizer:
  #     device: cpu
  #     pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/dpo/orig-v1.3-a100.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model plus a nested reference model, both using FlashAttention-2.
# Only the reference model receives an explicit `device_map`.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.1  # NOTE(review): presumably the DPO beta coefficient -- confirm in models.llama
  gradient_checkpointing: true
  attn_implementation: flash_attention_2
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  # Device placement for the policy model is switched off:
  # device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: flash_attention_2
    device_map: ${device_map}
    pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training dataset description. Each `_target_` names the callable/class Hydra instantiates
# with the sibling keys as keyword arguments.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  # Source of the raw problems; `train_sub_split` additionally selects a subset of the split.
  # NOTE(review): sub-split semantics live in data.apps.APPsReader -- confirm there.
  read_fn:
    _target_: data.apps.APPsReader
    split: train
    train_sub_split: train
  # Two aligners wrapped by `concat_aligner`.
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Pulls the `pos` / `neg` fields out of `extra_file`.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: id
        extract_fields:
          - pos
          - neg
        extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.v1.0.dpo_v1.1.json
      - _target_: data.input_aligner.dpo_pair_aligner
        pos_field: pos
        neg_field: neg
  # Prompt / chosen / reject strings are composed from the named units below.
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Base model checkpoint to load (legacy note: "Wiki path pretrain v8.2")
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.A100.w8.v1.3.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered on the `deepspeed@ds_cfg` entry chosen in `defaults`
# (`_self_` is listed last there, so the values in this file take precedence).
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  # Learning rate and weight decay follow the top-level hyper-parameters.
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  # No precision override here: the bf16/fp16 sections below are commented out, so this
  # file contributes nothing for them.
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1
  # Optimizer-state CPU offload override, also disabled:
  # zero_optimization:
  #   offload_optimizer:
  #     device: cpu
  #     pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/dpo/orig-v1.3-v100-tp4.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model plus a nested reference model, both built from the `models.llama_tp`
# implementation with eager attention.
model:
  _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.1  # NOTE(review): presumably the DPO beta coefficient -- confirm in models.llama_tp
  gradient_checkpointing: true
  attn_implementation: eager
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  # Explicit device placement is switched off for this config:
  # device_map: ${device_map}
  ref_model:
    _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: eager
    # device_map: ${device_map}
    pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training dataset description. Each `_target_` names the callable/class Hydra instantiates
# with the sibling keys as keyword arguments.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  # Source of the raw problems; `train_sub_split` additionally selects a subset of the split.
  # NOTE(review): sub-split semantics live in data.apps.APPsReader -- confirm there.
  read_fn:
    _target_: data.apps.APPsReader
    split: train
    train_sub_split: train
  # Two aligners wrapped by `concat_aligner`.
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Pulls the `pos` / `neg` fields out of `extra_file`.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: id
        extract_fields:
          - pos
          - neg
        extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.v1.0.dpo_v1.1.json
      - _target_: data.input_aligner.dpo_pair_aligner
        pos_field: pos
        neg_field: neg
  # Prompt / chosen / reject strings are composed from the named units below.
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Base model checkpoint to load (legacy note: "Wiki path pretrain v8.2")
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume: latest
dp_size:
tp_size: 4
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.V100.tp4.dp8.v1.3.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered on the `deepspeed@ds_cfg` entry chosen in `defaults`
# (`_self_` is listed last there, so the values in this file take precedence).
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  # Learning rate and weight decay follow the top-level hyper-parameters.
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  # Half precision through fp16; bf16 is explicitly off.
  bf16:
    enabled: false
  fp16:
    enabled: true
    auto_cast: false
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: false
    min_loss_scale: 1
  # Optimizer-state CPU offload override, left disabled:
  # zero_optimization:
  #   offload_optimizer:
  #     device: cpu
  #     pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/dpo/orig-v1.4-a100.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model plus a nested reference model, both using FlashAttention-2 and both
# given the shared `device_map`.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.1  # NOTE(review): presumably the DPO beta coefficient -- confirm in models.llama
  gradient_checkpointing: true
  attn_implementation: flash_attention_2
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: flash_attention_2
    device_map: ${device_map}
    pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training dataset description. Each `_target_` names the callable/class Hydra instantiates
# with the sibling keys as keyword arguments.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  # Source of the raw problems, with starter code enabled.
  # NOTE(review): how starter code / function names enter the prompt is defined in
  # data.apps.APPsWithFunctionName -- confirm there.
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: true
  # Two aligners wrapped by `concat_aligner`.
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Pulls the `pos` / `neg` fields out of `extra_file`.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: id
        extract_fields:
          - pos
          - neg
        extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.v1.0.dpo_v1.1.json
      - _target_: data.input_aligner.dpo_pair_aligner
        pos_field: pos
        neg_field: neg
  # Prompt / chosen / reject strings are composed from the named units below.
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Base model checkpoint to load (legacy note: "Wiki path pretrain v8.2")
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.A100.dp4.v1.4.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered on the `deepspeed@ds_cfg` entry chosen in `defaults`
# (`_self_` is listed last there, so the values in this file take precedence).
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  # Learning rate and weight decay follow the top-level hyper-parameters.
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  # No precision override here: the bf16/fp16 sections below are commented out, so this
  # file contributes nothing for them.
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1
  # Optimizer-state CPU offload override, also disabled:
  # zero_optimization:
  #   offload_optimizer:
  #     device: cpu
  #     pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/dpo/orig-v1.4-v100-tp4.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero2_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model plus a nested reference model, both built from the `models.llama_tp`
# implementation with eager attention.
model:
  _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.1  # NOTE(review): presumably the DPO beta coefficient -- confirm in models.llama_tp
  gradient_checkpointing: true
  attn_implementation: eager
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  # Explicit device placement is switched off for this config:
  # device_map: ${device_map}
  ref_model:
    _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: eager
    # device_map: ${device_map}
    pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training dataset description. Each `_target_` names the callable/class Hydra instantiates
# with the sibling keys as keyword arguments.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  # Source of the raw problems, with starter code enabled.
  # NOTE(review): how starter code / function names enter the prompt is defined in
  # data.apps.APPsWithFunctionName -- confirm there.
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: true
  # Two aligners wrapped by `concat_aligner`.
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Pulls the `pos` / `neg` fields out of `extra_file`.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: id
        extract_fields:
          - pos
          - neg
        extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.v1.0.dpo_v1.1.json
      - _target_: data.input_aligner.dpo_pair_aligner
        pos_field: pos
        neg_field: neg
  # Prompt / chosen / reject strings are composed from the named units below.
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Base model checkpoint to load (legacy note: "Wiki path pretrain v8.2")
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
dp_size:
tp_size: 4
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.dpo.V100.tp4.dp4.v1.4.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered on the `deepspeed@ds_cfg` entry chosen in `defaults`
# (`_self_` is listed last there, so the values in this file take precedence).
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  # Learning rate and weight decay follow the top-level hyper-parameters.
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  # Half precision through fp16; bf16 is explicitly off.
  bf16:
    enabled: false
  fp16:
    enabled: true
    auto_cast: false
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: false
    min_loss_scale: 1
  # Optimizer-state CPU offload override, left disabled:
  # zero_optimization:
  #   offload_optimizer:
  #     device: cpu
  #     pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/dpo/pseudo-sc-dpo-v1.0-v100-tp8.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model plus a nested reference model, both built from the `models.llama_tp`
# implementation with eager attention. This experiment sets `beta` to 0.4.
model:
  _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.4  # NOTE(review): presumably the DPO beta coefficient -- confirm in models.llama_tp
  gradient_checkpointing: true
  attn_implementation: eager
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  # Explicit device placement is switched off for this config:
  # device_map: ${device_map}
  ref_model:
    _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: eager
    # device_map: ${device_map}
    pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training dataset description. Each `_target_` names the callable/class Hydra instantiates
# with the sibling keys as keyword arguments.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  # Source of the raw problems, with starter code enabled.
  # NOTE(review): how starter code / function names enter the prompt is defined in
  # data.apps.APPsWithFunctionName -- confirm there.
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: true
  # Two aligners wrapped by `concat_aligner`.
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Pulls the `pos` / `neg` fields out of the pseudo-test preference file.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: id
        extract_fields:
          - pos
          - neg
        extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.clean.dpo_m2_low0.5.json
      # NOTE(review): this aligner's key is singular `extract_field`, unlike `extract_fields`
      # above; presumably that matches flat_aligner's signature -- confirm in data.input_aligner.
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field:
          - pos
          - neg
        mode: multi
  # Prompt / chosen / reject strings are composed from the named units below.
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Base model checkpoint to load (legacy note: "Wiki path pretrain v8.2")
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume: latest
dp_size:
tp_size: 8
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.pseudo-test-10.sc.dpo.V100.tp8.dp4.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings; batch size, accumulation steps, lr and weight decay are
# interpolated from the top-level keys of this file.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# fp16 mixed precision with bf16 off; per the DeepSpeed docs, loss_scale 0
# selects dynamic loss scaling.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Disabled alternative: offload optimizer state to CPU.
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Logging backend.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left null in the config; presumably populated at runtime — confirm in the trainer.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/dpo/pseudo-sc-dpo-v1.1-h100.yaml
================================================
# Hydra defaults list: the `default` option of the `hydra` group plus one
# DeepSpeed preset from the `deepspeed` group, placed under `ds_cfg` via the
# group@package syntax. The ZeRO-3 preset is kept as a commented-out alternative.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Additional search path for config groups.
hydra:
searchpath:
- file://conf/
# Storage roots; model_path_prefix and output_path_prefix are interpolated
# further down, data_path_prefix is not referenced elsewhere in this file.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# train_file presumably names the Hugging Face dataset codeparrot/apps (confirm
# how the "hf:" prefix is handled); dev_file/test_file are left null.
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Hydra-style `_target_` helper nodes that later sections pull in through
# ${...} interpolation.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model for DPO, built together with a reference model (ref_model) that
# is loaded from ${model_name_or_path}. beta is presumably the DPO temperature
# — confirm in models.llama.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.4
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
device_map: ${device_map}
pad_token_id: 100001
# Prompt pieces referenced by the read_tensor template; {question} is a
# placeholder that is presumably filled per example.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training data pipeline. As the targets suggest: read the APPS train split,
# attach the "pos"/"neg" fields from extra_file (matched problem_id <-> id),
# flatten them, then render prompt/chosen/reject strings from the template.
# NOTE(review): exact aligner semantics live in data.input_aligner — confirm there.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.clean.dpo_m3_low0.5.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
# Template units mix the top-level chat strings with per-example {pos}/{neg} slots.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# kv_mapping presumably renames item fields for the collator — confirm in MultiMappingDataset.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# The tokenizer is interpolated from the tokenizer_init node.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Base model checkpoint, parallel sizes and experiment naming.
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
# dp_size is left null; tp_size/pp_size are presumably tensor-/pipeline-parallel
# degrees — confirm in the trainer.
dp_size:
tp_size: 1
pp_size: 1
# exp_name embeds ${seed} and determines output_dir below.
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.pseudo-test-10.sc.dpo.H100.dp8.v1.1.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Evaluation flags are switched off for this run.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): adam_betas is "(0.9, 0.98)" here while ds_cfg.optimizer.params.betas
# is [ 0.9, 0.95 ]; confirm which one takes effect.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
# max_steps / warmup_steps of 0 presumably defer to num_train_epochs /
# warmup_proportion — confirm in the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered over the preset selected in `defaults` (_self_ is
# listed last there); batch size, accumulation steps, lr and weight decay are
# interpolated from the top-level keys of this file.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# fp16 mixed precision with bf16 off; per the DeepSpeed docs, loss_scale 0
# selects dynamic loss scaling.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Disabled alternative: offload optimizer state to CPU.
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Logging backend.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left null in the config; presumably populated at runtime — confirm in the trainer.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/dpo/pseudo-sc-dpo-v1.1-v100-tp8.yaml
================================================
# Hydra defaults list: the `default` option of the `hydra` group plus one
# DeepSpeed preset (ZeRO-1 with optimizer offload, by its name) from the
# `deepspeed` group, placed under `ds_cfg` via the group@package syntax.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Additional search path for config groups.
hydra:
searchpath:
- file://conf/
# Storage roots; model_path_prefix and output_path_prefix are interpolated
# further down, data_path_prefix is not referenced elsewhere in this file.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# train_file presumably names the Hugging Face dataset codeparrot/apps (confirm
# how the "hf:" prefix is handled); dev_file/test_file are left null.
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Hydra-style `_target_` helper nodes that later sections pull in through
# ${...} interpolation. device_map is defined here, but its references under
# model/ref_model are commented out in this file.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model for DPO from the models.llama_tp module, built together with a
# reference model (ref_model) loaded from ${model_name_or_path}. beta is
# presumably the DPO temperature — confirm in models.llama_tp.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.4
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: 100001
# Prompt pieces referenced by the read_tensor template; {question} is a
# placeholder that is presumably filled per example.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training data pipeline. As the targets suggest: read the APPS train split,
# attach the "pos"/"neg" fields from extra_file (matched problem_id <-> id),
# flatten them, then render prompt/chosen/reject strings from the template.
# NOTE(review): exact aligner semantics live in data.input_aligner — confirm there.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
# The active extra_file is the ".fix" variant; the earlier file is kept commented out.
# extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.clean.dpo_m3_low0.5.json
extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.clean.dpo_m3_low0.5.fix.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
# Template units mix the top-level chat strings with per-example {pos}/{neg} slots.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# kv_mapping presumably renames item fields for the collator — confirm in MultiMappingDataset.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# The tokenizer is interpolated from the tokenizer_init node.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Base model checkpoint, parallel sizes and experiment naming.
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
# dp_size is left null; tp_size/pp_size are presumably tensor-/pipeline-parallel
# degrees — confirm in the trainer.
dp_size:
tp_size: 8
pp_size: 1
# exp_name embeds ${seed} and determines output_dir below.
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.pseudo-test-10.sc.dpo.V100.tp8.dp8.v1.1.s${seed}.fix-rerun
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Evaluation flags are switched off for this run.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): adam_betas is "(0.9, 0.98)" here while ds_cfg.optimizer.params.betas
# is [ 0.9, 0.95 ]; confirm which one takes effect.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
# max_steps / warmup_steps of 0 presumably defer to num_train_epochs /
# warmup_proportion — confirm in the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered over the preset selected in `defaults` (_self_ is
# listed last there); batch size, accumulation steps, lr and weight decay are
# interpolated from the top-level keys of this file.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# fp16 mixed precision with bf16 off; per the DeepSpeed docs, loss_scale 0
# selects dynamic loss scaling.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Commented-out explicit CPU optimizer offload.
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Logging backend.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left null in the config; presumably populated at runtime — confirm in the trainer.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/dpo/pseudo-sc-dpo-v1.2-a100.yaml
================================================
# Hydra defaults list: the `default` option of the `hydra` group plus one
# DeepSpeed preset from the `deepspeed` group, placed under `ds_cfg` via the
# group@package syntax. The ZeRO-3 preset is kept as a commented-out alternative.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Additional search path for config groups.
hydra:
searchpath:
- file://conf/
# Storage roots; model_path_prefix and output_path_prefix are interpolated
# further down, data_path_prefix is not referenced elsewhere in this file.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# train_file presumably names the Hugging Face dataset codeparrot/apps (confirm
# how the "hf:" prefix is handled); dev_file/test_file are left null.
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Hydra-style `_target_` helper nodes that later sections pull in through
# ${...} interpolation.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model for DPO, built together with a reference model (ref_model) that
# is loaded from ${model_name_or_path}. beta is presumably the DPO temperature
# — confirm in models.llama.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.4
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
device_map: ${device_map}
pad_token_id: 100001
# Prompt pieces referenced by the read_tensor template; {question} is a
# placeholder that is presumably filled per example.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training data pipeline. As the targets suggest: read the APPS train split,
# attach the "pos"/"neg" fields from extra_file (matched problem_id <-> id),
# flatten them, then render prompt/chosen/reject strings from the template.
# NOTE(review): exact aligner semantics live in data.input_aligner — confirm there.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.clean.dpo_m6_low0.5.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
# Template units mix the top-level chat strings with per-example {pos}/{neg} slots.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# kv_mapping presumably renames item fields for the collator — confirm in MultiMappingDataset.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# The tokenizer is interpolated from the tokenizer_init node.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Base model checkpoint, parallel sizes and experiment naming.
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
# dp_size is left null; tp_size/pp_size are presumably tensor-/pipeline-parallel
# degrees — confirm in the trainer.
dp_size:
tp_size: 1
pp_size: 1
# exp_name embeds ${seed} and determines output_dir below.
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.pseudo-test-10.sc.dpo.A100.dp8.v1.2.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Evaluation flags are switched off for this run.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): adam_betas is "(0.9, 0.98)" here while ds_cfg.optimizer.params.betas
# is [ 0.9, 0.95 ]; confirm which one takes effect.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
# max_steps / warmup_steps of 0 presumably defer to num_train_epochs /
# warmup_proportion — confirm in the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered over the preset selected in `defaults` (_self_ is
# listed last there); batch size, accumulation steps, lr and weight decay are
# interpolated from the top-level keys of this file.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# fp16 mixed precision with bf16 off; per the DeepSpeed docs, loss_scale 0
# selects dynamic loss scaling.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Disabled alternative: offload optimizer state to CPU.
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Logging backend.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left null in the config; presumably populated at runtime — confirm in the trainer.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/dpo/pseudo-sc-dpo-v1.2-v100-tp8.yaml
================================================
# Hydra defaults list: the `default` option of the `hydra` group plus one
# DeepSpeed preset (ZeRO-1 with optimizer offload, by its name) from the
# `deepspeed` group, placed under `ds_cfg` via the group@package syntax.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Additional search path for config groups.
hydra:
searchpath:
- file://conf/
# Storage roots; model_path_prefix and output_path_prefix are interpolated
# further down, data_path_prefix is not referenced elsewhere in this file.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# train_file presumably names the Hugging Face dataset codeparrot/apps (confirm
# how the "hf:" prefix is handled); dev_file/test_file are left null.
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Hydra-style `_target_` helper nodes that later sections pull in through
# ${...} interpolation. device_map is defined here, but its references under
# model/ref_model are commented out in this file.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model for DPO from the models.llama_tp module, built together with a
# reference model (ref_model) loaded from ${model_name_or_path}. beta is
# presumably the DPO temperature — confirm in models.llama_tp.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.4
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: 100001
# Prompt pieces referenced by the read_tensor template; {question} is a
# placeholder that is presumably filled per example.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training data pipeline. As the targets suggest: read the APPS train split,
# attach the "pos"/"neg" fields from extra_file (matched problem_id <-> id),
# flatten them, then render prompt/chosen/reject strings from the template.
# NOTE(review): exact aligner semantics live in data.input_aligner — confirm there.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
# The active extra_file is the dpo_m6 variant; the dpo_m3 file is kept commented out.
# extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.clean.dpo_m3_low0.5.json
extra_file: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5/apps/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.clean.dpo_m6_low0.5.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
# Template units mix the top-level chat strings with per-example {pos}/{neg} slots.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# kv_mapping presumably renames item fields for the collator — confirm in MultiMappingDataset.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# The tokenizer is interpolated from the tokenizer_init node.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Base model checkpoint, parallel sizes and experiment naming.
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
# dp_size is left null; tp_size/pp_size are presumably tensor-/pipeline-parallel
# degrees — confirm in the trainer.
dp_size:
tp_size: 8
pp_size: 1
# exp_name embeds ${seed} and determines output_dir below.
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.pseudo-test-10.sc.dpo.V100.tp8.dp8.v1.2.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Evaluation flags are switched off for this run.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): adam_betas is "(0.9, 0.98)" here while ds_cfg.optimizer.params.betas
# is [ 0.9, 0.95 ]; confirm which one takes effect.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
# max_steps / warmup_steps of 0 presumably defer to num_train_epochs /
# warmup_proportion — confirm in the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered over the preset selected in `defaults` (_self_ is
# listed last there); batch size, accumulation steps, lr and weight decay are
# interpolated from the top-level keys of this file.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# fp16 mixed precision with bf16 off; per the DeepSpeed docs, loss_scale 0
# selects dynamic loss scaling.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Commented-out explicit CPU optimizer offload.
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Logging backend.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left null in the config; presumably populated at runtime — confirm in the trainer.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/sft/v1.0-a100.yaml
================================================
# Hydra defaults list: the `default` option of the `hydra` group plus one
# DeepSpeed preset (ZeRO-1 with optimizer offload, by its name) from the
# `deepspeed` group, placed under `ds_cfg` via the group@package syntax.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Additional search path for config groups.
hydra:
searchpath:
- file://conf/
# Local path roots; the blob-storage locations are kept as trailing comments.
# With output_path_prefix empty, output_dir resolves to a relative
# experiments/<exp_name>. data_path_prefix is not referenced elsewhere in this file.
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
# train_file presumably names the Hugging Face dataset codeparrot/apps (confirm
# how the "hf:" prefix is handled); dev_file/test_file are left null.
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Hydra-style `_target_` helper nodes that later sections pull in through
# ${...} interpolation.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Causal LM for supervised fine-tuning; the single-device device_map is left
# commented out.
model:
_target_: models.llama.LlamaForCausalLM.from_pretrained
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map:
# _target_: models.utils.return_single_device_map
# Prompt pieces referenced by the read_tensor template; {question} is a
# placeholder that is presumably filled per example.
# NOTE(review): chat_suffix is defined but not referenced by the template in
# this file — confirm whether the end-of-turn marker is added elsewhere.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training data pipeline. As the targets suggest: read the APPS train split,
# flatten the "solutions" field, then render prompt/chosen strings from the template.
# NOTE(review): exact aligner semantics live in data.input_aligner — confirm there.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsReader
split: train
aligner:
_target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "solutions" ]
mode: "multi"
# Template units mix the top-level chat strings with a per-example {solutions} slot.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
suffix: "{solutions}"
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{suffix}"
index_field: problem_id
# kv_mapping presumably renames item fields for the collator — confirm in MultiMappingDataset.
kv_mapping:
chosen: chosen
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# The tokenizer is interpolated from the tokenizer_init node.
collator:
_target_: data.general_collator.DPODataSFTCollator
tokenizer: ${tokenizer_init}
max_seq_length: 4096
# Dataloader
num_workers: 8
prefetch_factor: 2
# Base model checkpoint, parallel sizes and experiment naming.
# model_path_prefix already ends in "/", so the resolved path contains "//".
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
# dp_size is left null; tp_size/pp_size are presumably tensor-/pipeline-parallel
# degrees — confirm in the trainer.
dp_size:
tp_size: 1
pp_size: 1
# exp_name determines output_dir below.
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.A100.w8.v1.0
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Training and evaluation flags are all switched on; dev_file is null at the
# top of this file — confirm what data evaluation runs on.
do_train: True
evaluate_during_training: True
do_eval: True
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 2
per_gpu_eval_batch_size: 2
#learning_rate: 1e-4
#learning_rate: 5e-6
learning_rate: 2e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
# max_steps / warmup_steps of 0 presumably defer to num_train_epochs /
# warmup_proportion — confirm in the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 400
save_best: False
eval_steps: 400
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
# NOTE(review): fp16_bfloat16 is True here while torch_dtype.dtype is float16 —
# confirm the intended precision.
fp16_bfloat16: True
# Prediction config
# NOTE(review): metric is "acc" while post_process is a loss-only processor —
# confirm that this metric key is produced.
prediction_cfg:
metric: "acc"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# DeepSpeed settings layered over the preset selected in `defaults` (_self_ is
# listed last there). Only batch size and accumulation steps are set here; the
# optimizer / precision / offload overrides below are commented out, so those
# values come from the preset.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
# optimizer:
# type: AdamW
# params:
# lr: ${learning_rate}
# betas: [ 0.9, 0.95 ]
# weight_decay: ${weight_decay}
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Logging backend.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left null in the config; presumably populated at runtime — confirm in the trainer.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/code_gen/deepseek_coder/sft/v1.0-v100.yaml
================================================
# Hydra defaults list: the `default` option of the `hydra` group plus one
# DeepSpeed preset (ZeRO-2, by its name) from the `deepspeed` group, placed
# under `ds_cfg` via the group@package syntax.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero2
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Additional search path for config groups.
hydra:
searchpath:
- file://conf/
# Local path roots; the blob-storage locations are kept as trailing comments.
# With output_path_prefix empty, output_dir resolves to a relative
# experiments/<exp_name>. data_path_prefix is not referenced elsewhere in this file.
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
# train_file presumably names the Hugging Face dataset codeparrot/apps (confirm
# how the "hf:" prefix is handled); dev_file/test_file are left null.
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Hydra-style `_target_` helper nodes that later sections pull in through
# ${...} interpolation.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Causal LM for supervised fine-tuning; the single-device device_map is left
# commented out.
model:
_target_: models.llama.LlamaForCausalLM.from_pretrained
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map:
# _target_: models.utils.return_single_device_map
# Prompt pieces referenced by the read_tensor template; {question} is a
# placeholder that is presumably filled per example.
# NOTE(review): chat_suffix is defined but not referenced by the template in
# this file — confirm whether the end-of-turn marker is added elsewhere.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt: "{question}\n\nPlease write a program to solve the above problem under the given time constraints and memory limits."
# Training data pipeline. As the targets suggest: read the APPS train split,
# flatten the "solutions" field, then render prompt/chosen strings from the template.
# NOTE(review): exact aligner semantics live in data.input_aligner — confirm there.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsReader
split: train
aligner:
_target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "solutions" ]
mode: "multi"
# Template units mix the top-level chat strings with a per-example {solutions} slot.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
suffix: "{solutions}"
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{suffix}"
index_field: problem_id
# kv_mapping presumably renames item fields for the collator — confirm in MultiMappingDataset.
kv_mapping:
chosen: chosen
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# The tokenizer is interpolated from the tokenizer_init node.
collator:
_target_: data.general_collator.DPODataSFTCollator
tokenizer: ${tokenizer_init}
max_seq_length: 4096
# Dataloader
num_workers: 8
prefetch_factor: 2
# Base model checkpoint, parallel sizes and experiment naming.
# model_path_prefix already ends in "/", so the resolved path contains "//".
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
# dp_size is left null; tp_size/pp_size are presumably tensor-/pipeline-parallel
# degrees — confirm in the trainer.
dp_size:
tp_size: 1
pp_size: 1
# exp_name determines output_dir below.
exp_name: deepseek-coder-v1.5-ins.7b.apps.code_gen.V100.w8.v1.0
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Training and evaluation flags are all switched on; dev_file is null at the
# top of this file — confirm what data evaluation runs on.
do_train: True
evaluate_during_training: True
do_eval: True
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
#learning_rate: 5e-6
learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): adam_betas is "(0.9, 0.98)" here while ds_cfg.optimizer.params.betas
# is [ 0.9, 0.95 ]; confirm which one takes effect.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
# max_steps / warmup_steps of 0 presumably defer to num_train_epochs /
# warmup_proportion — confirm in the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: False
save_steps: 400
save_best: False
eval_steps: 400
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
# NOTE(review): metric is "acc" while post_process is a loss-only processor —
# confirm that this metric key is produced.
prediction_cfg:
metric: "acc"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# DeepSpeed settings layered over the preset selected in `defaults` (_self_ is
# listed last there); batch size, accumulation steps, lr and weight decay are
# interpolated from the top-level keys of this file.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas: [ 0.9, 0.95 ]
      weight_decay: ${weight_decay}
  bf16:
    enabled: False
  # fp16 mixed precision; per the DeepSpeed docs, loss_scale 0 selects dynamic
  # loss scaling.
  fp16:
    enabled: True
    auto_cast: False
    # Fixed: the key was previously spelled `loss_scale"` (stray double quote),
    # which is not a DeepSpeed fp16 option name.
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: False
    min_loss_scale: 1
  # Offload optimizer state to pinned CPU memory.
  zero_optimization:
    offload_optimizer:
      device: cpu
      pin_memory: True
# Logging backend.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left null in the config; presumably populated at runtime — confirm in the trainer.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/deprecated/sft-v1.0-v100-tp4.yaml
================================================
# Hydra defaults list: the `default` option of the `hydra` group plus one
# DeepSpeed preset (ZeRO-1 with optimizer offload, by its name) from the
# `deepspeed` group, placed under `ds_cfg` via the group@package syntax.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Additional search path for config groups.
hydra:
searchpath:
- file://conf/
# Path roots; output_path_prefix is interpolated into extra_file,
# model_name_or_path and output_dir further down.
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# train_file presumably names the Hugging Face dataset codeparrot/apps (confirm
# how the "hf:" prefix is handled); dev_file/test_file are left null.
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Hydra-style `_target_` helper nodes that later sections pull in through
# ${...} interpolation. device_map is defined here, but its references under
# model/ref_model are commented out.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model for DPO from the models.llama_tp module, built together with a
# reference model (ref_model) loaded from ${model_name_or_path}. beta is
# presumably the DPO temperature — confirm in models.llama_tp.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: 100001
# Prompt pieces referenced by the read_tensor template. Unlike an inline string,
# the prompt body is loaded from a text file via data.input_utils.read_text.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training data pipeline. As the targets suggest: read the APPS train split,
# attach the "pos"/"neg" fields from extra_file (matched problem_id <-> id),
# pair them up, then render prompt/chosen/reject strings from the template.
# NOTE(review): exact aligner semantics live in data.input_aligner — confirm there.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsReader
split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v2.0/apps/checkpoint-100/train.0shot.tem1.0.n10.v1.1.dpo_v1.0.json
- _target_: data.input_aligner.dpo_pair_aligner
pos_field: pos
neg_field: neg
# Template units mix the top-level chat strings with per-example {pos}/{neg} slots.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# kv_mapping presumably renames item fields for the collator — confirm in MultiMappingDataset.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# The tokenizer is interpolated from the tokenizer_init node.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that the tokenizer and the DPO reference model are loaded from
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v2.0/checkpoint-100/
pretrain:
resume:
dp_size:
tp_size: 4
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_dpo.V100.tp4.dp2.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 64
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings for this fp16 run. Micro-batch size, gradient accumulation,
# learning rate and weight decay are interpolated from the top-level keys above.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): the top-level `adam_betas` is "(0.9, 0.98)" while this
      # optimizer uses 0.95 for beta2 -- confirm which one the trainer applies.
      betas: [ 0.9, 0.95 ]
      weight_decay: ${weight_decay}
  bf16:
    enabled: False
  fp16:
    enabled: True
    auto_cast: False
    # 0 asks DeepSpeed for dynamic loss scaling; the keys below tune it.
    # The key must be spelled exactly `loss_scale` (no stray quote), otherwise
    # DeepSpeed receives an unknown key and silently falls back to its default.
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: False
    min_loss_scale: 1
#  zero_optimization:
#    offload_optimizer:
#      device: cpu
#      pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-4o-ps-test-pdpo-h100-v1.0.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model. `from_pretrained_with_ref_model` is given the policy's own
# keyword arguments plus the nested `ref_model` node.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.4
  # An SFT loss term is enabled alongside the DPO objective, weighted by 0.2.
  sft_loss: true
  sft_loss_weight: 0.2
  gradient_checkpointing: true
  attn_implementation: flash_attention_2
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map: ${device_map}
  # Reference model: loaded from ${model_name_or_path} with the same dtype,
  # attention implementation, device map and pad token as the policy.
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: flash_attention_2
    device_map: ${device_map}
    pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Response-level preference pairs ("old" data for the ReplayDataset below).
# Source records come from the APPS train split (read_fn); pos/neg responses are
# pulled from `extra_file` (aligner) and rendered with the chat template.
read_tensor_dpo:
  _target_: data.combine_dataset.MultiMappingDataset
  # NOTE(review): read_tensor_pdpo has no `file_path`; confirm whether this value
  # is actually used or overridden by the caller that instantiates the dataset.
  file_path: "codeparrot/apps"
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: True
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Presumably matches dataset `problem_id` against `id` in extra_file and
      # copies the `pos` / `neg` fields -- confirm in data.input_aligner.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: id
        extract_fields: [ "pos", "neg" ]
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.0shot.tem1.0.n10.v2.0.4o_pseudo_test_cases.dpo_m4_low0.5.json
      # Presumably flattens the pos/neg lists into individual pairs -- confirm.
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field: [ "pos", "neg" ]
        mode: multi
  template:
    _target_: data.input_utils.recompose_template
    # `units` are the building blocks; `compositions` concatenate them into the
    # three strings produced per example.
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: "{pos}"
      neg: "{neg}"
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: "{chat_prefix}{prompt}{chat_connect}"
      # Full responses: both sides are terminated with {chat_suffix}.
      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
      reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
  index_field: problem_id
  # Field renaming for the emitted examples; `problem_id` is paired with `index`.
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
# Preference pairs read from the "prefix ... completion ... prm_prefer_pair" file
# ("new" data for the ReplayDataset below). Same reader and template units as
# read_tensor_dpo, with two differences called out inline.
read_tensor_pdpo:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: True
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        # Difference 1: records in this extra_file are keyed by `problem_id`
        # (read_tensor_dpo uses `id`).
        extract_index_field: problem_id
        extract_fields: [ "pos", "neg" ]
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.4o_pseudo_test_case.prm_prefer_pair_low0.5_m4.0_avg.json
      # Presumably flattens the pos/neg lists into individual pairs -- confirm.
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field: [ "pos", "neg" ]
        mode: multi
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: "{pos}"
      neg: "{neg}"
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: "{chat_prefix}{prompt}{chat_connect}"
      # Difference 2: chosen/reject end at {pos}/{neg} with no {chat_suffix}.
      # NOTE(review): presumably because these are partial (prefix) responses
      # that must not be closed with the end-of-turn marker -- confirm.
      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
      reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
  index_field: problem_id
  # Field renaming for the emitted examples; `problem_id` is paired with `index`.
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
# Training dataset: a ReplayDataset combining the two dataset configs above.
read_tensor:
  _target_: data.combine_dataset.ReplayDataset
  # Hydra must not instantiate the nested nodes itself; ReplayDataset receives
  # the two dataset configs un-instantiated.
  _recursive_: False
  new_dataset_cfg: ${read_tensor_pdpo}
  old_dataset_cfg: ${read_tensor_dpo}
  # NOTE(review): presumably the old:new sampling ratio -- confirm in ReplayDataset.
  replay_ratio: 1.0
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that the tokenizer and the DPO reference model are loaded from
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_4o_ps_test_case.pdpo.H100.dp16.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides. The defaults list selects `train_hybrid_engine_zero1` for
# this node and lists `_self_` last, so the keys below take precedence over it.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # Precision and offload overrides below are disabled; kept for reference.
#  bf16:
#    enabled: False
#  fp16:
#    enabled: True
#    auto_cast: False
#    loss_scale: 0
#    initial_scale_power: 16
#    loss_scale_window: 1000
#    hysteresis: 2
#    consecutive_hysteresis: False
#    min_loss_scale: 1
#  zero_optimization:
#    offload_optimizer:
#      device: cpu
#      pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-4o-ps-test-pdpo-h100-v1.1.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model. `from_pretrained_with_ref_model` is given the policy's own
# keyword arguments plus the nested `ref_model` node.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.4
  # An SFT loss term is enabled alongside the DPO objective, weighted by 0.2.
  sft_loss: true
  sft_loss_weight: 0.2
  gradient_checkpointing: true
  attn_implementation: flash_attention_2
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map: ${device_map}
  # Reference model: loaded from ${model_name_or_path} with the same dtype,
  # attention implementation, device map and pad token as the policy.
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: flash_attention_2
    device_map: ${device_map}
    pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
#read_tensor_dpo:
# _target_: data.combine_dataset.MultiMappingDataset
# read_fn:
# _target_: data.apps.APPsWithFunctionName
# split: train
# train_sub_split: train
# use_starter_code: True
# aligner:
# _target_: data.input_aligner.concat_aligner
# aligners:
# - _target_: data.input_aligner.field_extract_aligner
# input_index_field: problem_id
# extract_index_field: id
# extract_fields: [ "pos", "neg" ]
# extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.0shot.tem1.0.n10.v2.0.4o_pseudo_test_cases.dpo_m4_low0.5.json
# - _target_: data.input_aligner.flat_aligner
# input_index_field: problem_id
# extract_field: [ "pos", "neg" ]
# mode: multi
# template:
# _target_: data.input_utils.recompose_template
# units:
# chat_prefix: ${chat_prefix}
# prompt: ${prompt}
# chat_connect: ${chat_connect}
# pos: "{pos}"
# neg: "{neg}"
# chat_suffix: ${chat_suffix}
# compositions:
# prompt: "{chat_prefix}{prompt}{chat_connect}"
# chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
# reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
# index_field: problem_id
# kv_mapping:
# chosen: chosen
# reject: reject
# problem_id: index
# prompt: prompt
# Training dataset: preference pairs read from the
# "prefix ... completion ... prm_prefer_pair" file, used directly (the replay
# wrapper and the response-level dataset are commented out in this file).
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: True
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Presumably matches dataset `problem_id` against `problem_id` in
      # extra_file and copies `pos` / `neg` -- confirm in data.input_aligner.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields: [ "pos", "neg" ]
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.4o_pseudo_test_case.prm_prefer_pair_low0.5_m4.0_avg.json
      # Presumably flattens the pos/neg lists into individual pairs -- confirm.
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field: [ "pos", "neg" ]
        mode: multi
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: "{pos}"
      neg: "{neg}"
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: "{chat_prefix}{prompt}{chat_connect}"
      # chosen/reject end at {pos}/{neg}; {chat_suffix} is defined but not used.
      # NOTE(review): presumably because these are partial (prefix) responses
      # that must not be closed with the end-of-turn marker -- confirm.
      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
      reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
  index_field: problem_id
  # Field renaming for the emitted examples; `problem_id` is paired with `index`.
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
#read_tensor:
# _target_: data.combine_dataset.ReplayDataset
# _recursive_: False
# new_dataset_cfg: ${read_tensor_pdpo}
# old_dataset_cfg: ${read_tensor_dpo}
# replay_ratio: 1.0
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that the tokenizer and the DPO reference model are loaded from
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_4o_ps_test_case.pdpo.H100.dp16.v1.1.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides. The defaults list selects `train_hybrid_engine_zero1` for
# this node and lists `_self_` last, so the keys below take precedence over it.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # Precision and offload overrides below are disabled; kept for reference.
#  bf16:
#    enabled: False
#  fp16:
#    enabled: True
#    auto_cast: False
#    loss_scale: 0
#    initial_scale_power: 16
#    loss_scale_window: 1000
#    hysteresis: 2
#    consecutive_hysteresis: False
#    min_loss_scale: 1
#  zero_optimization:
#    offload_optimizer:
#      device: cpu
#      pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-4o-self-mix-ps-test-v1.0-mi300x-dp16.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model. `from_pretrained_with_ref_model` is given the policy's own
# keyword arguments plus the nested `ref_model` node.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.1
  gradient_checkpointing: true
  # Alternative kept for reference: attn_implementation: "eager"
  attn_implementation: flash_attention_2
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map: ${device_map}
  # Reference model: loaded from ${model_name_or_path} with the same dtype,
  # attention implementation, device map and pad token as the policy.
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    # Alternative kept for reference: attn_implementation: "eager"
    attn_implementation: flash_attention_2
    device_map: ${device_map}
    pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset: response-level preference pairs. Source records come from
# the APPS train split (read_fn); pos/neg responses are pulled from `extra_file`
# (aligner) and rendered with the chat template.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: True
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Presumably matches dataset `problem_id` against `id` in extra_file and
      # copies the `pos` / `neg` fields -- confirm in data.input_aligner.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: id
        extract_fields: [ "pos", "neg" ]
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.0shot.tem1.0.n10.v2.0.4o.dpo_m4_low0.5.2k.self.dpo_m6_low0.5.2k.json
      # Presumably flattens the pos/neg lists into individual pairs -- confirm.
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field: [ "pos", "neg" ]
        mode: multi
  template:
    _target_: data.input_utils.recompose_template
    # `units` are the building blocks; `compositions` concatenate them into the
    # three strings produced per example.
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: "{pos}"
      neg: "{neg}"
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: "{chat_prefix}{prompt}{chat_connect}"
      # Full responses: both sides are terminated with {chat_suffix}.
      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
      reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
  index_field: problem_id
  # Field renaming for the emitted examples; `problem_id` is paired with `index`.
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that the tokenizer and the DPO reference model are loaded from
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_4o_self_ps_test_case.mix.dpo.MI300x.dp16.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 4
per_gpu_eval_batch_size: 4
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 2
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides. The defaults list selects `train_hybrid_engine_zero1` for
# this node and lists `_self_` last, so the keys below take precedence over it.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # Precision and offload overrides below are disabled; kept for reference.
#  bf16:
#    enabled: False
#  fp16:
#    enabled: True
#    auto_cast: False
#    loss_scale: 0
#    initial_scale_power: 16
#    loss_scale_window: 1000
#    hysteresis: 2
#    consecutive_hysteresis: False
#    min_loss_scale: 1
#  zero_optimization:
#    offload_optimizer:
#      device: cpu
#      pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-4o-self-mix-ps-test-v1.0-mi300x.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model. `from_pretrained_with_ref_model` is given the policy's own
# keyword arguments plus the nested `ref_model` node.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.1
  gradient_checkpointing: true
  # Alternative kept for reference: attn_implementation: "eager"
  attn_implementation: flash_attention_2
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map: ${device_map}
  # Reference model: loaded from ${model_name_or_path} with the same dtype,
  # attention implementation, device map and pad token as the policy.
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    # Alternative kept for reference: attn_implementation: "eager"
    attn_implementation: flash_attention_2
    device_map: ${device_map}
    pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset: response-level preference pairs. Source records come from
# the APPS train split (read_fn); pos/neg responses are pulled from `extra_file`
# (aligner) and rendered with the chat template.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: True
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Presumably matches dataset `problem_id` against `id` in extra_file and
      # copies the `pos` / `neg` fields -- confirm in data.input_aligner.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: id
        extract_fields: [ "pos", "neg" ]
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.0shot.tem1.0.n10.v2.0.4o.dpo_m4_low0.5.2k.self.dpo_m6_low0.5.2k.json
      # Presumably flattens the pos/neg lists into individual pairs -- confirm.
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field: [ "pos", "neg" ]
        mode: multi
  template:
    _target_: data.input_utils.recompose_template
    # `units` are the building blocks; `compositions` concatenate them into the
    # three strings produced per example.
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: "{pos}"
      neg: "{neg}"
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: "{chat_prefix}{prompt}{chat_connect}"
      # Full responses: both sides are terminated with {chat_suffix}.
      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
      reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
  index_field: problem_id
  # Field renaming for the emitted examples; `problem_id` is paired with `index`.
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that the tokenizer and the DPO reference model are loaded from
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_4o_self_ps_test_case.mix.dpo.MI300x.dp32.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 4
per_gpu_eval_batch_size: 4
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 1
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides. The defaults list selects `train_hybrid_engine_zero1` for
# this node and lists `_self_` last, so the keys below take precedence over it.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # Precision and offload overrides below are disabled; kept for reference.
#  bf16:
#    enabled: False
#  fp16:
#    enabled: True
#    auto_cast: False
#    loss_scale: 0
#    initial_scale_power: 16
#    loss_scale_window: 1000
#    hysteresis: 2
#    consecutive_hysteresis: False
#    min_loss_scale: 1
#  zero_optimization:
#    offload_optimizer:
#      device: cpu
#      pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-4o-self-mix-ps-test-v1.1-mi300x.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model. `from_pretrained_with_ref_model` is given the policy's own
# keyword arguments plus the nested `ref_model` node.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.1
  gradient_checkpointing: true
  # Alternative kept for reference: attn_implementation: "eager"
  attn_implementation: flash_attention_2
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map: ${device_map}
  # Reference model: loaded from ${model_name_or_path} with the same dtype,
  # attention implementation, device map and pad token as the policy.
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    # Alternative kept for reference: attn_implementation: "eager"
    attn_implementation: flash_attention_2
    device_map: ${device_map}
    pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset: response-level preference pairs. Source records come from
# the APPS train split (read_fn); pos/neg responses are pulled from `extra_file`
# (aligner) and rendered with the chat template.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: True
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Presumably matches dataset `problem_id` against `id` in extra_file and
      # copies the `pos` / `neg` fields -- confirm in data.input_aligner.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: id
        extract_fields: [ "pos", "neg" ]
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.0shot.tem1.0.n10.v2.0.4o.dpo_m4_low0.5.2k.self.dpo_m6_low0.5.2k.json
      # Presumably flattens the pos/neg lists into individual pairs -- confirm.
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field: [ "pos", "neg" ]
        mode: multi
  template:
    _target_: data.input_utils.recompose_template
    # `units` are the building blocks; `compositions` concatenate them into the
    # three strings produced per example.
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: "{pos}"
      neg: "{neg}"
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: "{chat_prefix}{prompt}{chat_connect}"
      # Full responses: both sides are terminated with {chat_suffix}.
      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
      reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
  index_field: problem_id
  # Field renaming for the emitted examples; `problem_id` is paired with `index`.
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that the tokenizer and the DPO reference model are loaded from
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_4o_self_ps_test_case.mix.dpo.MI300x.dp32.v1.1.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 4
per_gpu_eval_batch_size: 4
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 1
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-ps-pdpo-ctr-ts-num-v1.0-mi300x-dp32.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model, instantiated by Hydra through `_target_`. The nested
# `ref_model` node describes the DPO reference model, which is loaded from the
# same checkpoint (`model_name_or_path`).
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  torch_dtype: ${torch_dtype}
  device_map: ${device_map}
  attn_implementation: flash_attention_2
  gradient_checkpointing: true
  pad_token_id: 100001
  beta: 0.1
  # Optional SFT loss settings, currently disabled:
  # sft_loss: True
  # sft_loss_weight: 0.3
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    device_map: ${device_map}
    attn_implementation: flash_attention_2
    pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training data pipeline: APPS problems (read_fn) are joined with the
# `pos` / `neg` fields stored in `extra_file`, then rendered into the
# prompt / chosen / reject strings defined under `template.compositions`.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  index_field: problem_id
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: true
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Attach `pos` / `neg` from extra_file; rows are matched on problem_id
      # on both sides in this config.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields:
          - pos
          - neg
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.pseudo_test_case.ctr_ts_num.prefix_pass_num.fix_low0.5_m4.0_avg_low0.5_m4.0_avg.json
      # Presumably pairs each `pos` anchor with a sampled `neg`; see
      # data.input_aligner for the exact behaviour.
      - _target_: data.input_aligner.dpo_paired_random_choice_aligner
        anchor_field: pos
        paired_field: neg
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: "{pos}"
      neg: "{neg}"
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: "{chat_prefix}{prompt}{chat_connect}"
      # NOTE(review): unlike the sibling DPO configs in this directory,
      # chosen / reject do not end with {chat_suffix} here even though the
      # unit is defined above. Confirm this is intended for this dataset.
      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
      reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
  # Maps template outputs / dataset fields to the keys handed to the collator.
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint to initialize from (also used for the tokenizer and the reference model)
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
# Checkpoint resumption and parallel layout.
pretrain:
resume: latest
dp_size:
# NOTE(review): tp_size is 8 here, but `model._target_` points at
# `models.llama` (the configs in this directory that set tp_size > 1 use
# `models.llama_tp`) and exp_name advertises "dp32". Confirm that tensor
# parallelism is really intended for this run.
tp_size: 8
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.ctr-ts-num.MI300x.dp32.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 4
per_gpu_eval_batch_size: 4
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 2
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-pseudo-v1.0-a100.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model, instantiated by Hydra through `_target_`. The nested
# `ref_model` node describes the DPO reference model, which is loaded from the
# same checkpoint (`model_name_or_path`).
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  torch_dtype: ${torch_dtype}
  attn_implementation: flash_attention_2
  gradient_checkpointing: true
  pad_token_id: 100001
  beta: 0.1
  # No device_map for the policy model in this config:
  # device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    device_map: ${device_map}
    attn_implementation: flash_attention_2
    pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training data pipeline: APPS problems (read_fn) are joined with the
# `pos` / `neg` fields stored in `extra_file`, then rendered into the
# prompt / chosen / reject strings defined under `template.compositions`.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  index_field: problem_id
  read_fn:
    _target_: data.apps.APPsReader
    split: train
    train_sub_split: train
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Attach `pos` / `neg` from extra_file; the dataset's problem_id is
      # matched against the extra file's `id`.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: id
        extract_fields:
          - pos
          - neg
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.A100.w8.v3.0.s42/apps/checkpoint-400/train.0shot.tem1.0.n10.pseudo_test_case.exec_dpo.json
      # Builds the DPO pairs from the `pos` and `neg` fields.
      - _target_: data.input_aligner.dpo_pair_aligner
        pos_field: pos
        neg_field: neg
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: "{pos}"
      neg: "{neg}"
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: "{chat_prefix}{prompt}{chat_connect}"
      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
      reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
  # Maps template outputs / dataset fields to the keys handed to the collator.
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint to initialize from (also used for the tokenizer and the reference model)
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.A100.w8.v3.0.s42/checkpoint-400/
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_dpo.pseudo.A100.dp8.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-self-pseudo-v1.0-a100.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model, instantiated by Hydra through `_target_`. The nested
# `ref_model` node describes the DPO reference model, which is loaded from the
# same checkpoint (`model_name_or_path`).
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  torch_dtype: ${torch_dtype}
  attn_implementation: flash_attention_2
  gradient_checkpointing: true
  pad_token_id: 100001
  beta: 0.1
  # No device_map for the policy model in this config:
  # device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    device_map: ${device_map}
    attn_implementation: flash_attention_2
    pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training data pipeline: APPS problems (read_fn) are joined with the
# `pos` / `neg` fields stored in `extra_file`, then rendered into the
# prompt / chosen / reject strings defined under `template.compositions`.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  index_field: problem_id
  read_fn:
    _target_: data.apps.APPsReader
    split: train
    train_sub_split: train
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Attach `pos` / `neg` from extra_file; the dataset's problem_id is
      # matched against the extra file's `id`.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: id
        extract_fields:
          - pos
          - neg
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.A100.w8.v3.0.s42/apps/checkpoint-400/train.0shot.tem1.0.n10.self_s43_pseudo_cases.exec_dpo.json
      # Builds the DPO pairs from the `pos` and `neg` fields.
      - _target_: data.input_aligner.dpo_pair_aligner
        pos_field: pos
        neg_field: neg
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: "{pos}"
      neg: "{neg}"
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: "{chat_prefix}{prompt}{chat_connect}"
      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
      reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
  # Maps template outputs / dataset fields to the keys handed to the collator.
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint to initialize from (also used for the tokenizer and the reference model)
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.A100.w8.v3.0.s42/checkpoint-400/
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_dpo.self-pseudo.A100.dp8.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-self-pseudo-v1.0-v100.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model built from the tensor-parallel module (`models.llama_tp`),
# instantiated by Hydra through `_target_`. The nested `ref_model` node
# describes the DPO reference model, loaded from the same checkpoint
# (`model_name_or_path`). Both use eager attention in this config.
model:
  _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  torch_dtype: ${torch_dtype}
  attn_implementation: eager
  gradient_checkpointing: true
  pad_token_id: 100001
  beta: 0.1
  # No device_map for the policy model in this config:
  # device_map: ${device_map}
  ref_model:
    _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: eager
    pad_token_id: 100001
    # No device_map for the reference model either:
    # device_map: ${device_map}
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training data pipeline: APPS problems (read_fn) are joined with the
# `pos` / `neg` fields stored in `extra_file`, then rendered into the
# prompt / chosen / reject strings defined under `template.compositions`.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  index_field: problem_id
  read_fn:
    _target_: data.apps.APPsReader
    split: train
    train_sub_split: train
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Attach `pos` / `neg` from extra_file; the dataset's problem_id is
      # matched against the extra file's `id`.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: id
        extract_fields:
          - pos
          - neg
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.A100.w8.v3.0.s42/apps/checkpoint-400/train.0shot.tem1.0.n10.self_s43_pseudo_cases.exec.json
      # Builds the DPO pairs from the `pos` and `neg` fields.
      - _target_: data.input_aligner.dpo_pair_aligner
        pos_field: pos
        neg_field: neg
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: "{pos}"
      neg: "{neg}"
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: "{chat_prefix}{prompt}{chat_connect}"
      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
      reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
  # Maps template outputs / dataset fields to the keys handed to the collator.
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint to initialize from (also used for the tokenizer and the reference model)
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.A100.w8.v3.0.s42/checkpoint-400/
pretrain:
resume:
dp_size:
tp_size: 4
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_dpo.self-pseudo.V100.dp4.tp4.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags. This config targets V100 GPUs (see exp_name) and
# loads the weights as float16 (see torch_dtype). V100 has no native bfloat16
# support, so bf16 is switched off here and fp16 is requested explicitly in
# ds_cfg below, mirroring gpt4o-distil-v2.0-v100.yaml.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False

# Prediction config
prediction_cfg:
  metric: "loss"
  measure: -1
  best_checkpoint:
  best_result:

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# DeepSpeed settings layered on top of the `deepspeed@ds_cfg` default selected
# in `defaults` (this file is listed after it via `_self_`).
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas: [ 0.9, 0.95 ]
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # Explicitly override whatever precision the inherited default enables
  # (presumably bf16, since the sibling V100 config disables it the same way).
  bf16:
    enabled: False
  fp16:
    enabled: True
    auto_cast: False
    loss_scale: 0  # 0 selects dynamic loss scaling in DeepSpeed
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: False
    min_loss_scale: 1
#  zero_optimization:
#    offload_optimizer:
#      device: cpu
#      pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v1.0-H100-4o-ps-test.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model, instantiated by Hydra through `_target_`. The nested
# `ref_model` node describes the DPO reference model, which is loaded from the
# same checkpoint (`model_name_or_path`).
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  torch_dtype: ${torch_dtype}
  device_map: ${device_map}
  # Alternative kept for reference: attn_implementation: "eager"
  attn_implementation: flash_attention_2
  gradient_checkpointing: true
  pad_token_id: 100001
  beta: 0.1
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    device_map: ${device_map}
    # Alternative kept for reference: attn_implementation: "eager"
    attn_implementation: flash_attention_2
    pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training data pipeline: APPS problems (read_fn) are joined with the
# `pos` / `neg` fields stored in `extra_file`, then rendered into the
# prompt / chosen / reject strings defined under `template.compositions`.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  index_field: problem_id
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: true
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Attach `pos` / `neg` from extra_file; the dataset's problem_id is
      # matched against the extra file's `id`.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: id
        extract_fields:
          - pos
          - neg
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.0shot.tem1.0.n10.v2.0.4o_pseudo_test_cases.dpo_m4_low0.5.json
      # Presumably expands the per-problem `pos` / `neg` entries into
      # individual examples (`mode: multi`); see data.input_aligner.
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field:
          - pos
          - neg
        mode: multi
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: "{pos}"
      neg: "{neg}"
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: "{chat_prefix}{prompt}{chat_connect}"
      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
      reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
  # Maps template outputs / dataset fields to the keys handed to the collator.
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint to initialize from (also used for the tokenizer and the reference model)
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_4o_ps_test_case.dpo.H100.dp8.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v2.0-v100.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model built from the tensor-parallel module (`models.llama_tp`),
# instantiated by Hydra through `_target_`. The nested `ref_model` node
# describes the DPO reference model, loaded from the same checkpoint
# (`model_name_or_path`). Both use eager attention in this config.
model:
  _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  torch_dtype: ${torch_dtype}
  attn_implementation: eager
  gradient_checkpointing: true
  pad_token_id: 100001
  beta: 0.1
  # No device_map for the policy model in this config:
  # device_map: ${device_map}
  ref_model:
    _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: eager
    pad_token_id: 100001
    # No device_map for the reference model either:
    # device_map: ${device_map}
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training data pipeline: APPS problems (read_fn) are joined with the
# `pos` / `neg` fields stored in `extra_file`, then rendered into the
# prompt / chosen / reject strings defined under `template.compositions`.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  index_field: problem_id
  read_fn:
    _target_: data.apps.APPsReader
    split: train
    train_sub_split: train
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Attach `pos` / `neg` from extra_file; the dataset's problem_id is
      # matched against the extra file's `id`.
      # NOTE(review): extra_file lives under the "...V100.tp2.dp8.v2.4..."
      # run (checkpoint-400), while model_name_or_path points at
      # "...V100.w8.v2.0/checkpoint-100". Confirm the mismatch is intended.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: id
        extract_fields:
          - pos
          - neg
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.tp2.dp8.v2.4.s42.fix_all_padding.multi-node-testing/apps/checkpoint-400/train.0shot.tem1.0.n10.v1.1.dpo_v1.0.json
      # Builds the DPO pairs from the `pos` and `neg` fields.
      - _target_: data.input_aligner.dpo_pair_aligner
        pos_field: pos
        neg_field: neg
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: "{pos}"
      neg: "{neg}"
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: "{chat_prefix}{prompt}{chat_connect}"
      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
      reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
  # Maps template outputs / dataset fields to the keys handed to the collator.
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint to initialize from (also used for the tokenizer and the reference model)
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v2.0/checkpoint-100/
pretrain:
resume:
dp_size:
tp_size: 4
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_dpo.V100.tp4.dp8.v2.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v3.0-a100.yaml
================================================
# Hydra defaults list: composes the shared `hydra` defaults and mounts the
# `train_hybrid_engine_zero1` option of the `deepspeed` group at `ds_cfg`.
# `_self_` is listed last (see the linked Hydra page on composition order).
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds conf/ to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Path prefixes. Within this file only ${output_path_prefix} is interpolated.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Training data source; dev/test files are left unset.
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Referenced as ${torch_dtype} by `model` and `model.ref_model`.
# presumably return_torch_dtype maps the string to a torch dtype — TODO confirm
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is loaded from the same path as the checkpoint, padding on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Referenced as ${device_map} by ref_model (the model-level use is commented out).
device_map:
_target_: models.utils.return_single_device_map
# Policy model for DPO, built through `from_pretrained_with_ref_model`, with the
# reference model described by the `ref_model` entry below.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
# presumably the DPO loss coefficient — verify in models.llama
beta: 0.1
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
# Reference model: loaded from the same checkpoint path as the tokenizer.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
device_map: ${device_map}
pad_token_id: 100001
# Chat-format fragments; interpolated into the template `units` below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, loaded through data.input_utils.read_text.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsReader
split: train
train_sub_split: train
# Two aligners wrapped in concat_aligner (presumably applied in list order).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Pulls the "pos"/"neg" fields from extra_file; presumably rows are joined on
# problem_id (dataset side) == id (extra_file side) — TODO confirm.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.A100.w8.v3.0.s42/apps/checkpoint-400/train.0shot.tem1.0.n10.v1.1.dpo_v1.0.json
# presumably builds (pos, neg) preference pairs — verify in data.input_aligner
- _target_: data.input_aligner.dpo_pair_aligner
pos_field: pos
neg_field: neg
# `units` are the building blocks and `compositions` the strings assembled from
# them; "{pos}"/"{neg}" are kept as placeholders (presumably filled per sample).
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# Field-name mapping (mapping direction not visible here — confirm in the dataset class).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint path; also used as tokenizer_path and as the ref_model weights.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.A100.w8.v3.0.s42/checkpoint-400/
pretrain:
resume:
# Parallelism sizes (presumably data / tensor / pipeline parallel); dp_size is unset.
dp_size:
tp_size: 1
pp_size: 1
# Experiment name embeds the seed and determines output_dir.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_dpo.A100.dp8.v3.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ];
# confirm which of the two beta settings the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
# NOTE(review): both an epoch count and max_steps are set; presumably a positive
# max_steps takes precedence — verify in the trainer.
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint / evaluation intervals, in steps.
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# NOTE(review): torch_dtype.dtype in this file is float16 while fp16_bfloat16 is
# True — confirm the intended precision for this A100 run.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and DPO-specific post-processing of eval outputs.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Values layered onto the DeepSpeed config mounted at `ds_cfg` by the defaults
# list; batch size, accumulation, lr and weight decay are interpolated from the
# training hyper-parameters of this file.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The bf16/fp16 and optimizer-offload overrides below are disabled in this file;
# presumably precision is then taken from the composed DeepSpeed defaults.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Metric logging helper (Weights & Biases writer).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left unset here; presumably filled in by the training script at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v3.1-rm-a100.yaml
================================================
# Hydra defaults list: composes the shared `hydra` defaults and mounts the
# `train_hybrid_engine_zero1` option of the `deepspeed` group at `ds_cfg`.
# `_self_` is listed last (see the linked Hydra page on composition order).
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds conf/ to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Path prefixes. Within this file only ${output_path_prefix} is interpolated.
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Training data source; dev/test files are left unset.
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Referenced as ${torch_dtype} by `model` and `model.ref_model`.
# presumably return_torch_dtype maps the string to a torch dtype — TODO confirm
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is loaded from the same path as the checkpoint, padding on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Referenced as ${device_map} by ref_model (the model-level use is commented out).
device_map:
_target_: models.utils.return_single_device_map
# Policy model for DPO, built through `from_pretrained_with_ref_model`, with the
# reference model described by the `ref_model` entry below.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
# presumably the DPO loss coefficient — verify in models.llama
beta: 0.1
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
# Reference model: loaded from the same checkpoint path as the tokenizer.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
device_map: ${device_map}
pad_token_id: 100001
# Chat-format fragments; interpolated into the template `units` below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, loaded through data.input_utils.read_text.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsReader
split: train
train_sub_split: train
# Two aligners wrapped in concat_aligner (presumably applied in list order).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Pulls the "pos"/"neg" fields from extra_file; presumably rows are joined on
# problem_id (dataset side) == id (extra_file side) — TODO confirm.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.A100.w8.v3.0.s42/apps/checkpoint-400/train.0shot.tem1.0.n10.v1.1.lower-0.0.mar-1.0.json
# presumably flattens the "pos"/"neg" fields into one sample per entry ("multi"
# mode) — verify in data.input_aligner. Note the key here is `extract_field`
# (singular), unlike `extract_fields` on the aligner above.
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: "multi"
# `units` are the building blocks and `compositions` the strings assembled from
# them; "{pos}"/"{neg}" are kept as placeholders (presumably filled per sample).
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# Field-name mapping (mapping direction not visible here — confirm in the dataset class).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint path; also used as tokenizer_path and as the ref_model weights.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.A100.w8.v3.0.s42/checkpoint-400/
pretrain:
resume:
# Parallelism sizes (presumably data / tensor / pipeline parallel); dp_size is unset.
dp_size:
tp_size: 1
pp_size: 1
# Experiment name embeds the seed and determines output_dir.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_dpo.A100.dp8.v3.1.rm.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ];
# confirm which of the two beta settings the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
# NOTE(review): both an epoch count and max_steps are set; presumably a positive
# max_steps takes precedence — verify in the trainer.
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint / evaluation intervals, in steps.
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# NOTE(review): torch_dtype.dtype in this file is float16 while fp16_bfloat16 is
# True — confirm the intended precision for this A100 run.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and DPO-specific post-processing of eval outputs.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Values layered onto the DeepSpeed config mounted at `ds_cfg` by the defaults
# list; batch size, accumulation, lr and weight decay are interpolated from the
# training hyper-parameters of this file.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The bf16/fp16 and optimizer-offload overrides below are disabled in this file;
# presumably precision is then taken from the composed DeepSpeed defaults.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Metric logging helper (Weights & Biases writer).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left unset here; presumably filled in by the training script at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v3.2-v100.yaml
================================================
# Hydra defaults list: composes the shared `hydra` defaults and mounts the
# `train_hybrid_engine_zero1_optim_offload` option of the `deepspeed` group at
# `ds_cfg`. `_self_` is listed last (see the linked Hydra page on composition order).
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds conf/ to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Path prefixes. Within this file only ${output_path_prefix} is interpolated.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Training data source; dev/test files are left unset.
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Referenced as ${torch_dtype} by `model` and `model.ref_model`.
# presumably return_torch_dtype maps the string to a torch dtype — TODO confirm
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is loaded from the same path as the checkpoint, padding on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Defined but not referenced by an active key in this file (both ${device_map}
# uses in `model` are commented out).
device_map:
_target_: models.utils.return_single_device_map
# Policy model for DPO from the `models.llama_tp` module, built through
# `from_pretrained_with_ref_model`, with the reference model described by the
# `ref_model` entry below.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
# presumably the DPO loss coefficient — verify in models.llama_tp
beta: 0.1
gradient_checkpointing: True
# Flash attention is disabled in favour of the eager implementation.
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
# Reference model: loaded from the same checkpoint path as the tokenizer.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: 100001
# Chat-format fragments; interpolated into the template `units` below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, loaded through data.input_utils.read_text.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# Reader variant with function-name handling; starter code is enabled.
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
# Two aligners wrapped in concat_aligner (presumably applied in list order).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Pulls the "pos"/"neg" fields from extra_file; presumably rows are joined on
# problem_id (dataset side) == id (extra_file side) — TODO confirm.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.0shot.tem1.0.n10.dpo_v1.0.json
# presumably builds (pos, neg) preference pairs — verify in data.input_aligner
- _target_: data.input_aligner.dpo_pair_aligner
pos_field: pos
neg_field: neg
# `units` are the building blocks and `compositions` the strings assembled from
# them; "{pos}"/"{neg}" are kept as placeholders (presumably filled per sample).
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# Field-name mapping (mapping direction not visible here — confirm in the dataset class).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint path; also used as tokenizer_path and as the ref_model weights.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain:
resume:
# Parallelism sizes (presumably data / tensor / pipeline parallel); dp_size is unset.
dp_size:
tp_size: 8
pp_size: 1
# Experiment name embeds the seed and determines output_dir.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_dpo.V100.tp8dp2.v3.2.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 64
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ];
# confirm which of the two beta settings the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
# NOTE(review): both an epoch count and max_steps are set; presumably a positive
# max_steps takes precedence — verify in the trainer.
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint / evaluation intervals, in steps.
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Half-precision training with bfloat16 disabled, matching torch_dtype.dtype
# (float16) and the ds_cfg fp16 section of this file.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and DPO-specific post-processing of eval outputs.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Values layered onto the DeepSpeed config mounted at `ds_cfg` by the defaults
# list; batch size, accumulation, lr and weight decay are interpolated from the
# training hyper-parameters of this file.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# bf16 off, fp16 on; loss_scale: 0 selects DeepSpeed's dynamic loss scaling,
# configured by the remaining fp16 keys.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Explicit optimizer-offload override is disabled in this file.
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Metric logging helper (Weights & Biases writer).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left unset here; presumably filled in by the training script at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.0-v100-ps-test.yaml
================================================
# Hydra defaults list: composes the shared `hydra` defaults and mounts the
# `train_hybrid_engine_zero1_optim_offload` option of the `deepspeed` group at
# `ds_cfg`. `_self_` is listed last (see the linked Hydra page on composition order).
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds conf/ to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Path prefixes. Within this file only ${output_path_prefix} is interpolated.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Training data source; dev/test files are left unset.
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Referenced as ${torch_dtype} by `model` and `model.ref_model`.
# presumably return_torch_dtype maps the string to a torch dtype — TODO confirm
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is loaded from the same path as the checkpoint, padding on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Defined but not referenced by an active key in this file (both ${device_map}
# uses in `model` are commented out).
device_map:
_target_: models.utils.return_single_device_map
# Policy model for DPO from the `models.llama_tp` module, built through
# `from_pretrained_with_ref_model`, with the reference model described by the
# `ref_model` entry below.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
# presumably the DPO loss coefficient — verify in models.llama_tp
beta: 0.1
gradient_checkpointing: True
# Flash attention is disabled in favour of the eager implementation.
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
# Reference model: loaded from the same checkpoint path as the tokenizer.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: 100001
# Chat-format fragments; interpolated into the template `units` below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, loaded through data.input_utils.read_text.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# Reader variant with function-name handling; starter code is enabled.
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
# Two aligners wrapped in concat_aligner (presumably applied in list order).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Pulls the "pos"/"neg" fields from extra_file; presumably rows are joined on
# problem_id (dataset side) == id (extra_file side) — TODO confirm.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.clean.dpo_m6_low0.5.json
# presumably flattens the "pos"/"neg" fields into one sample per entry ("multi"
# mode) — verify in data.input_aligner. Note the key here is `extract_field`
# (singular), unlike `extract_fields` on the aligner above.
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
# `units` are the building blocks and `compositions` the strings assembled from
# them; "{pos}"/"{neg}" are kept as placeholders (presumably filled per sample).
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# Field-name mapping (mapping direction not visible here — confirm in the dataset class).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint path; also used as tokenizer_path and as the ref_model weights.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain:
resume:
# Parallelism sizes (presumably data / tensor / pipeline parallel); dp_size is unset.
dp_size:
tp_size: 8
pp_size: 1
# Experiment name embeds the seed and determines output_dir.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.dpo.V100.tp8dp8.v4.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ];
# confirm which of the two beta settings the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
# NOTE(review): both an epoch count and max_steps are set; presumably a positive
# max_steps takes precedence — verify in the trainer.
num_train_epochs: 2
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint / evaluation intervals, in steps.
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Half-precision training with bfloat16 disabled, matching torch_dtype.dtype
# (float16) and the ds_cfg fp16 section of this file.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and DPO-specific post-processing of eval outputs.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Values layered onto the DeepSpeed config mounted at `ds_cfg` by the defaults
# list; batch size, accumulation, lr and weight decay are interpolated from the
# training hyper-parameters of this file.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# bf16 off, fp16 on; loss_scale: 0 selects DeepSpeed's dynamic loss scaling,
# configured by the remaining fp16 keys.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Explicit optimizer-offload override is disabled in this file.
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Metric logging helper (Weights & Biases writer).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left unset here; presumably filled in by the training script at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.1-H100-ps-pdpo.yaml
================================================
# Hydra defaults list: composes the shared `hydra` defaults and mounts the
# `train_hybrid_engine_zero1_optim_offload` option of the `deepspeed` group at
# `ds_cfg`. `_self_` is listed last (see the linked Hydra page on composition order).
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds conf/ to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Path prefixes. Within this file only ${output_path_prefix} is interpolated.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Training data source; dev/test files are left unset.
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Referenced as ${torch_dtype} by `model` and `model.ref_model`; bfloat16 here.
# presumably return_torch_dtype maps the string to a torch dtype — TODO confirm
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer is loaded from the same path as the checkpoint, padding on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Referenced as ${device_map} by both `model` and `model.ref_model`.
device_map:
_target_: models.utils.return_single_device_map
# Policy model for DPO, built through `from_pretrained_with_ref_model`, with the
# reference model described by the `ref_model` entry below.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
# presumably the DPO loss coefficient — verify in models.llama
beta: 0.1
# Enables an additional SFT loss term with weight 0.2 (how it is combined with
# the DPO loss is defined in models.llama — not visible here).
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
# Reference model: loaded from the same checkpoint path as the tokenizer.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
device_map: ${device_map}
pad_token_id: 100001
# Chat-format fragments; interpolated into the template `units` below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, loaded through data.input_utils.read_text.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# Reader variant with function-name handling; starter code is enabled.
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
# Two aligners wrapped in concat_aligner (presumably applied in list order).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Pulls the "pos"/"neg" fields from extra_file; presumably rows are joined on
# problem_id on both sides — TODO confirm.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.pseudo_test_case.prefix_value_binary_pairs.json
# - _target_: data.input_aligner.flat_aligner
# input_index_field: problem_id
# extract_field: [ "pos", "neg" ]
# mode: multi
# presumably samples a `neg` partner for each `pos` anchor at random — verify
# in data.input_aligner.
- _target_: data.input_aligner.dpo_random_choice_aligner
anchor_field: pos
paired_field: neg
# `units` are the building blocks and `compositions` the strings assembled from
# them; "{pos}"/"{neg}" are kept as placeholders (presumably filled per sample).
# NOTE(review): the chat_suffix unit is defined but chosen/reject do not append
# {chat_suffix}; presumably pos/neg are partial response prefixes (cf. "prefix"
# in the extra_file name) — confirm this is intended.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# Field-name mapping (mapping direction not visible here — confirm in the dataset class).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint path; also used as tokenizer_path and as the ref_model weights.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain:
# presumably resumes from the most recent checkpoint in output_dir — verify in the trainer
resume: latest
# Parallelism sizes (presumably data / tensor / pipeline parallel); dp_size is unset.
dp_size:
tp_size: 1
pp_size: 1
# Experiment name embeds the seed and determines output_dir.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.H100.dp8.v4.1.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ];
# confirm which of the two beta settings the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
# max_steps is 0 here; presumably the schedule is then derived from
# num_train_epochs — verify in the trainer.
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint / evaluation intervals, in steps.
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# bfloat16 flag enabled, matching torch_dtype.dtype (bfloat16) in this file.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and DPO-specific post-processing of eval outputs.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Values layered onto the DeepSpeed config mounted at `ds_cfg` by the defaults
# list; batch size, accumulation, lr and weight decay are interpolated from the
# training hyper-parameters of this file.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The bf16/fp16 and optimizer-offload overrides below are disabled in this file;
# presumably precision is then taken from the composed DeepSpeed defaults.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Metric logging helper (Weights & Biases writer).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left unset here; presumably filled in by the training script at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.10-V100-ps-pdpo-rerun.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
read_tensor_pdpo:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.pseudo_test_case.prefix_pass_num.fix_low0.8_m4.0.json
- _target_: data.input_aligner.dpo_paired_random_choice_aligner
anchor_field: pos
paired_field: neg
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Dataset config referenced as `old_dataset_cfg` by `read_tensor` in this file:
# APPS train split combined with "pos"/"neg" fields from the
# orig_test_case binary-pairs file below.
read_tensor_dpo:
_target_: data.combine_dataset.MultiMappingDataset
file_path: "codeparrot/apps"
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Reads the "pos" and "neg" fields from extra_file; both index fields are
# problem_id.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.orig_test_case.prefix_value_binary_pairs.json
# Pairs the "pos" field (anchor) with the "neg" field; the exact sampling
# rule lives in dpo_random_choice_aligner -- see data.input_aligner.
- _target_: data.input_aligner.dpo_random_choice_aligner
anchor_field: pos
paired_field: neg
# Text template: `units` are the building blocks, `compositions` the strings
# built from them. `chat_suffix` is declared as a unit but is not used by any
# composition below.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# Field renaming for the emitted items (presumably source field -> output
# key; confirm against MultiMappingDataset).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Dataset entry for this run: a ReplayDataset built from `read_tensor_pdpo`
# (new) and `read_tensor_dpo` (old). `_recursive_: False` tells Hydra not to
# instantiate the nested configs, so they are handed over as configs.
# replay_ratio presumably sets how much of the old dataset is mixed in -- confirm
# in data.combine_dataset.ReplayDataset.
read_tensor:
_target_: data.combine_dataset.ReplayDataset
_recursive_: False
new_dataset_cfg: ${read_tensor_pdpo}
old_dataset_cfg: ${read_tensor_dpo}
replay_ratio: 1.0
dist_load_data_barrier: False
# Left empty (null).
extended_vocab:
# Data collator
# Uses the tokenizer built by `tokenizer_init`; sequence length is capped at
# 2560 (the 3072 setting is kept commented out).
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
# max_seq_length: 3072
max_seq_length: 2560
# Dataloader
num_workers: 8
prefetch_factor: 2
# Initial checkpoint: checkpoint-200 of the gpt4o.distil v3.1 run.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain:
resume: latest
# Parallelism sizes; dp_size is left empty (null).
dp_size:
tp_size: 8
pp_size: 1
#exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.H100.dp8.v4.4.s${seed}
# NOTE(review): the name says "tp16" while tp_size above is 8 -- confirm which
# is intended. output_dir below is derived from this name.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.V100.dp8tp16.v4.10.s${seed}.rerun # Fix sft loss nan error (non response)
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ];
# which of the two settings takes effect depends on the trainer -- confirm.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Precision flags: fp16 on, bfloat16 variant off (matches the fp16-enabled /
# bf16-disabled sections of ds_cfg in this file).
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
# Checkpoints are scored on "loss"; measure -1 presumably means lower is
# better -- confirm in the evaluator.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings. Batch size, accumulation steps, lr and weight decay are
# interpolated from the top-level keys so the two stay in sync.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
bf16:
enabled: False
# fp16 with dynamic loss scaling: in DeepSpeed, loss_scale 0 selects dynamic
# scaling and initial_scale_power 16 gives an initial scale of 2^16.
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Logging through WandbWriter; the mapping below is presumably
# logged name -> model output key -- confirm in general_util.tensorboard_helper.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left empty here; presumably filled in at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.2-H100-gd-pdpo.yaml
================================================
# Hydra defaults list: the `default` option of the `hydra` group, then the
# `train_hybrid_engine_zero1` option of the `deepspeed` group placed at the
# `ds_cfg` package. `_self_` is last, so keys in this file override them.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds conf/ to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots. Only output_path_prefix is interpolated elsewhere in this
# file (extra_file, model_name_or_path, output_dir).
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Training data identifier; dev_file and test_file are left empty (null).
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Weight dtype (bfloat16); reused below through ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer is loaded from the model checkpoint path, with left padding.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# DPO model built together with a reference model. `beta` is presumably the
# DPO temperature and `sft_loss`/`sft_loss_weight` an auxiliary SFT term
# weighted 0.2 -- confirm in models.llama.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
# Reference model: loaded from ${model_name_or_path} with the same dtype,
# attention implementation, device map and pad token id as above.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
device_map: ${device_map}
pad_token_id: 100001
# Chat template pieces, consumed by the `template` section of read_tensor.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt text is read from the file below.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset: APPS train split combined with "pos"/"neg" fields from the
# orig_test_case binary-pairs file, rendered into prompt / chosen / reject.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Reads the "pos" and "neg" fields from extra_file; both index fields are
# problem_id.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.orig_test_case.prefix_value_binary_pairs.json
# - _target_: data.input_aligner.flat_aligner
# input_index_field: problem_id
# extract_field: [ "pos", "neg" ]
# mode: multi
# Pairs the "pos" field (anchor) with the "neg" field; the exact sampling
# rule lives in dpo_random_choice_aligner -- see data.input_aligner.
- _target_: data.input_aligner.dpo_random_choice_aligner
anchor_field: pos
paired_field: neg
# Text template: `units` are the building blocks, `compositions` the strings
# built from them. `chat_suffix` is declared as a unit but is not used by any
# composition below.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# Field renaming for the emitted items (presumably source field -> output
# key; confirm against MultiMappingDataset).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
# Left empty (null).
extended_vocab:
# Data collator
# Uses the tokenizer built by `tokenizer_init`; sequence length is capped at 3072.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Initial checkpoint: checkpoint-200 of the gpt4o.distil v3.1 run. Also
# referenced by tokenizer_init.tokenizer_path and the reference model.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain:
resume: latest
# Parallelism sizes; dp_size is left empty (null).
dp_size:
tp_size: 1
pp_size: 1
# NOTE(review): the name says "A100" while this config file is named
# ...-H100-...; left unchanged because output_dir is derived from it.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft.process-dpo.A100.dp8.v4.2.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 2
per_gpu_eval_batch_size: 2
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ];
# which of the two settings takes effect depends on the trainer -- confirm.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Precision flags: fp16_bfloat16 is on, in line with torch_dtype bfloat16 in
# this file; how the trainer combines it with `fp16` should be confirmed there.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
# Checkpoints are scored on "loss"; measure -1 presumably means lower is
# better -- confirm in the evaluator.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides on top of the `deepspeed@ds_cfg` entry in `defaults`.
# Batch size, accumulation steps, lr and weight decay are interpolated from
# the top-level keys so the two stay in sync.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The bf16/fp16 and optimizer-offload sections are disabled here, so those
# settings are whatever the composed deepspeed config provides.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Logging through WandbWriter; no batch or output keys are selected.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left empty here; presumably filled in at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.2-v100-gd-pdpo.yaml
================================================
# Hydra defaults list: the `default` option of the `hydra` group, then the
# `train_hybrid_engine_zero1` option of the `deepspeed` group placed at the
# `ds_cfg` package. `_self_` is last, so keys in this file override them.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds conf/ to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots. Only output_path_prefix is interpolated elsewhere in this
# file (extra_file, model_name_or_path, output_dir).
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Training data identifier; dev_file and test_file are left empty (null).
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Weight dtype (float16); reused below through ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is loaded from the model checkpoint path, with left padding.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Defined but not referenced: both device_map usages below are commented out.
device_map:
_target_: models.utils.return_single_device_map
# DPO model built together with a reference model, using the models.llama_tp
# implementation with eager attention. `beta` is presumably the DPO
# temperature and `sft_loss`/`sft_loss_weight` an auxiliary SFT term weighted
# 0.2 -- confirm in models.llama_tp.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
# Reference model: loaded from ${model_name_or_path} with the same dtype,
# attention implementation and pad token id as above.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: 100001
# Chat template pieces, consumed by the `template` section of read_tensor.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt text is read from the file below.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset: APPS train split combined with "pos"/"neg" fields from the
# orig_test_case binary-pairs file, rendered into prompt / chosen / reject.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Reads the "pos" and "neg" fields from extra_file; both index fields are
# problem_id.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.orig_test_case.prefix_value_binary_pairs.json
# - _target_: data.input_aligner.flat_aligner
# input_index_field: problem_id
# extract_field: [ "pos", "neg" ]
# mode: multi
# Pairs the "pos" field (anchor) with the "neg" field; the exact sampling
# rule lives in dpo_random_choice_aligner -- see data.input_aligner.
- _target_: data.input_aligner.dpo_random_choice_aligner
anchor_field: pos
paired_field: neg
# Text template: `units` are the building blocks, `compositions` the strings
# built from them. `chat_suffix` is declared as a unit but is not used by any
# composition below.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# Field renaming for the emitted items (presumably source field -> output
# key; confirm against MultiMappingDataset).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
# Left empty (null).
extended_vocab:
# Data collator
# Uses the tokenizer built by `tokenizer_init`; sequence length is capped at 3072.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Initial checkpoint: checkpoint-200 of the gpt4o.distil v3.1 run. Also
# referenced by tokenizer_init.tokenizer_path and the reference model.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain:
resume: latest
# Parallelism sizes; dp_size is left empty (null).
dp_size:
tp_size: 8
pp_size: 1
# output_dir below is derived from this name.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft.process-dpo.V100.tp8dp8.v4.2.s${seed}.rerun
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ];
# which of the two settings takes effect depends on the trainer -- confirm.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Precision flags: fp16 on, bfloat16 variant off (matches torch_dtype float16
# and the fp16-enabled / bf16-disabled sections of ds_cfg in this file).
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
# Checkpoints are scored on "loss"; measure -1 presumably means lower is
# better -- confirm in the evaluator.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides on top of the `deepspeed@ds_cfg` entry in `defaults`.
# Batch size, accumulation steps, lr and weight decay are interpolated from
# the top-level keys so the two stay in sync.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
bf16:
enabled: False
# fp16 with dynamic loss scaling: in DeepSpeed, loss_scale 0 selects dynamic
# scaling and initial_scale_power 16 gives an initial scale of 2^16.
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Optimizer offload is kept disabled.
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Logging through WandbWriter; the mapping below is presumably
# logged name -> model output key -- confirm in general_util.tensorboard_helper.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left empty here; presumably filled in at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.3-H100-ps-pdpo.yaml
================================================
# Hydra defaults list: the `default` option of the `hydra` group, then the
# `train_hybrid_engine_zero1_optim_offload` option of the `deepspeed` group
# placed at the `ds_cfg` package. `_self_` is last, so keys in this file
# override them.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds conf/ to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots. Only output_path_prefix is interpolated elsewhere in this
# file (extra_file, model_name_or_path, output_dir).
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Training data identifier; dev_file and test_file are left empty (null).
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Weight dtype (bfloat16); reused below through ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer is loaded from the model checkpoint path, with left padding.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# DPO model built together with a reference model. `beta` is presumably the
# DPO temperature and `sft_loss`/`sft_loss_weight` an auxiliary SFT term
# weighted 0.2 -- confirm in models.llama.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
# Reference model: loaded from ${model_name_or_path} with the same dtype,
# attention implementation, device map and pad token id as above.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
device_map: ${device_map}
pad_token_id: 100001
# Chat template pieces, consumed by the `template` section of read_tensor.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt text is read from the file below.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset: APPS train split combined with "pos"/"neg" fields from the
# pseudo_test_case result file (fix_low0.5_m6.0), rendered into
# prompt / chosen / reject.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Reads the "pos" and "neg" fields from extra_file; both index fields are
# problem_id.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.pseudo_test_case.prefix_pass_num.fix_low0.5_m6.0.json
# Pairs the "pos" field (anchor) with the "neg" field; the exact sampling
# rule lives in dpo_paired_random_choice_aligner -- see data.input_aligner.
- _target_: data.input_aligner.dpo_paired_random_choice_aligner
anchor_field: pos
paired_field: neg
# Text template: `units` are the building blocks, `compositions` the strings
# built from them. `chat_suffix` is declared as a unit but is not used by any
# composition below.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# Field renaming for the emitted items (presumably source field -> output
# key; confirm against MultiMappingDataset).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
# Left empty (null).
extended_vocab:
# Data collator
# Uses the tokenizer built by `tokenizer_init`; sequence length is capped at 3072.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Initial checkpoint: checkpoint-200 of the gpt4o.distil v3.1 run. Also
# referenced by tokenizer_init.tokenizer_path and the reference model.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain:
resume: latest
# Parallelism sizes; dp_size is left empty (null).
dp_size:
tp_size: 1
pp_size: 1
# output_dir below is derived from this name.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.H100.dp8.v4.3.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ];
# which of the two settings takes effect depends on the trainer -- confirm.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Precision flags: fp16_bfloat16 is on, in line with torch_dtype bfloat16 in
# this file; how the trainer combines it with `fp16` should be confirmed there.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
# Checkpoints are scored on "loss"; measure -1 presumably means lower is
# better -- confirm in the evaluator.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides on top of the `deepspeed@ds_cfg` entry in `defaults`.
# Batch size, accumulation steps, lr and weight decay are interpolated from
# the top-level keys so the two stay in sync.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The bf16/fp16 and optimizer-offload sections are disabled here, so those
# settings are whatever the composed deepspeed config provides.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Logging through WandbWriter; no batch or output keys are selected.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left empty here; presumably filled in at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.3-V100-ps-pdpo-rerun.yaml
================================================
# Hydra defaults list: the `default` option of the `hydra` group, then the
# `train_hybrid_engine_zero1` option of the `deepspeed` group placed at the
# `ds_cfg` package. `_self_` is last, so keys in this file override them.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds conf/ to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots. Only output_path_prefix is interpolated elsewhere in this
# file (extra_file, model_name_or_path, output_dir).
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Training data identifier; dev_file and test_file are left empty (null).
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Weight dtype (float16); reused below through ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is loaded from the model checkpoint path, with left padding.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Defined but not referenced: both device_map usages below are commented out.
device_map:
_target_: models.utils.return_single_device_map
# DPO model built together with a reference model, using the models.llama_tp
# implementation with eager attention. `beta` is presumably the DPO
# temperature and `sft_loss`/`sft_loss_weight` an auxiliary SFT term weighted
# 0.2 -- confirm in models.llama_tp.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
# Reference model: loaded from ${model_name_or_path} with the same dtype,
# attention implementation and pad token id as above.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: 100001
# Chat template pieces, consumed by the `template` section of read_tensor.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt text is read from the file below.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset: APPS train split combined with "pos"/"neg" fields from the
# pseudo_test_case result file (fix_low0.5_m6.0), rendered into
# prompt / chosen / reject.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Reads the "pos" and "neg" fields from extra_file; both index fields are
# problem_id.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.pseudo_test_case.prefix_pass_num.fix_low0.5_m6.0.json
# Pairs the "pos" field (anchor) with the "neg" field; the exact sampling
# rule lives in dpo_paired_random_choice_aligner -- see data.input_aligner.
- _target_: data.input_aligner.dpo_paired_random_choice_aligner
anchor_field: pos
paired_field: neg
# Text template: `units` are the building blocks, `compositions` the strings
# built from them. `chat_suffix` is declared as a unit but is not used by any
# composition below.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# Field renaming for the emitted items (presumably source field -> output
# key; confirm against MultiMappingDataset).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
# Left empty (null).
extended_vocab:
# Data collator
# Uses the tokenizer built by `tokenizer_init`; sequence length is capped at
# 2560 (the 3072 setting is kept commented out).
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
# max_seq_length: 3072
max_seq_length: 2560
# Dataloader
num_workers: 8
prefetch_factor: 2
# Initial checkpoint: checkpoint-200 of the gpt4o.distil v3.1 run. Also
# referenced by tokenizer_init.tokenizer_path and the reference model.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain:
resume: latest
# Parallelism sizes; dp_size is left empty (null).
dp_size:
tp_size: 8
pp_size: 1
#exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.H100.dp8.v4.3.s${seed}
# output_dir below is derived from this name.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.V100.tp8dp8.v4.3.s${seed}.rerun # Fix sft loss nan error (non response)
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ];
# which of the two settings takes effect depends on the trainer -- confirm.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Precision flags: fp16 on, bfloat16 variant off (matches torch_dtype float16
# and the fp16-enabled / bf16-disabled sections of ds_cfg in this file).
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
# Checkpoints are scored on "loss"; measure -1 presumably means lower is
# better -- confirm in the evaluator.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides on top of the `deepspeed@ds_cfg` entry in `defaults`.
# Batch size, accumulation steps, lr and weight decay are interpolated from
# the top-level keys so the two stay in sync.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
bf16:
enabled: False
# fp16 with dynamic loss scaling: in DeepSpeed, loss_scale 0 selects dynamic
# scaling and initial_scale_power 16 gives an initial scale of 2^16.
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Logging through WandbWriter; the mapping below is presumably
# logged name -> model output key -- confirm in general_util.tensorboard_helper.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left empty here; presumably filled in at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.3-v100-gd-pdpo.yaml
================================================
# Hydra defaults list: the `default` option of the `hydra` group, then the
# `train_hybrid_engine_zero1_optim_offload` option of the `deepspeed` group
# placed at the `ds_cfg` package. `_self_` is last, so keys in this file
# override them.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds conf/ to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots. Only output_path_prefix is interpolated elsewhere in this
# file (extra_file, model_name_or_path, output_dir).
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Training data identifier; dev_file and test_file are left empty (null).
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Weight dtype (float16); reused below through ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is loaded from the model checkpoint path, with left padding.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Defined but not referenced: both device_map usages below are commented out.
device_map:
_target_: models.utils.return_single_device_map
# DPO model built together with a reference model, using the models.llama_tp
# implementation with eager attention. The sft_loss options are commented out
# in this variant, so the target's own defaults apply. `beta` is presumably
# the DPO temperature -- confirm in models.llama_tp.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
# sft_loss: True
# sft_loss_weight: 0.2
gradient_checkpointing: True
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
# Reference model: loaded from ${model_name_or_path} with the same dtype,
# attention implementation and pad token id as above.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: 100001
# Chat template pieces, consumed by the `template` section of read_tensor.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt text is read from the file below.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset: APPS train split combined with "pos"/"neg" fields from the
# orig_test_case binary-pairs file, rendered into prompt / chosen / reject.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Reads the "pos" and "neg" fields from extra_file; both index fields are
# problem_id.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.orig_test_case.prefix_value_binary_pairs.json
# - _target_: data.input_aligner.flat_aligner
# input_index_field: problem_id
# extract_field: [ "pos", "neg" ]
# mode: multi
# Pairs the "pos" field (anchor) with the "neg" field; the exact sampling
# rule lives in dpo_random_choice_aligner -- see data.input_aligner.
- _target_: data.input_aligner.dpo_random_choice_aligner
anchor_field: pos
paired_field: neg
# Text template: `units` are the building blocks, `compositions` the strings
# built from them. `chat_suffix` is declared as a unit but is not used by any
# composition below.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# Field renaming for the emitted items (presumably source field -> output
# key; confirm against MultiMappingDataset).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
# Left empty (null).
extended_vocab:
# Data collator
# Uses the tokenizer built by `tokenizer_init`; sequence length is capped at 3072.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Initial checkpoint: checkpoint-200 of the gpt4o.distil v3.1 run. Also
# referenced by tokenizer_init.tokenizer_path and the reference model.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain:
resume: latest
# Parallelism sizes; dp_size is left empty (null).
dp_size:
tp_size: 8
pp_size: 1
# output_dir below is derived from this name.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft.process-dpo.V100.tp8dp8.v4.3.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ];
# which of the two settings takes effect depends on the trainer -- confirm.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Precision flags: fp16 on, bfloat16 variant off (matches torch_dtype float16
# and the fp16-enabled / bf16-disabled sections of ds_cfg in this file).
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
# Checkpoints are scored on "loss"; measure -1 presumably means lower is
# better -- confirm in the evaluator.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides on top of the `deepspeed@ds_cfg` entry in `defaults`.
# Batch size, accumulation steps, lr and weight decay are interpolated from
# the top-level keys so the two stay in sync.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
bf16:
enabled: False
# fp16 with dynamic loss scaling: in DeepSpeed, loss_scale 0 selects dynamic
# scaling and initial_scale_power 16 gives an initial scale of 2^16.
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# No offload override here; any offload settings come from the deepspeed
# option selected in `defaults`.
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Logging through WandbWriter; the mapping below is presumably
# logged name -> model output key -- confirm in general_util.tensorboard_helper.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left empty here; presumably filled in at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.4-H100-ps-pdpo.yaml
================================================
# Hydra defaults list: the `default` option of the `hydra` group, then the
# `train_hybrid_engine_zero1_optim_offload` option of the `deepspeed` group
# placed at the `ds_cfg` package. `_self_` is last, so keys in this file
# override them.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds conf/ to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots; output_path_prefix is interpolated by extra_file,
# model_name_or_path and output_dir further down.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Training data identifier; dev_file and test_file are left empty (null).
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
# Weight dtype (bfloat16); reused below through ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer is loaded from the model checkpoint path, with left padding.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# DPO model built together with a reference model. `beta` is presumably the
# DPO temperature and `sft_loss`/`sft_loss_weight` an auxiliary SFT term
# weighted 0.2 -- confirm in models.llama.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
# Reference model: loaded from ${model_name_or_path} with the same dtype,
# attention implementation, device map and pad token id as above.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
device_map: ${device_map}
pad_token_id: 100001
# Chat template pieces, consumed by the `template` section of read_tensor.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Prompt text is read from the file below.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset: APPS train split combined with "pos"/"neg" fields from the
# pseudo_test_case result file (fix_low0.8_m4.0), rendered into
# prompt / chosen / reject.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Reads the "pos" and "neg" fields from extra_file; both index fields are
# problem_id.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.pseudo_test_case.prefix_pass_num.fix_low0.8_m4.0.json
# Pairs the "pos" field (anchor) with the "neg" field; the exact sampling
# rule lives in dpo_paired_random_choice_aligner -- see data.input_aligner.
- _target_: data.input_aligner.dpo_paired_random_choice_aligner
anchor_field: pos
paired_field: neg
# Text template: `units` are the building blocks, `compositions` the strings
# built from them. `chat_suffix` is declared as a unit but is not used by any
# composition below.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# Field renaming for the emitted items (presumably source field -> output
# key; confirm against MultiMappingDataset).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
# Left empty (null).
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint directory referenced as ${model_name_or_path} by the tokenizer and the reference model
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain:
resume: latest
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.H100.dp8.v4.4.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.4-V100-ps-pdpo-rerun.yaml
================================================
# Hydra composition: base Hydra settings first, then the DeepSpeed preset
# mounted at `ds_cfg`, then this file last so its own values take precedence.
defaults:
  - hydra: default
  # Alternative preset: - deepspeed@ds_cfg: train_hybrid_engine_zero3
  - deepspeed@ds_cfg: train_hybrid_engine_zero1
  - _self_  # composition order: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

hydra:
  searchpath:
    - file://conf/

# Storage roots; `output_path_prefix` is interpolated into the checkpoint and
# output paths further down in this file.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

train_file: 'hf:codeparrot/apps'
dev_file: null
test_file: null

# Model weight dtype, produced by the helper named in `_target_`.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: float16

tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left

device_map:
  _target_: models.utils.return_single_device_map
# DPO model factory. The nested `ref_model` entry configures the reference
# model, loaded from ${model_name_or_path}.
model:
  _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.1
  sft_loss: true
  sft_loss_weight: 0.2
  gradient_checkpointing: true
  # Alternative: attn_implementation: "flash_attention_2"
  attn_implementation: 'eager'
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  # Disabled: device_map: ${device_map}
  ref_model:
    _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    # Alternative: attn_implementation: "flash_attention_2"
    attn_implementation: 'eager'
    # Disabled: device_map: ${device_map}
    pad_token_id: 100001
# Prompt scaffolding strings; interpolated into `read_tensor.template.units`.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Training set: APPS train split, joined on `problem_id` with the pos/neg
# fields of `extra_file`, paired for DPO, then rendered through `template`.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: true
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields:
          - pos
          - neg
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.pseudo_test_case.prefix_pass_num.fix_low0.8_m4.0.json
      - _target_: data.input_aligner.dpo_paired_random_choice_aligner
        anchor_field: pos
        paired_field: neg
  template:
    _target_: data.input_utils.recompose_template
    # NOTE(review): `chat_suffix` is declared as a unit but no composition
    # below uses it — confirm this is intended.
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt

dist_load_data_barrier: false
extended_vocab: null

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 3072

# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint directory that `tokenizer_init` and `model.ref_model` load from
# through ${model_name_or_path}.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain: null
resume: latest

# Parallel layout
dp_size: null
tp_size: 8
pp_size: 1

# Previous name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.H100.dp8.v4.4.s${seed}
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.V100.dp8tp8.v4.4.s${seed}.rerun  # Fix sft loss nan error (non response)
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: 'checkpoint-*'

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Commented-out alternatives for learning_rate: 1e-4, 2e-5
learning_rate: 5e-6
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
# Alternative: adam_betas: "(0.9, 0.999)"
# Alternative: max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0

# Optimizer
optimizer: null
use_nvlamb: null
bit_training: null

logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
# This run is float16 throughout (`torch_dtype.dtype: float16`,
# `ds_cfg.bf16.enabled: False`, `ds_cfg.fp16.enabled: True`), so the bfloat16
# flag is off, matching the sibling V100 configs (v4.5 / v4.6).
fp16_bfloat16: false
# Prediction config
prediction_cfg:
  metric: 'loss'
  measure: -1
  best_checkpoint: null
  best_result: null

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered on the preset chosen in `defaults`.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level `adam_betas` "(0.9, 0.98)" —
      # confirm which value the trainer actually uses.
      betas: [0.9, 0.95]
      weight_decay: ${weight_decay}
  steps_per_print: 1
  bf16:
    enabled: false
  fp16:
    enabled: true
    auto_cast: false
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: false
    min_loss_scale: 1
# Metric logging: maps logged metric names to keys of the model outputs.
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    'train/chosen_reward': chosen_reward
    'train/rejected_reward': rejected_reward

# Temporary variables
n_gpu: null
device: null
train_batch_size: null
eval_batch_size: null
world_size: null
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.5-A100-ps-pdpo.yaml
================================================
# Hydra composition: base Hydra settings first, then the DeepSpeed preset
# mounted at `ds_cfg`, then this file last so its own values take precedence.
defaults:
  - hydra: default
  # Alternative preset: - deepspeed@ds_cfg: train_hybrid_engine_zero3
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
  - _self_  # composition order: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

hydra:
  searchpath:
    - file://conf/

# Storage roots; `output_path_prefix` is interpolated into the checkpoint and
# output paths further down in this file.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

train_file: 'hf:codeparrot/apps'
dev_file: null
test_file: null

# Model weight dtype, produced by the helper named in `_target_`.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16

tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left

device_map:
  _target_: models.utils.return_single_device_map
# DPO model factory. The nested `ref_model` entry configures the reference
# model, loaded from ${model_name_or_path}.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.4
  sft_loss: true
  sft_loss_weight: 0.2
  gradient_checkpointing: true
  attn_implementation: 'flash_attention_2'
  # Alternative: attn_implementation: "eager"
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: 'flash_attention_2'
    # Alternative: attn_implementation: "eager"
    device_map: ${device_map}
    pad_token_id: 100001
# Prompt scaffolding strings; interpolated into `read_tensor.template.units`.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Training set: APPS train split, joined on `problem_id` with the pos/neg
# fields of `extra_file`, paired for DPO, then rendered through `template`.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: true
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields:
          - pos
          - neg
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.pseudo_test_case.prefix_pass_num.fix_low0.8_m4.0.json
      - _target_: data.input_aligner.dpo_paired_random_choice_aligner
        anchor_field: pos
        paired_field: neg
  template:
    _target_: data.input_utils.recompose_template
    # NOTE(review): `chat_suffix` is declared as a unit but no composition
    # below uses it — confirm this is intended.
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt

dist_load_data_barrier: false
extended_vocab: null

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 3072

# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint directory that `tokenizer_init` and `model.ref_model` load from
# through ${model_name_or_path}.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain: null
resume: latest

# Parallel layout
dp_size: null
tp_size: 1
pp_size: 1

exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.A100.dp8.v4.5.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: 'checkpoint-*'

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Commented-out alternatives for learning_rate: 1e-4, 2e-5
learning_rate: 3e-6
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
# Alternative: adam_betas: "(0.9, 0.999)"
# Alternative: max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0

# Optimizer
optimizer: null
use_nvlamb: null
bit_training: null

logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
# Prediction config
prediction_cfg:
  metric: 'loss'
  measure: -1
  best_checkpoint: null
  best_result: null

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered on the preset chosen in `defaults`.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level `adam_betas` "(0.9, 0.98)" —
      # confirm which value the trainer actually uses.
      betas: [0.9, 0.95]
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # Disabled precision / optimizer-offload overrides, kept for reference:
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1
  # zero_optimization:
  #   offload_optimizer:
  #     device: cpu
  #     pin_memory: True
# Metric logging: maps logged metric names to keys of the model outputs.
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    'train/chosen_reward': chosen_reward
    'train/rejected_reward': rejected_reward

# Temporary variables
n_gpu: null
device: null
train_batch_size: null
eval_batch_size: null
world_size: null
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.5-v100-ps-pdpo.yaml
================================================
# Hydra composition: base Hydra settings first, then the DeepSpeed preset
# mounted at `ds_cfg`, then this file last so its own values take precedence.
defaults:
  - hydra: default
  # Alternative preset: - deepspeed@ds_cfg: train_hybrid_engine_zero3
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
  - _self_  # composition order: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

hydra:
  searchpath:
    - file://conf/

# Storage roots; `output_path_prefix` is interpolated into the checkpoint and
# output paths further down in this file.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

train_file: 'hf:codeparrot/apps'
dev_file: null
test_file: null

# Model weight dtype, produced by the helper named in `_target_`.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: float16

tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left

device_map:
  _target_: models.utils.return_single_device_map
# DPO model factory. The nested `ref_model` entry configures the reference
# model, loaded from ${model_name_or_path}.
model:
  _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.4
  sft_loss: true
  sft_loss_weight: 0.2
  gradient_checkpointing: true
  # Alternative: attn_implementation: "flash_attention_2"
  attn_implementation: 'eager'
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  # Disabled: device_map: ${device_map}
  ref_model:
    _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    # Alternative: attn_implementation: "flash_attention_2"
    attn_implementation: 'eager'
    # Disabled: device_map: ${device_map}
    pad_token_id: 100001
# Prompt scaffolding strings; interpolated into `read_tensor.template.units`.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Training set: APPS train split, joined on `problem_id` with the pos/neg
# fields of `extra_file`, paired for DPO, then rendered through `template`.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: true
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields:
          - pos
          - neg
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.pseudo_test_case.prefix_pass_num.fix_low0.8_m4.0.json
      - _target_: data.input_aligner.dpo_paired_random_choice_aligner
        anchor_field: pos
        paired_field: neg
  template:
    _target_: data.input_utils.recompose_template
    # NOTE(review): `chat_suffix` is declared as a unit but no composition
    # below uses it — confirm this is intended.
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt

dist_load_data_barrier: false
extended_vocab: null

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 3072

# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint directory that `tokenizer_init` and `model.ref_model` load from
# through ${model_name_or_path}.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain: null
resume: latest

# Parallel layout
dp_size: null
tp_size: 8
pp_size: 1

exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.V100.tp8dp8.v4.5.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: 'checkpoint-*'

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Commented-out alternatives for learning_rate: 1e-4, 2e-5
learning_rate: 3e-6
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
# Alternative: adam_betas: "(0.9, 0.999)"
# Alternative: max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0

# Optimizer
optimizer: null
use_nvlamb: null
bit_training: null

logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
fp16_bfloat16: false
# Prediction config
prediction_cfg:
  metric: 'loss'
  measure: -1
  best_checkpoint: null
  best_result: null

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered on the preset chosen in `defaults`.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level `adam_betas` "(0.9, 0.98)" —
      # confirm which value the trainer actually uses.
      betas: [0.9, 0.95]
      weight_decay: ${weight_decay}
  steps_per_print: 1
  bf16:
    enabled: false
  fp16:
    enabled: true
    auto_cast: false
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: false
    min_loss_scale: 1
# Metric logging: maps logged metric names to keys of the model outputs.
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    'train/chosen_reward': chosen_reward
    'train/rejected_reward': rejected_reward

# Temporary variables
n_gpu: null
device: null
train_batch_size: null
eval_batch_size: null
world_size: null
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.6-v100-ps-pdpo.yaml
================================================
# Hydra composition: base Hydra settings first, then the DeepSpeed preset
# mounted at `ds_cfg`, then this file last so its own values take precedence.
defaults:
  - hydra: default
  # Alternative preset: - deepspeed@ds_cfg: train_hybrid_engine_zero3
  - deepspeed@ds_cfg: train_hybrid_engine_zero1
  - _self_  # composition order: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

hydra:
  searchpath:
    - file://conf/

# Storage roots; `output_path_prefix` is interpolated into the checkpoint and
# output paths further down in this file.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

train_file: 'hf:codeparrot/apps'
dev_file: null
test_file: null

# Model weight dtype, produced by the helper named in `_target_`.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: float16

tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left

device_map:
  _target_: models.utils.return_single_device_map
# DPO model factory. The nested `ref_model` entry configures the reference
# model, loaded from ${model_name_or_path}.
model:
  _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.1
  sft_loss: true
  sft_loss_weight: 0.2
  gradient_checkpointing: true
  # Alternative: attn_implementation: "flash_attention_2"
  attn_implementation: 'eager'
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  # Disabled: device_map: ${device_map}
  ref_model:
    _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    # Alternative: attn_implementation: "flash_attention_2"
    attn_implementation: 'eager'
    # Disabled: device_map: ${device_map}
    pad_token_id: 100001
# Prompt scaffolding strings; interpolated into `read_tensor.template.units`.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Training set: APPS train split, joined on `problem_id` with the pos/neg
# fields of `extra_file`, paired for DPO, then rendered through `template`.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: true
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields:
          - pos
          - neg
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.pseudo_test_case.prefix_pass_num.fix_low0.7_m6.0.json
      - _target_: data.input_aligner.dpo_paired_random_choice_aligner
        anchor_field: pos
        paired_field: neg
  template:
    _target_: data.input_utils.recompose_template
    # NOTE(review): `chat_suffix` is declared as a unit but no composition
    # below uses it — confirm this is intended.
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt

dist_load_data_barrier: false
extended_vocab: null

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 3072

# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint directory that `tokenizer_init` and `model.ref_model` load from
# through ${model_name_or_path}.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain: null
resume: latest

# Parallel layout
dp_size: null
tp_size: 8
pp_size: 1

exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.V100.tp8dp8.v4.6.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: 'checkpoint-*'

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Commented-out alternatives for learning_rate: 1e-4, 2e-5
learning_rate: 5e-6
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
# Alternative: adam_betas: "(0.9, 0.999)"
# Alternative: max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0

# Optimizer
optimizer: null
use_nvlamb: null
bit_training: null

logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
fp16_bfloat16: false
# Prediction config
prediction_cfg:
  metric: 'loss'
  measure: -1
  best_checkpoint: null
  best_result: null

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered on the preset chosen in `defaults`.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level `adam_betas` "(0.9, 0.98)" —
      # confirm which value the trainer actually uses.
      betas: [0.9, 0.95]
      weight_decay: ${weight_decay}
  steps_per_print: 1
  bf16:
    enabled: false
  fp16:
    enabled: true
    auto_cast: false
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: false
    min_loss_scale: 1
# Metric logging: maps logged metric names to keys of the model outputs.
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    'train/chosen_reward': chosen_reward
    'train/rejected_reward': rejected_reward

# Temporary variables
n_gpu: null
device: null
train_batch_size: null
eval_batch_size: null
world_size: null
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.7-A100-ps-pdpo.yaml
================================================
# Hydra composition: base Hydra settings first, then the DeepSpeed preset
# mounted at `ds_cfg`, then this file last so its own values take precedence.
defaults:
  - hydra: default
  # Alternative preset: - deepspeed@ds_cfg: train_hybrid_engine_zero3
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
  - _self_  # composition order: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

hydra:
  searchpath:
    - file://conf/

# Storage roots; `output_path_prefix` is interpolated into the checkpoint and
# output paths further down in this file.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

train_file: 'hf:codeparrot/apps'
dev_file: null
test_file: null

# Model weight dtype, produced by the helper named in `_target_`.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16

tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left

device_map:
  _target_: models.utils.return_single_device_map
# DPO model factory. The nested `ref_model` entry configures the reference
# model, loaded from ${model_name_or_path}.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.1
  # Disabled SFT loss term:
  # sft_loss: True
  # sft_loss_weight: 0.3
  gradient_checkpointing: true
  attn_implementation: 'flash_attention_2'
  # Alternative: attn_implementation: "eager"
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: 'flash_attention_2'
    # Alternative: attn_implementation: "eager"
    device_map: ${device_map}
    pad_token_id: 100001
# Prompt scaffolding strings; interpolated into `read_tensor.template.units`.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Training set: APPS train split, joined on `problem_id` with the pos/neg
# fields of `extra_file`, paired for DPO, then rendered through `template`.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: true
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields:
          - pos
          - neg
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.pseudo_test_case.prefix_pass_num.fix_low0.8_m4.0.json
      - _target_: data.input_aligner.dpo_paired_random_choice_aligner
        anchor_field: pos
        paired_field: neg
  template:
    _target_: data.input_utils.recompose_template
    # NOTE(review): `chat_suffix` is declared as a unit but no composition
    # below uses it — confirm this is intended.
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt

dist_load_data_barrier: false
extended_vocab: null

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 3072

# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint directory that `tokenizer_init` and `model.ref_model` load from
# through ${model_name_or_path}.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain: null
resume: latest

# Parallel layout
dp_size: null
tp_size: 1
pp_size: 1

exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.A100.dp8.v4.7.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: 'checkpoint-*'

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Commented-out alternatives for learning_rate: 1e-4, 2e-5
learning_rate: 1e-6
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
# Alternative: adam_betas: "(0.9, 0.999)"
# Alternative: max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0

# Optimizer
optimizer: null
use_nvlamb: null
bit_training: null

logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
# Prediction config
prediction_cfg:
  metric: 'loss'
  measure: -1
  best_checkpoint: null
  best_result: null

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered on the preset chosen in `defaults`.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level `adam_betas` "(0.9, 0.98)" —
      # confirm which value the trainer actually uses.
      betas: [0.9, 0.95]
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # Disabled precision / optimizer-offload overrides, kept for reference:
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1
  # zero_optimization:
  #   offload_optimizer:
  #     device: cpu
  #     pin_memory: True
# Metric logging: maps logged metric names to keys of the model outputs.
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    'train/chosen_reward': chosen_reward
    'train/rejected_reward': rejected_reward

# Temporary variables
n_gpu: null
device: null
train_batch_size: null
eval_batch_size: null
world_size: null
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.8-A100-ps-pdpo.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
# sft_loss: True
# sft_loss_weight: 0.3
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
device_map: ${device_map}
pad_token_id: 100001
# Chat template pieces; double quotes are required for the "\n" escapes.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

# Prompt text read from file and exposed to the template as ${prompt}.
prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Training dataset: reader, aligner chain, and text template.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: true
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields:
          - pos
          - neg
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.pseudo_test_case.prefix_pass_num.fix_low0.8_m4.0.json
      - _target_: data.input_aligner.dpo_paired_random_choice_aligner
        anchor_field: pos
        paired_field: neg
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    # chat_suffix is declared as a unit but no composition below uses it.
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt

dist_load_data_barrier: false

extended_vocab: null

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 3072

# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that tokenizer_init and model.ref_model resolve via ${model_name_or_path}.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain: null
resume: latest

# Parallel layout (dp_size is left unset here).
dp_size: null
tp_size: 1
pp_size: 1

# Experiment identity and output location.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.H100.dp8.v4.8.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: checkpoint-*

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Commented-out alternatives: 1e-4, 2e-5
learning_rate: 5e-6
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# Commented-out alternative: "(0.9, 0.999)"
adam_betas: '(0.9, 0.98)'
total_dataset_len: -1
# Commented-out alternative: 0.0
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0

# Optimizer (all unset in this file)
optimizer: null
use_nvlamb: null
bit_training: null

logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
# Prediction config
prediction_cfg:
  metric: loss
  measure: -1
  best_checkpoint: null
  best_result: null

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# DeepSpeed settings; batch size, accumulation, lr and weight decay are
# interpolated from the top-level training keys.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from top-level adam_betas "(0.9, 0.98)";
      # confirm which value the trainer actually applies.
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # Disabled overrides, kept for reference:
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1
  # zero_optimization:
  #   offload_optimizer:
  #     device: cpu
  #     pin_memory: True

summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    train/chosen_reward: chosen_reward
    train/rejected_reward: rejected_reward

# Temporary variables (left unset in this file)
n_gpu: null
device: null
train_batch_size: null
eval_batch_size: null
world_size: null
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.9-V100-ps-pdpo.yaml
================================================
# Hydra composition. `_self_` comes last in the list; for what that ordering
# means see:
# https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
defaults:
  - hydra: default
  # Disabled alternative preset:
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  - deepspeed@ds_cfg: train_hybrid_engine_zero1
  - _self_

hydra:
  searchpath:
    - file://conf/

# Storage roots, spliced into other values through ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# Dataset sources; dev and test are unset.
train_file: 'hf:codeparrot/apps'
dev_file: null
test_file: null

# Shared dtype spec, referenced as ${torch_dtype}.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: float16

# Tokenizer spec; its path resolves to the checkpoint in model_name_or_path.
tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left

# Only referenced from commented-out lines in this file's model spec.
device_map:
  _target_: models.utils.return_single_device_map
# Trainable model spec; the nested ref_model spec loads from the same
# ${model_name_or_path} checkpoint.
model:
  _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.1
  # Disabled options:
  # sft_loss: True
  # sft_loss_weight: 0.3
  gradient_checkpointing: true
  # attn_implementation: "flash_attention_2"
  attn_implementation: eager
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  # device_map: ${device_map}
  ref_model:
    _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    # attn_implementation: "flash_attention_2"
    attn_implementation: eager
    # device_map: ${device_map}
    pad_token_id: 100001
# Chat template pieces; double quotes are required for the "\n" escapes.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

# Prompt text read from file and exposed to the template as ${prompt}.
prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Training dataset: reader, aligner chain, and text template.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: true
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields:
          - pos
          - neg
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.pseudo_test_case.prefix_pass_num.fix_low0.5_m4.0_avg.json
      - _target_: data.input_aligner.dpo_paired_random_choice_aligner
        anchor_field: pos
        paired_field: neg
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    # chat_suffix is declared as a unit but no composition below uses it.
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt

dist_load_data_barrier: false

extended_vocab: null

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 3072

# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that tokenizer_init and model.ref_model resolve via ${model_name_or_path}.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain: null
resume: latest

# Parallel layout (dp_size is left unset here).
dp_size: null
tp_size: 8
pp_size: 1

# Experiment identity and output location.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.V100.tp8dp16.v4.9.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: checkpoint-*

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Commented-out alternatives: 1e-4, 2e-5
learning_rate: 5e-6
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# Commented-out alternative: "(0.9, 0.999)"
adam_betas: '(0.9, 0.98)'
total_dataset_len: -1
# Commented-out alternative: 0.0
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0

# Optimizer (all unset in this file)
optimizer: null
use_nvlamb: null
bit_training: null

logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
fp16_bfloat16: false
# Prediction config
prediction_cfg:
  metric: loss
  measure: -1
  best_checkpoint: null
  best_result: null

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# DeepSpeed settings; batch size, accumulation, lr and weight decay are
# interpolated from the top-level training keys.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from top-level adam_betas "(0.9, 0.98)";
      # confirm which value the trainer actually applies.
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # fp16 is enabled and bf16 disabled, matching torch_dtype float16 above.
  bf16:
    enabled: false
  fp16:
    enabled: true
    auto_cast: false
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: false
    min_loss_scale: 1

summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    train/chosen_reward: chosen_reward
    train/rejected_reward: rejected_reward

# Temporary variables (left unset in this file)
n_gpu: null
device: null
train_batch_size: null
eval_batch_size: null
world_size: null
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.9.1-V100-ps-pdpo.yaml
================================================
# Hydra composition. `_self_` comes last in the list; for what that ordering
# means see:
# https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
defaults:
  - hydra: default
  # Disabled alternative preset:
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  - deepspeed@ds_cfg: train_hybrid_engine_zero1
  - _self_

hydra:
  searchpath:
    - file://conf/

# Storage roots, spliced into other values through ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# Dataset sources; dev and test are unset.
train_file: 'hf:codeparrot/apps'
dev_file: null
test_file: null

# Shared dtype spec, referenced as ${torch_dtype}.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: float16

# Tokenizer spec; its path resolves to the checkpoint in model_name_or_path.
tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left

# Only referenced from commented-out lines in this file's model spec.
device_map:
  _target_: models.utils.return_single_device_map
# Trainable model spec; the nested ref_model spec loads from the same
# ${model_name_or_path} checkpoint.
model:
  _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.1
  sft_loss: true
  sft_loss_weight: 0.2
  gradient_checkpointing: true
  # attn_implementation: "flash_attention_2"
  attn_implementation: eager
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  # device_map: ${device_map}
  ref_model:
    _target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    # attn_implementation: "flash_attention_2"
    attn_implementation: eager
    # device_map: ${device_map}
    pad_token_id: 100001
# Chat template pieces; double quotes are required for the "\n" escapes.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

# Prompt text read from file and exposed to the template as ${prompt}.
prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Training dataset: reader, aligner chain, and text template.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: true
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields:
          - pos
          - neg
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.pseudo_test_case.prefix_pass_num.fix_low0.5_m4.0_avg.json
      - _target_: data.input_aligner.dpo_paired_random_choice_aligner
        anchor_field: pos
        paired_field: neg
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    # chat_suffix is declared as a unit but no composition below uses it.
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt

dist_load_data_barrier: false

extended_vocab: null

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 3072

# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that tokenizer_init and model.ref_model resolve via ${model_name_or_path}.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain: null
resume: latest

# Parallel layout (dp_size is left unset here).
dp_size: null
tp_size: 8
pp_size: 1

# Experiment identity and output location.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.V100.tp8dp32.v4.9.1.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: checkpoint-*

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Commented-out alternatives: 1e-4, 2e-5
learning_rate: 5e-6
gradient_accumulation_steps: 4
weight_decay: 0.1
adam_epsilon: 1e-6
# Commented-out alternative: "(0.9, 0.999)"
adam_betas: '(0.9, 0.98)'
total_dataset_len: -1
# Commented-out alternative: 0.0
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0

# Optimizer (all unset in this file)
optimizer: null
use_nvlamb: null
bit_training: null

logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
fp16_bfloat16: false
# Prediction config
prediction_cfg:
  metric: loss
  measure: -1
  best_checkpoint: null
  best_result: null

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# DeepSpeed settings; batch size, accumulation, lr and weight decay are
# interpolated from the top-level training keys.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from top-level adam_betas "(0.9, 0.98)";
      # confirm which value the trainer actually applies.
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # fp16 is enabled and bf16 disabled, matching torch_dtype float16 above.
  bf16:
    enabled: false
  fp16:
    enabled: true
    auto_cast: false
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: false
    min_loss_scale: 1

summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    train/chosen_reward: chosen_reward
    train/rejected_reward: rejected_reward

# Temporary variables (left unset in this file)
n_gpu: null
device: null
train_batch_size: null
eval_batch_size: null
world_size: null
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.9.2-V100-ps-pdpo.yaml
================================================
# Hydra composition. Two DeepSpeed presets are pulled in: one packaged as
# ds_cfg and one as ds_cfg_eval. `_self_` comes last; for what that ordering
# means see:
# https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
defaults:
  - hydra: default
  # Disabled alternative preset:
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
  - deepspeed@ds_cfg_eval: train_hybrid_engine_zero3
  - _self_

# Keys set on top of the ds_cfg_eval preset.
ds_cfg_eval:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  bf16:
    enabled: true

hydra:
  searchpath:
    - file://conf/

# Storage roots, spliced into other values through ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# Dataset sources; dev and test are unset.
train_file: 'hf:codeparrot/apps'
dev_file: null
test_file: null

# Shared dtype spec, referenced as ${torch_dtype}.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16

# Tokenizer spec; its path resolves to the checkpoint in model_name_or_path.
tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left

# Referenced by the model specs as ${device_map}.
device_map:
  _target_: models.utils.return_single_device_map
# Trainable model spec; the nested ref_model spec loads from the same
# ${model_name_or_path} checkpoint.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.1
  sft_loss: true
  sft_loss_weight: 0.2
  gradient_checkpointing: true
  attn_implementation: flash_attention_2
  # attn_implementation: "eager"
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: flash_attention_2
    # attn_implementation: "eager"
    device_map: ${device_map}
    pad_token_id: 100001
# Chat template pieces; double quotes are required for the "\n" escapes.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

# Prompt text read from file and exposed to the template as ${prompt}.
prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Training dataset: reader, aligner chain, and text template.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: true
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields:
          - pos
          - neg
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.pseudo_test_case.prefix_pass_num.fix_low0.8_m4.0_avg.json
      - _target_: data.input_aligner.dpo_paired_random_choice_aligner
        anchor_field: pos
        paired_field: neg
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    # chat_suffix is declared as a unit but no composition below uses it.
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt

dist_load_data_barrier: false

extended_vocab: null

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 3072

# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that tokenizer_init and model.ref_model resolve via ${model_name_or_path}.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain: null
resume: latest

# Parallel layout (dp_size is left unset here).
dp_size: null
tp_size: 1
pp_size: 1

# Experiment identity and output location.
# NOTE(review): exp_name says H100 while this file's name says V100;
# confirm the intended target hardware.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.H100.dp8.v4.9.2.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: checkpoint-*

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Commented-out alternatives: 1e-4, 2e-5
learning_rate: 5e-6
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# Commented-out alternative: "(0.9, 0.999)"
adam_betas: '(0.9, 0.98)'
total_dataset_len: -1
# Commented-out alternative: 0.0
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0

# Optimizer (all unset in this file)
optimizer: null
use_nvlamb: null
bit_training: null

logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
# Prediction config
prediction_cfg:
  metric: loss
  measure: -1
  best_checkpoint: null
  best_result: null

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# DeepSpeed settings; batch size, accumulation, lr and weight decay are
# interpolated from the top-level training keys.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from top-level adam_betas "(0.9, 0.98)";
      # confirm which value the trainer actually applies.
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # Disabled overrides, kept for reference:
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1

summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    train/chosen_reward: chosen_reward
    train/rejected_reward: rejected_reward

# Temporary variables (left unset in this file)
n_gpu: null
device: null
train_batch_size: null
eval_batch_size: null
world_size: null
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/gpt4o-distil-v4.9.3-A100-ps-pdpo.yaml
================================================
# Hydra composition. `_self_` comes last in the list; for what that ordering
# means see:
# https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
defaults:
  - hydra: default
  # Disabled alternative preset:
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
  - _self_

hydra:
  searchpath:
    - file://conf/

# Storage roots, spliced into other values through ${...} interpolation.
data_path_prefix: /mnt/fangkai_blob/share/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# Dataset sources; dev and test are unset.
train_file: 'hf:codeparrot/apps'
dev_file: null
test_file: null

# Shared dtype spec, referenced as ${torch_dtype}.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16

# Tokenizer spec; its path resolves to the checkpoint in model_name_or_path.
tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left

# Referenced by the model specs as ${device_map}.
device_map:
  _target_: models.utils.return_single_device_map
# Trainable model spec; the nested ref_model spec loads from the same
# ${model_name_or_path} checkpoint.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.1
  sft_loss: true
  sft_loss_weight: 0.2
  gradient_checkpointing: true
  attn_implementation: flash_attention_2
  # attn_implementation: "eager"
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: flash_attention_2
    # attn_implementation: "eager"
    device_map: ${device_map}
    pad_token_id: 100001
# Chat template pieces; double quotes are required for the "\n" escapes.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

# Prompt text read from file and exposed to the template as ${prompt}.
prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Training dataset: reader, aligner chain, and text template.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsWithFunctionName
    split: train
    train_sub_split: train
    use_starter_code: true
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields:
          - pos
          - neg
        extra_file: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/apps/checkpoint-200/train.tem1.0.n10.prefix.upper0.8.r0.3.completion.tem1.0.n5.v2.0.pseudo_test_case.prefix_pass_num.fix_low0.5_m4.0_avg.json
      - _target_: data.input_aligner.dpo_paired_random_choice_aligner
        anchor_field: pos
        paired_field: neg
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    # chat_suffix is declared as a unit but no composition below uses it.
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt

dist_load_data_barrier: false

extended_vocab: null

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 3072

# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that tokenizer_init and model.ref_model resolve via ${model_name_or_path}.
model_name_or_path: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s42/checkpoint-200/
pretrain: null
resume: latest

# Parallel layout (dp_size is left unset here).
dp_size: null
tp_size: 1
pp_size: 1

# Experiment identity and output location.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.A100.dp8.v4.9.3.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: checkpoint-*

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Commented-out alternatives: 1e-4, 2e-5
learning_rate: 5e-6
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# Commented-out alternative: "(0.9, 0.999)"
adam_betas: '(0.9, 0.98)'
total_dataset_len: -1
# Commented-out alternative: 0.0
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0

# Optimizer (all unset in this file)
optimizer: null
use_nvlamb: null
bit_training: null

logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
# Prediction config
prediction_cfg:
  metric: loss
  measure: -1
  best_checkpoint: null
  best_result: null

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# DeepSpeed settings; batch size, accumulation, lr and weight decay are
# interpolated from the top-level training keys.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from top-level adam_betas "(0.9, 0.98)";
      # confirm which value the trainer actually applies.
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # Disabled overrides, kept for reference:
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1

summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    train/chosen_reward: chosen_reward
    train/rejected_reward: rejected_reward

# Temporary variables (left unset in this file)
n_gpu: null
device: null
train_batch_size: null
eval_batch_size: null
world_size: null
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter1/gpt4o-distil-apps-mc-v1.0-mi300x-hybrid.yaml
================================================
# Hydra composition. `_self_` comes last in the list; for what that ordering
# means see:
# https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
defaults:
  - hydra: default
  # Disabled alternative preset:
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  - deepspeed@ds_cfg: train_hybrid_engine_zero1
  - _self_

hydra:
  searchpath:
    - file://conf/

# Storage roots, spliced into other values through ${...} interpolation.
# data_path_prefix and output_path_prefix end with "/", so values built from
# them must not add another separator.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# Directory of the run this experiment starts from. It carries no trailing "/"
# because every user in this file appends "/<sub-path>" itself; previously the
# resolved paths contained "//" twice.
sft_model_dir: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_4o_ps_test_case.dpo.H100.dp8.v1.0.s42

# Dataset sources; dev and test are unset.
# Commented-out alternative:
# train_file: ${sft_model_dir}/oss-apps-xcode-combine-4o-ps-tests/checkpoint-100/train.0shot.tem1.0.n10.v2.1.s42.prefer_pair.low0.5.m6.4o-non-sc.json
train_file: ${data_path_prefix}magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
dev_file: null
test_file: null

# Shared dtype spec, referenced as ${torch_dtype}.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16

# Tokenizer spec; its path resolves to the checkpoint in model_name_or_path.
tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left

# Referenced by the model specs as ${device_map}.
device_map:
  _target_: models.utils.return_single_device_map
# Trainable model spec; the nested ref_model spec loads from the same
# ${model_name_or_path} checkpoint.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.4
  sft_loss: true
  sft_loss_weight: 0.2
  gradient_checkpointing: true
  attn_implementation: flash_attention_2
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: flash_attention_2
    device_map: ${device_map}
    pad_token_id: 100001
# Chat template pieces; double quotes are required for the "\n" escapes.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

# Prompt text read from file and exposed to the template as ${prompt}.
prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Training dataset: reader, aligner chain, and text template.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.PseudoInputsWithFunctionName
    use_starter_code: true
    train_sub_split: train
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields:
          - pos
          - neg
        extra_file: ${sft_model_dir}/oss-instruct-apps-train/checkpoint-100/train.0shot.tem1.0.n10.v2.1.apps.4o.dpo_m4_low0.5.mcoder.self.dpo_m6_low0.5_min5.json
      # NOTE(review): this aligner takes `extract_field` (singular) while the
      # one above takes `extract_fields`; confirm against each signature.
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field:
          - pos
          - neg
        mode: multi
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    # Both responses are terminated with chat_suffix.
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}'
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt

dist_load_data_barrier: false

extended_vocab: null

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 4096

# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that tokenizer_init and model.ref_model resolve via ${model_name_or_path}.
model_name_or_path: ${sft_model_dir}/checkpoint-100/
pretrain: null
resume: latest

# Parallel layout (dp_size is left unset here).
dp_size: null
tp_size: 1
pp_size: 1

# Experiment identity and output location.
exp_name: deepseek-coder-v1.5-ins.7b.apps-4o.mc-self.iter1.dpo.mi300x.dp32.v1.0.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: checkpoint-*

# Training hyper-parameters
per_gpu_train_batch_size: 4
per_gpu_eval_batch_size: 4
# Commented-out alternatives: 1e-4, 2e-5
learning_rate: 3e-6
gradient_accumulation_steps: 1
weight_decay: 0.1
adam_epsilon: 1e-6
# Commented-out alternative: "(0.9, 0.999)"
adam_betas: '(0.9, 0.98)'
total_dataset_len: -1
# Commented-out alternative: 0.0
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0

# Optimizer (all unset in this file)
optimizer: null
use_nvlamb: null
bit_training: null

logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
# Prediction config
prediction_cfg:
  metric: loss
  measure: -1
  best_checkpoint: null
  best_result: null

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# DeepSpeed settings; batch size, accumulation, lr and weight decay are
# interpolated from the top-level training keys.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from top-level adam_betas "(0.9, 0.98)";
      # confirm which value the trainer actually applies.
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # Disabled overrides, kept for reference:
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1
  # zero_optimization:
  #   offload_optimizer:
  #     device: cpu
  #     pin_memory: True

summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    train/chosen_reward: chosen_reward
    train/rejected_reward: rejected_reward

# Temporary variables (left unset in this file)
n_gpu: null
device: null
train_batch_size: null
eval_batch_size: null
world_size: null
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter1/gpt4o-distil-apps-mc-v1.1-mi300x-hybrid.yaml
================================================
# Hydra composition. `_self_` comes last in the list; for what that ordering
# means see:
# https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
defaults:
  - hydra: default
  # Disabled alternative preset:
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  - deepspeed@ds_cfg: train_hybrid_engine_zero1
  - _self_

hydra:
  searchpath:
    - file://conf/

# Storage roots, spliced into other values through ${...} interpolation.
# data_path_prefix and output_path_prefix end with "/", so values built from
# them must not add another separator (previously they resolved with "//").
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# Directory of the run this experiment starts from.
# NOTE(review): the trailing "/" is kept because not every user of
# ${sft_model_dir} is visible here; users that append "/<sub-path>" still
# resolve with a doubled separator.
sft_model_dir: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_4o_ps_test_case.dpo.H100.dp8.v1.0.s42/

# Dataset sources; dev and test are unset.
# Commented-out alternative:
# train_file: ${sft_model_dir}/oss-apps-xcode-combine-4o-ps-tests/checkpoint-100/train.0shot.tem1.0.n10.v2.1.s42.prefer_pair.low0.5.m6.4o-non-sc.json
train_file: ${data_path_prefix}magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
dev_file: null
test_file: null

# Shared dtype spec, referenced as ${torch_dtype}.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16

# Tokenizer spec; its path resolves to the checkpoint in model_name_or_path.
tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left

# Referenced by the model specs as ${device_map}.
device_map:
  _target_: models.utils.return_single_device_map
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.4
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
device_map: ${device_map}
pad_token_id: 100001
# Chat-format pieces for the DeepSeek-Coder instruct prompt; they are wired into the
# template `units` below as ${chat_prefix}, ${chat_connect} and ${chat_suffix}.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, read from a file under prompts/.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# Reader for the raw problem file.
read_fn:
_target_: data.apps.PseudoInputsWithFunctionNameFixStarterCode
use_starter_code: True
train_sub_split: train
# Aligners wrapped by concat_aligner (presumably applied in the order listed).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Step 1: take the `pos`/`neg` fields from extra_file, keyed on problem_id on both
# sides (presumably a join between the loaded problems and the sampled responses).
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-instruct-apps-train/checkpoint-100/train.0shot.tem1.0.n10.v2.1.apps.4o.dpo_m4_low0.5.mcoder.self.dpo_m6_low0.5_min5.json
# Step 2: flatten on `pos`/`neg` in `multi` mode.
# NOTE(review): this aligner is given `extract_field` (singular) while
# field_extract_aligner above is given `extract_fields`; confirm both spellings
# match the function signatures.
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
# Prompt/response template: `units` are the building blocks and `compositions` the
# strings assembled from them. chosen/reject are the full prompt followed by the
# pos/neg response and the chat suffix.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# Field renaming (presumably source key -> output key; note problem_id -> index).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# DPO collator; reuses the tokenizer built by tokenizer_init.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 4096
# Dataloader
num_workers: 8
prefetch_factor: 2
# Starting checkpoint: checkpoint-100 of the run in sft_model_dir (also the tokenizer
# path). sft_model_dir already ends with '/', so this expands with a doubled slash.
model_name_or_path: ${sft_model_dir}/checkpoint-100/
pretrain:
resume: latest
# Parallel layout (dp/tp/pp sizes); dp_size is left unset.
dp_size:
tp_size: 1
pp_size: 1
# Run name (the seed is interpolated at the end) and the output directory built from it.
exp_name: deepseek-coder-v1.5-ins.7b.apps-4o.mc-self.iter1.dpo.mi300x.dp32.v1.1.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Train only: evaluation during and after training is switched off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# per_gpu_train_batch_size, gradient_accumulation_steps, learning_rate and weight_decay
# are also interpolated into ds_cfg further down.
per_gpu_train_batch_size: 4
per_gpu_eval_batch_size: 4
#learning_rate: 1e-4
learning_rate: 3e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 1
weight_decay: 0.1
# NOTE(review): ds_cfg.optimizer.params sets betas [ 0.9, 0.95 ] and no eps, which
# differs from adam_epsilon / adam_betas here; confirm which settings take effect.
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
# (all three left unset in this config)
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint / evaluation cadence, in steps.
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags. fp16_bfloat16: True goes with the bfloat16 torch_dtype
# (presumably selects bf16 rather than fp16; confirm in the trainer).
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
# Tracked metric is the loss; measure: -1 presumably means "lower is better" (confirm).
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation hooks: forward function and DPO post-processor.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides, merged on top of the `deepspeed@ds_cfg` default chosen in the
# defaults list. Batch size, accumulation, lr and weight decay are interpolated from
# the top-level training hyper-parameters.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# Precision and optimizer-offload overrides, currently disabled (kept for reference).
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Experiment-tracking writer. The two entries under outputs_index_or_keys presumably
# map a logged tag to a key of the model outputs (chosen_reward / rejected_reward).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# (left empty here; presumably filled in by the training script at runtime)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter1/gpt4o-distil-combine-pdpo-v1.0-a100-40-ps-test.yaml
================================================
# Hydra experiment config for a DPO training run using the tensor-parallel model
# classes in models.llama_tp (see `model` / `ref_model`).
# Defaults list: the `deepspeed` config group is mounted at the `ds_cfg` package,
# and `_self_` is last, so values written in this file are merged after the defaults.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots. data_path_prefix and output_path_prefix already end with '/', so
# interpolations that append another '/' (sft_model_dir, train_file) expand to '//'.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Earlier run this experiment starts from: its checkpoint-700 is model_name_or_path
# and the pos/neg file stored under it is the `extra_file` read in `read_tensor`.
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.V100.tp8dp16.v4.9.s42/
train_file: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
# No dev/test files are configured (an empty value loads as null).
dev_file:
test_file:
# Torch dtype, referenced as ${torch_dtype} by both `model` and `ref_model`.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer loaded from the same path as the model, padding on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Device map helper. It is defined here, but both places that would reference
# ${device_map} below are commented out in this config.
device_map:
_target_: models.utils.return_single_device_map
# Policy model to train. beta / sft_loss / sft_loss_weight are presumably forwarded
# as keyword arguments to the `_target_` loader; their semantics live in models.llama_tp.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.4
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
# Reference model: loaded from ${model_name_or_path} with the same dtype, attention
# implementation and pad token id as listed for the policy above.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
# device_map: ${device_map}
pad_token_id: 100001
# Chat-format pieces for the DeepSeek-Coder instruct prompt; they are wired into the
# template `units` below as ${chat_prefix}, ${chat_connect} and ${chat_suffix}.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, read from a file under prompts/.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# Reader for the raw problem file.
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligners wrapped by concat_aligner (presumably applied in the order listed).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Step 1: take the `pos`/`neg` fields from extra_file, keyed on problem_id on both
# sides (presumably a join between the loaded problems and the sampled responses).
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-instruct-apps-train/checkpoint-700/train.tem1.0.n10.prefix.upper0.8.r0.3.sample20_per.completion.tem1.0.n3.pseudo_input_output.prefix_pass_num.fix_low0.5_m4.0_avg.json
# Step 2: pair `pos` (anchor) with `neg`; judging by the name, a random choice is
# involved, so the pairing may depend on the seed (confirm in data.input_aligner).
- _target_: data.input_aligner.dpo_paired_random_choice_aligner
anchor_field: pos
paired_field: neg
# Prompt/response template: `units` are the building blocks and `compositions` the
# strings assembled from them. chat_suffix is declared as a unit but is NOT appended
# to chosen/reject here, presumably because pos/neg are partial-solution prefixes
# (the extra_file name contains "prefix"); confirm before changing.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# Field renaming (presumably source key -> output key; note problem_id -> index).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# DPO collator; reuses the tokenizer built by tokenizer_init.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Starting checkpoint: checkpoint-700 of the run in sft_model_dir (also the tokenizer
# path). sft_model_dir already ends with '/', so this expands with a doubled slash.
model_name_or_path: ${sft_model_dir}/checkpoint-700
pretrain:
resume: latest
# Parallel layout (dp/tp/pp sizes); dp_size is left unset. tp_size: 4 matches the
# "tp4" tag in exp_name below.
dp_size:
tp_size: 4
pp_size: 1
#exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp8dp8.v1.2.s${seed}
# Run name (the seed is interpolated at the end) and the output directory built from it.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.pdpo.A100.tp4dp16.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Train only: evaluation during and after training is switched off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# per_gpu_train_batch_size, gradient_accumulation_steps, learning_rate and weight_decay
# are also interpolated into ds_cfg further down.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 3e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
# NOTE(review): ds_cfg.optimizer.params sets betas [ 0.9, 0.95 ] and no eps, which
# differs from adam_epsilon / adam_betas here; confirm which settings take effect.
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
# (all three left unset in this config)
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint / evaluation cadence, in steps.
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags. fp16_bfloat16: True goes with the bfloat16 torch_dtype
# (presumably selects bf16 rather than fp16; confirm in the trainer).
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
# Tracked metric is the loss; measure: -1 presumably means "lower is better" (confirm).
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation hooks: forward function and DPO post-processor.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides, merged on top of the `deepspeed@ds_cfg` default chosen in the
# defaults list (train_hybrid_engine_zero1_optim_offload). Batch size, accumulation,
# lr and weight decay are interpolated from the top-level training hyper-parameters.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# Precision and optimizer-offload overrides, currently disabled (kept for reference).
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Experiment-tracking writer. The two entries under outputs_index_or_keys presumably
# map a logged tag to a key of the model outputs (chosen_reward / rejected_reward).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# (left empty here; presumably filled in by the training script at runtime)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter1/gpt4o-distil-combine-pdpo-v1.1-v100-ps-test.yaml
================================================
# Hydra experiment config for a DPO training run in float16, using the
# tensor-parallel model classes in models.llama_tp (see `model` / `ref_model`).
# Defaults list: the `deepspeed` config group is mounted at the `ds_cfg` package,
# and `_self_` is last, so values written in this file are merged after the defaults.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots. data_path_prefix and output_path_prefix already end with '/', so
# interpolations that append another '/' (sft_model_dir, train_file) expand to '//'.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Earlier run this experiment starts from: its checkpoint-700 is model_name_or_path
# and the pos/neg file stored under it is the `extra_file` read in `read_tensor`.
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.V100.tp8dp16.v4.9.s42/
train_file: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
# No dev/test files are configured (an empty value loads as null).
dev_file:
test_file:
# Torch dtype, referenced as ${torch_dtype} by both `model` and `ref_model`.
# float16 here goes together with fp16_bfloat16: False and the fp16 section of ds_cfg.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer loaded from the same path as the model, padding on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Device map helper. It is defined here, but both places that would reference
# ${device_map} below are commented out in this config.
device_map:
_target_: models.utils.return_single_device_map
# Policy model to train. The SFT-loss options are commented out in this variant and
# attention uses "sdpa" instead of "flash_attention_2".
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.4
# sft_loss: True
# sft_loss_weight: 0.2
gradient_checkpointing: True
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
# Reference model: loaded from ${model_name_or_path} with the same dtype, attention
# implementation and pad token id as listed for the policy above.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
# device_map: ${device_map}
pad_token_id: 100001
# Chat-format pieces for the DeepSeek-Coder instruct prompt; they are wired into the
# template `units` below as ${chat_prefix}, ${chat_connect} and ${chat_suffix}.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, read from a file under prompts/.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# Reader for the raw problem file.
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligners wrapped by concat_aligner (presumably applied in the order listed).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Step 1: take the `pos`/`neg` fields from extra_file, keyed on problem_id on both
# sides (presumably a join between the loaded problems and the sampled responses).
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-instruct-apps-train/checkpoint-700/train.tem1.0.n10.prefix.upper0.8.r0.3.sample20_per.completion.tem1.0.n3.pseudo_input_output.prefix_pass_num.fix_low0.5_m4.0_avg.json
# Step 2: pair `pos` (anchor) with `neg`; judging by the name, a random choice is
# involved, so the pairing may depend on the seed (confirm in data.input_aligner).
- _target_: data.input_aligner.dpo_paired_random_choice_aligner
anchor_field: pos
paired_field: neg
# Prompt/response template: `units` are the building blocks and `compositions` the
# strings assembled from them. chat_suffix is declared as a unit but is NOT appended
# to chosen/reject here, presumably because pos/neg are partial-solution prefixes
# (the extra_file name contains "prefix"); confirm before changing.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# Field renaming (presumably source key -> output key; note problem_id -> index).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# DPO collator; reuses the tokenizer built by tokenizer_init.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Starting checkpoint: checkpoint-700 of the run in sft_model_dir (also the tokenizer
# path). sft_model_dir already ends with '/', so this expands with a doubled slash.
model_name_or_path: ${sft_model_dir}/checkpoint-700
pretrain:
resume: latest
# Parallel layout (dp/tp/pp sizes); dp_size is left unset. tp_size: 8 matches the
# "tp8" tag in exp_name below.
dp_size:
tp_size: 8
pp_size: 1
# Run name (the seed is interpolated at the end) and the output directory built from it.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.pdpo.V100.tp8dp32.v1.1.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Train only: evaluation during and after training is switched off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# per_gpu_train_batch_size, gradient_accumulation_steps, learning_rate and weight_decay
# are also interpolated into ds_cfg further down.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 3e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 4
weight_decay: 0.1
# NOTE(review): ds_cfg.optimizer.params sets betas [ 0.9, 0.95 ] and no eps, which
# differs from adam_epsilon / adam_betas here; confirm which settings take effect.
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
# (all three left unset in this config)
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint / evaluation cadence, in steps.
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags. fp16_bfloat16: False goes with the float16 torch_dtype and
# with ds_cfg below (fp16 enabled, bf16 disabled).
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
# Tracked metric is the loss; measure: -1 presumably means "lower is better" (confirm).
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation hooks: forward function and DPO post-processor.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides, merged on top of the `deepspeed@ds_cfg` default chosen in the
# defaults list (train_hybrid_engine_zero1_optim_offload). Batch size, accumulation,
# lr and weight decay are interpolated from the top-level training hyper-parameters.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# Precision: bf16 is explicitly disabled and fp16 enabled, matching the float16
# torch_dtype. Per the DeepSpeed config reference, loss_scale: 0 selects dynamic loss
# scaling and initial_scale_power: 16 starts it at 2^16.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Optimizer-offload override, currently disabled (kept for reference).
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Experiment-tracking writer. The two entries under outputs_index_or_keys presumably
# map a logged tag to a key of the model outputs (chosen_reward / rejected_reward).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# (left empty here; presumably filled in by the training script at runtime)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter1/gpt4o-distil-combine-pdpo-v1.2-h100-ps-test.yaml
================================================
# Hydra experiment config for a DPO training run; the policy/reference pair is
# built by models.llama.LlamaForCausalLMDPO (see `model` / `ref_model`) and the
# training data is a ReplayDataset over two dataset configs (see `read_tensor`).
# Defaults list: the `deepspeed` config group is mounted at the `ds_cfg` package,
# and `_self_` is last, so values written in this file are merged after the defaults.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots. data_path_prefix and output_path_prefix already end with '/', so
# interpolations that append another '/' (sft_model_dir, train_file) expand to '//'.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Earlier run this experiment starts from: its checkpoint-700 is model_name_or_path
# and the pos/neg files stored under it are the `extra_file`s of both dataset configs.
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.V100.tp8dp16.v4.9.s42/
train_file: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
# No dev/test files are configured (an empty value loads as null).
dev_file:
test_file:
# Torch dtype, referenced as ${torch_dtype} by both `model` and `ref_model`.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer loaded from the same path as the model, padding on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Device map helper, referenced as ${device_map} by `model` and `ref_model`.
device_map:
_target_: models.utils.return_single_device_map
# Policy model to train. beta / sft_loss / sft_loss_weight are presumably forwarded
# as keyword arguments to the `_target_` loader; their semantics live in models.llama.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.4
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
# Reference model: loaded from ${model_name_or_path} with the same dtype, attention
# implementation, device map and pad token id as listed for the policy above.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: 100001
# Chat-format pieces for the DeepSeek-Coder instruct prompt; they are wired into the
# template `units` of both dataset configs as ${chat_prefix}, ${chat_connect} and
# ${chat_suffix}.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, read from a file under prompts/.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# First dataset config; passed to ReplayDataset as `new_dataset_cfg` (see `read_tensor`).
read_tensor_pdpo:
_target_: data.combine_dataset.MultiMappingDataset
# Reader for the raw problem file.
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligners wrapped by concat_aligner (presumably applied in the order listed).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Step 1: take the `pos`/`neg` fields from extra_file, keyed on problem_id on both
# sides (presumably a join between the loaded problems and the sampled responses).
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-instruct-apps-train/checkpoint-700/train.tem1.0.n10.prefix.upper0.8.r0.3.sample20_per.completion.tem1.0.n3.pseudo_input_output.prefix_pass_num.fix_low0.5_m4.0_avg.json
# Step 2: pair `pos` (anchor) with `neg`; judging by the name, a random choice is
# involved, so the pairing may depend on the seed (confirm in data.input_aligner).
- _target_: data.input_aligner.dpo_paired_random_choice_aligner
anchor_field: pos
paired_field: neg
# Prompt/response template: `units` are the building blocks and `compositions` the
# strings assembled from them. chat_suffix is declared as a unit but is NOT appended
# to chosen/reject here (unlike read_tensor_dpo), presumably because pos/neg are
# partial-solution prefixes (the extra_file name contains "prefix"); confirm.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# Field renaming (presumably source key -> output key; note problem_id -> index).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Second dataset config; passed to ReplayDataset as `old_dataset_cfg` (see `read_tensor`).
# file_path is given explicitly here (same value as train_file); read_tensor_pdpo
# does not set one.
read_tensor_dpo:
_target_: data.combine_dataset.MultiMappingDataset
file_path: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
# Reader for the raw problem file.
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligners wrapped by concat_aligner (presumably applied in the order listed).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Step 1: take the `pos`/`neg` fields from extra_file. The extra file is keyed by
# `id` here, whereas read_tensor_pdpo uses `problem_id` on both sides.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-instruct-apps-train/checkpoint-700/split-32/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min5.json
# Step 2: flatten on `pos`/`neg` in `multi` mode.
# NOTE(review): this aligner is given `extract_field` (singular) while
# field_extract_aligner above is given `extract_fields`; confirm both spellings
# match the function signatures.
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
# Prompt/response template: chosen/reject are the full prompt followed by the
# pos/neg response and the chat suffix.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# Field renaming (presumably source key -> output key; note problem_id -> index).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Actual training dataset: a ReplayDataset over the two configs above.
# `_recursive_: False` tells Hydra not to instantiate the nested dataset configs up
# front, so ReplayDataset receives them as configs (presumably it builds both itself).
read_tensor:
_target_: data.combine_dataset.ReplayDataset
_recursive_: False
new_dataset_cfg: ${read_tensor_pdpo}
old_dataset_cfg: ${read_tensor_dpo}
replay_ratio: 1.0
dist_load_data_barrier: False
extended_vocab:
# Data collator
# DPO collator; reuses the tokenizer built by tokenizer_init.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Starting checkpoint: checkpoint-700 of the run in sft_model_dir (also the tokenizer
# path). sft_model_dir already ends with '/', so this expands with a doubled slash.
model_name_or_path: ${sft_model_dir}/checkpoint-700
pretrain:
resume: latest
# Parallel layout (dp/tp/pp sizes); dp_size is left unset.
dp_size:
tp_size: 1
pp_size: 1
# Run name (the seed is interpolated at the end) and the output directory built from it.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.pdpo.H100.dp8.v1.2.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Train only: evaluation during and after training is switched off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# per_gpu_train_batch_size, gradient_accumulation_steps, learning_rate and weight_decay
# are also interpolated into ds_cfg further down.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 3e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
# NOTE(review): ds_cfg.optimizer.params sets betas [ 0.9, 0.95 ] and no eps, which
# differs from adam_epsilon / adam_betas here; confirm which settings take effect.
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
# (all three left unset in this config)
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint / evaluation cadence, in steps.
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags. fp16_bfloat16: True goes with the bfloat16 torch_dtype
# (presumably selects bf16 rather than fp16; confirm in the trainer).
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
# Tracked metric is the loss; measure: -1 presumably means "lower is better" (confirm).
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation hooks: forward function and DPO post-processor.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides, merged on top of the `deepspeed@ds_cfg` default chosen in the
# defaults list (train_hybrid_engine_zero1_optim_offload). Batch size, accumulation,
# lr and weight decay are interpolated from the top-level training hyper-parameters.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# Precision overrides, currently disabled (kept for reference).
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Experiment-tracking writer. The two entries under outputs_index_or_keys presumably
# map a logged tag to a key of the model outputs (chosen_reward / rejected_reward).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# (left empty here; presumably filled in by the training script at runtime)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter1/gpt4o-distil-combine-v1.0-H100-4o-ps-test.yaml
================================================
# Hydra experiment config for a DPO training run; the policy/reference pair is
# built by models.llama.LlamaForCausalLMDPO (see `model` / `ref_model`).
# Defaults list: the `deepspeed` config group is mounted at the `ds_cfg` package,
# and `_self_` is last, so values written in this file are merged after the defaults.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots. output_path_prefix already ends with '/', so sft_model_dir (which
# appends another '/') expands to '//'.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Earlier run this experiment starts from: its checkpoint-100 is model_name_or_path
# and the training file below is stored under it.
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_4o_ps_test_case.dpo.H100.dp8.v1.0.s42/
# Training file lives under sft_model_dir; since that value ends with '/', this path
# also expands with a doubled slash.
train_file: ${sft_model_dir}/oss-apps-xcode-combine-4o-ps-tests/checkpoint-100/train.0shot.tem1.0.n10.v2.1.s42.prefer_pair.low0.5.m6.4o-non-sc.json
# No dev/test files are configured (an empty value loads as null).
dev_file:
test_file:
# Torch dtype, referenced as ${torch_dtype} by both `model` and `ref_model`.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer loaded from the same path as the model, padding on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Device map helper, referenced as ${device_map} by `model` and `ref_model`.
device_map:
_target_: models.utils.return_single_device_map
# Policy model to train. Only `beta` is set among the DPO options (no sft_loss keys);
# it is presumably forwarded as a keyword argument to the `_target_` loader.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
# Reference model: loaded from ${model_name_or_path} with the same dtype, attention
# implementation, device map and pad token id as listed for the policy above.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
device_map: ${device_map}
pad_token_id: 100001
# Chat-format pieces for the DeepSeek-Coder instruct prompt; they are wired into the
# template `units` below as ${chat_prefix}, ${chat_connect} and ${chat_suffix}.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, read from a file under prompts/.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# Reader for the training file.
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Only a flat_aligner is configured here (no field_extract_aligner / extra_file), so
# the pos/neg fields presumably already exist in train_file (its name contains
# "prefer_pair"); confirm against the data.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
# Prompt/response template: `units` are the building blocks and `compositions` the
# strings assembled from them. chosen/reject are the full prompt followed by the
# pos/neg response and the chat suffix.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# Field renaming (presumably source key -> output key; note problem_id -> index).
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# DPO collator; reuses the tokenizer built by tokenizer_init.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 4096
# Dataloader
num_workers: 8
prefetch_factor: 2
# Starting checkpoint: checkpoint-100 of the run in sft_model_dir (also the tokenizer
# path). sft_model_dir already ends with '/', so this expands with a doubled slash.
model_name_or_path: ${sft_model_dir}/checkpoint-100/
pretrain:
resume: latest
# Parallel layout (dp/tp/pp sizes); dp_size is left unset.
dp_size:
tp_size: 1
pp_size: 1
# Run name (the seed is interpolated at the end) and the output directory built from it.
# NOTE(review): exp_name says "MI300x.dp16" while this config file is named
# "...-v1.0-H100-4o-ps-test.yaml"; confirm which hardware tag is intended.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_4o_ps_test_case.non_sc.iter1.dpo.MI300x.dp16.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Train only: evaluation during and after training is switched off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# per_gpu_train_batch_size, gradient_accumulation_steps, learning_rate and weight_decay
# are also interpolated into ds_cfg further down.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
# NOTE(review): ds_cfg.optimizer.params sets betas [ 0.9, 0.95 ] and no eps, which
# differs from adam_epsilon / adam_betas here; confirm which settings take effect.
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
# (all three left unset in this config)
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint / evaluation cadence, in steps.
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags. fp16_bfloat16: True goes with the bfloat16 torch_dtype
# (presumably selects bf16 rather than fp16; confirm in the trainer).
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
# Tracked metric is the loss; measure: -1 presumably means "lower is better" (confirm).
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation hooks: forward function and DPO post-processor.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides, merged on top of the `deepspeed@ds_cfg` default chosen in the
# defaults list. Batch size, accumulation, lr and weight decay are interpolated from
# the top-level training hyper-parameters.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# Precision and optimizer-offload overrides, currently disabled (kept for reference).
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Experiment-tracking writer. Both key lists are left empty in this config, so no
# chosen/rejected reward entries are configured here.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# (left empty here; presumably filled in by the training script at runtime)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter1/gpt4o-distil-combine-v1.0-v100-ps-test.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.V100.tp8dp16.v4.9.s42/
train_file: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
gradient_checkpointing: True
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
# device_map: ${device_map}
pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-instruct-apps-train/checkpoint-700/split-32/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min5.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that the tokenizer and the reference model are loaded from
model_name_or_path: ${sft_model_dir}/checkpoint-700
pretrain:
resume: latest
dp_size:
tp_size: 8
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.V100.tp8dp32.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 4
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter1/gpt4o-distil-combine-v1.1-H100-4o-ps-test.yaml
================================================
# Hydra config for an iteration-1 DPO run of the deepseek-coder r2c model.
# The tokenizer and the reference model are loaded from ${model_name_or_path}.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
# DeepSpeed preset placed under `ds_cfg`; because `_self_` comes last, the `ds_cfg:` keys
# near the end of this file are merged on top of the preset.
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
# Storage roots. NOTE: data_path_prefix and output_path_prefix end with "/", so values written as
# ${prefix}/... (e.g. sft_model_dir below) resolve to paths containing "//".
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Run directory of the previous stage; the starting checkpoint and the training file both live under it.
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_4o_ps_test_case.dpo.H100.dp8.v1.0.s42/
# Preference-pair file; presumably it already carries the pos/neg fields, since only
# flat_aligner is applied in `read_tensor` below - confirm.
train_file: ${sft_model_dir}/oss-apps-xcode-combine-4o-ps-tests/checkpoint-100/train.0shot.tem1.0.n10.v2.1.s42.run_outputs.prefer_pair.low0.5.m6.4o-sc.json
# No dev/test files are set (null); do_eval and evaluate_during_training are also False below.
dev_file:
test_file:
# dtype handed to both model loaders through ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer comes from the same checkpoint as the model and pads on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Device-map helper referenced by `model` and `ref_model` via ${device_map}.
device_map:
_target_: models.utils.return_single_device_map
# Policy model loader. `ref_model` configures the reference model and is presumably passed
# as an argument of from_pretrained_with_ref_model - confirm against models.llama.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
device_map: ${device_map}
pad_token_id: 100001
# Chat-format pieces; `template.compositions` below stitches them around the prompt and the responses.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, read from a file.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset: a reader, a chain of aligners, then templates producing prompt/chosen/reject strings.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# chosen/reject share the same prefix as `prompt` and differ only in the {pos}/{neg} response.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# presumably renames record fields to the keys the collator reads - confirm against MultiMappingDataset.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 4096
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that the tokenizer and the reference model are loaded from
# (presumably also the starting point of the policy - confirm against the trainer).
model_name_or_path: ${sft_model_dir}/checkpoint-100/
pretrain:
resume: latest
# Parallelism layout; dp_size is left null here.
dp_size:
tp_size: 1
pp_size: 1
# NOTE(review): this file is named ...-H100-... but the experiment name says MI300x; confirm which is intended.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_4o_ps_test_case.sc.iter1.dpo.MI300x.dp16.v1.1.s${seed}
exp_notes:
# No "/" is inserted here because output_path_prefix already ends with one.
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas below is [ 0.9, 0.95 ], which differs from this
# value; confirm which of the two the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# presumably 0 means "derive from num_train_epochs / warmup_proportion" - confirm against the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed precision flags; with fp16_bfloat16 True and torch_dtype bfloat16 this presumably
# selects bf16 rather than fp16 - confirm against the trainer.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered on the preset chosen in `defaults`. Batch size, accumulation,
# lr and weight decay are interpolated from the top-level values above.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The explicit bf16/fp16 and optimizer-offload sections are disabled in this config.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Metrics writer; no batch or output keys are selected for logging here.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left null in the config; presumably filled in by the training script at runtime - confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter1/gpt4o-distil-combine-v1.1-a100-40-ps-test.yaml
================================================
# Hydra config for an iteration-1 DPO run of the deepseek-coder r2c model using the
# tensor-parallel model class (models.llama_tp) with tp_size 8.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
# DeepSpeed preset placed under `ds_cfg`; because `_self_` comes last, the `ds_cfg:` keys
# near the end of this file are merged on top of the preset.
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
# Storage roots. NOTE: data_path_prefix and output_path_prefix end with "/", so values written as
# ${prefix}/... (sft_model_dir, train_file) resolve to paths containing "//".
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Run directory of the previous stage; the starting checkpoint and the preference file both live under it.
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.V100.tp8dp16.v4.9.s42/
# Problem file; the pos/neg responses are joined in from `extra_file` by field_extract_aligner below.
train_file: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
# No dev/test files are set (null); do_eval and evaluate_during_training are also False below.
dev_file:
test_file:
# dtype handed to both model loaders through ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer comes from the same checkpoint as the model and pads on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Defined but not referenced: both `device_map: ${device_map}` lines below are commented out.
device_map:
_target_: models.utils.return_single_device_map
# Policy model loader. `ref_model` configures the reference model and is presumably passed
# as an argument of from_pretrained_with_ref_model - confirm against models.llama_tp.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
# device_map: ${device_map}
pad_token_id: 100001
# Chat-format pieces; `template.compositions` below stitches them around the prompt and the responses.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, read from a file.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset: a reader, a chain of aligners, then templates producing prompt/chosen/reject strings.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# First aligner: matches problem_id against `id` in extra_file and pulls in the pos/neg fields.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-instruct-apps-train/checkpoint-700/split-32/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min1.json
# Second aligner: presumably expands the pos/neg lists into one record per pair - confirm.
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# chosen/reject share the same prefix as `prompt` and differ only in the {pos}/{neg} response.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# presumably renames record fields to the keys the collator reads - confirm against MultiMappingDataset.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that the tokenizer and the reference model are loaded from
# (presumably also the starting point of the policy - confirm against the trainer).
model_name_or_path: ${sft_model_dir}/checkpoint-700
pretrain:
resume: latest
# Parallelism layout; dp_size is left null here. tp_size matches the "tp8" tag in exp_name.
dp_size:
tp_size: 8
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100-40.tp8dp8.v1.1.s${seed}
exp_notes:
# No "/" is inserted here because output_path_prefix already ends with one.
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas below is [ 0.9, 0.95 ], which differs from this
# value; confirm which of the two the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# presumably 0 means "derive from num_train_epochs / warmup_proportion" - confirm against the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed precision flags; with fp16_bfloat16 True and torch_dtype bfloat16 this presumably
# selects bf16 rather than fp16 - confirm against the trainer.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered on the preset chosen in `defaults`. Batch size, accumulation,
# lr and weight decay are interpolated from the top-level values above.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The explicit bf16/fp16 and optimizer-offload sections are disabled in this config.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Metrics writer; no batch or output keys are selected for logging here.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left null in the config; presumably filled in by the training script at runtime - confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter1/gpt4o-distil-combine-v1.1-v100-ps-test.yaml
================================================
# Hydra config for an iteration-1 DPO run of the deepseek-coder r2c model using the
# tensor-parallel model class (models.llama_tp) with tp_size 8, float16 weights and sdpa attention.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
# DeepSpeed preset placed under `ds_cfg`; because `_self_` comes last, the `ds_cfg:` keys
# near the end of this file are merged on top of the preset.
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
# Storage roots. NOTE: data_path_prefix and output_path_prefix end with "/", so values written as
# ${prefix}/... (sft_model_dir, train_file) resolve to paths containing "//".
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Run directory of the previous stage; the starting checkpoint and the preference file both live under it.
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.V100.tp8dp16.v4.9.s42/
# Problem file; the pos/neg responses are joined in from `extra_file` by field_extract_aligner below.
train_file: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
# No dev/test files are set (null); do_eval and evaluate_during_training are also False below.
dev_file:
test_file:
# dtype handed to both model loaders through ${torch_dtype}; float16 here, matching the
# fp16 section enabled under ds_cfg.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer comes from the same checkpoint as the model and pads on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Defined but not referenced: both `device_map: ${device_map}` lines below are commented out.
device_map:
_target_: models.utils.return_single_device_map
# Policy model loader. `ref_model` configures the reference model and is presumably passed
# as an argument of from_pretrained_with_ref_model - confirm against models.llama_tp.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
gradient_checkpointing: True
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
# device_map: ${device_map}
pad_token_id: 100001
# Chat-format pieces; `template.compositions` below stitches them around the prompt and the responses.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, read from a file.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset: a reader, a chain of aligners, then templates producing prompt/chosen/reject strings.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# First aligner: matches problem_id against `id` in extra_file and pulls in the pos/neg fields.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-instruct-apps-train/checkpoint-700/split-32/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min1.json
# Second aligner: presumably expands the pos/neg lists into one record per pair - confirm.
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# chosen/reject share the same prefix as `prompt` and differ only in the {pos}/{neg} response.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# presumably renames record fields to the keys the collator reads - confirm against MultiMappingDataset.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
# Reduced from the 3072 kept in the commented line below.
# max_seq_length: 3072
max_seq_length: 2560
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that the tokenizer and the reference model are loaded from
# (presumably also the starting point of the policy - confirm against the trainer).
model_name_or_path: ${sft_model_dir}/checkpoint-700
pretrain:
resume: latest
# Parallelism layout; dp_size is left null here. tp_size matches the "tp8" tag in exp_name.
dp_size:
tp_size: 8
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.V100.tp8dp32.v1.1.s${seed}
exp_notes:
# No "/" is inserted here because output_path_prefix already ends with one.
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 4
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas below is [ 0.9, 0.95 ], which differs from this
# value; confirm which of the two the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# presumably 0 means "derive from num_train_epochs / warmup_proportion" - confirm against the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed precision flags: fp16 on, bf16 variant off, consistent with torch_dtype float16 above.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered on the preset chosen in `defaults`. Batch size, accumulation,
# lr and weight decay are interpolated from the top-level values above.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
bf16:
enabled: False
# fp16 training with loss scaling. Per the DeepSpeed config reference, loss_scale 0 selects
# dynamic scaling and initial_scale_power 16 starts the scale at 2^16.
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Metrics writer; no batch or output keys are selected for logging here.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left null in the config; presumably filled in by the training script at runtime - confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter1/gpt4o-distil-combine-v1.2-H100-4o-ps-test.yaml
================================================
# Hydra config for an iteration-1 DPO run of the deepseek-coder r2c model with an additional
# SFT loss term (sft_loss / sft_loss_weight under `model`), trained for a single epoch.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
# DeepSpeed preset placed under `ds_cfg`; because `_self_` comes last, the `ds_cfg:` keys
# near the end of this file are merged on top of the preset.
- deepspeed@ds_cfg: train_hybrid_engine_zero1
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
# Storage roots. NOTE: data_path_prefix and output_path_prefix end with "/", so values written as
# ${prefix}/... (e.g. sft_model_dir below) resolve to paths containing "//".
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Run directory of the previous stage; the starting checkpoint and the training file both live under it.
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_4o_ps_test_case.dpo.H100.dp8.v1.0.s42/
# Preference-pair file; presumably it already carries the pos/neg fields, since only
# flat_aligner is applied in `read_tensor` below - confirm.
train_file: ${sft_model_dir}/oss-apps-xcode-combine-4o-ps-tests/checkpoint-100/train.0shot.tem1.0.n10.v2.1.s42.prefer_pair.low0.5.m6.4o-non-sc.json
# No dev/test files are set (null); do_eval and evaluate_during_training are also False below.
dev_file:
test_file:
# dtype handed to both model loaders through ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer comes from the same checkpoint as the model and pads on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Device-map helper referenced by `model` and `ref_model` via ${device_map}.
device_map:
_target_: models.utils.return_single_device_map
# Policy model loader. `ref_model` configures the reference model and is presumably passed
# as an argument of from_pretrained_with_ref_model - confirm against models.llama.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.4
# Enables an SFT loss term with weight 0.2; how it is combined with the DPO loss is
# defined in LlamaForCausalLMDPO - confirm there.
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
device_map: ${device_map}
pad_token_id: 100001
# Chat-format pieces; `template.compositions` below stitches them around the prompt and the responses.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, read from a file.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset: a reader, a chain of aligners, then templates producing prompt/chosen/reject strings.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# chosen/reject share the same prefix as `prompt` and differ only in the {pos}/{neg} response.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# presumably renames record fields to the keys the collator reads - confirm against MultiMappingDataset.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 4096
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that the tokenizer and the reference model are loaded from
# (presumably also the starting point of the policy - confirm against the trainer).
model_name_or_path: ${sft_model_dir}/checkpoint-100/
pretrain:
resume: latest
# Parallelism layout; dp_size is left null here.
dp_size:
tp_size: 1
pp_size: 1
# NOTE(review): this file is named ...-H100-... but the experiment name says MI300x; confirm which is intended.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_4o_ps_test_case.non_sc.iter1.dpo.MI300x.dp16.v1.2.s${seed}
exp_notes:
# No "/" is inserted here because output_path_prefix already ends with one.
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas below is [ 0.9, 0.95 ], which differs from this
# value; confirm which of the two the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
# presumably 0 means "derive from num_train_epochs / warmup_proportion" - confirm against the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed precision flags; with fp16_bfloat16 True and torch_dtype bfloat16 this presumably
# selects bf16 rather than fp16 - confirm against the trainer.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered on the preset chosen in `defaults`. Batch size, accumulation,
# lr and weight decay are interpolated from the top-level values above.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The explicit bf16/fp16 and optimizer-offload sections are disabled in this config.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Metrics writer; no batch or output keys are selected for logging here.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# Left null in the config; presumably filled in by the training script at runtime - confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter1/gpt4o-distil-combine-v1.2-a100-40-ps-test.yaml
================================================
# Hydra config for an iteration-1 DPO run of the deepseek-coder r2c model using the
# tensor-parallel model class (models.llama_tp) with tp_size 4 and an additional SFT loss term.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
# DeepSpeed preset placed under `ds_cfg`; because `_self_` comes last, the `ds_cfg:` keys
# near the end of this file are merged on top of the preset.
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
# Storage roots. NOTE: data_path_prefix and output_path_prefix end with "/", so values written as
# ${prefix}/... (sft_model_dir, train_file) resolve to paths containing "//".
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Run directory of the previous stage; the starting checkpoint and the preference file both live under it.
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.process-dpo.V100.tp8dp16.v4.9.s42/
# Problem file; the pos/neg responses are joined in from `extra_file` by field_extract_aligner below.
train_file: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
# No dev/test files are set (null); do_eval and evaluate_during_training are also False below.
dev_file:
test_file:
# dtype handed to both model loaders through ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer comes from the same checkpoint as the model and pads on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Defined but not referenced: both `device_map: ${device_map}` lines below are commented out.
device_map:
_target_: models.utils.return_single_device_map
# Policy model loader. `ref_model` configures the reference model and is presumably passed
# as an argument of from_pretrained_with_ref_model - confirm against models.llama_tp.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.4
# Enables an SFT loss term with weight 0.2; how it is combined with the DPO loss is
# defined in LlamaForCausalLMDPO - confirm there.
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
# device_map: ${device_map}
pad_token_id: 100001
# Chat-format pieces; `template.compositions` below stitches them around the prompt and the responses.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, read from a file.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset: a reader, a chain of aligners, then templates producing prompt/chosen/reject strings.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# First aligner: matches problem_id against `id` in extra_file and pulls in the pos/neg fields.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-instruct-apps-train/checkpoint-700/split-32/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min5.json
# Second aligner: presumably expands the pos/neg lists into one record per pair - confirm.
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# chosen/reject share the same prefix as `prompt` and differ only in the {pos}/{neg} response.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# presumably renames record fields to the keys the collator reads - confirm against MultiMappingDataset.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that the tokenizer and the reference model are loaded from
# (presumably also the starting point of the policy - confirm against the trainer).
model_name_or_path: ${sft_model_dir}/checkpoint-700
pretrain:
resume: latest
# Parallelism layout; dp_size is left null here. tp_size matches the "tp4" tag in the active exp_name.
dp_size:
tp_size: 4
pp_size: 1
# Previous tp8dp8 experiment name kept for reference; the active name reflects tp4dp16.
#exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp8dp8.v1.2.s${seed}
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp4dp16.v1.2.s${seed}
exp_notes:
# No "/" is inserted here because output_path_prefix already ends with one.
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 3e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas below is [ 0.9, 0.95 ], which differs from this
# value; confirm which of the two the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# presumably 0 means "derive from num_train_epochs / warmup_proportion" - confirm against the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed precision flags; with fp16_bfloat16 True and torch_dtype bfloat16 this presumably
# selects bf16 rather than fp16 - confirm against the trainer.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered on the preset chosen in `defaults`. Batch size, accumulation,
# lr and weight decay are interpolated from the top-level values above.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The explicit bf16/fp16 and optimizer-offload sections are disabled in this file
# (the selected preset name suggests offload is configured there - confirm).
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Metrics writer. Two reward values are selected under outputs_index_or_keys; the entries
# presumably read "<logged name>": <model output key> - confirm against WandbWriter.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left null in the config; presumably filled in by the training script at runtime - confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-dpo-n64sc-v1.0-A100-ps-test.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp4dp16.v1.2.s42/
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.4
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output_by_n64.v1.0.dpo_m6_low0.5_min5.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Wiki path pretrain v8.2
model_name_or_path: ${sft_model_dir}/checkpoint-800
pretrain:
resume: latest
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.xcode-apps-oss.n64sc.iter2.dpo.H100.dp16.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 2e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-dpo-n64sc-v1.1-A100-ps-test.yaml
================================================
# Hydra defaults list. The DeepSpeed preset is mounted under the `ds_cfg`
# package, and `_self_` is listed last so this file's own keys are composed
# after the presets.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots on the mounted blob store. Prefixes that end in "/" are joined
# to the next path segment without an extra separator.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Iter-1 DPO run whose checkpoint seeds this experiment. No leading or trailing
# "/" here: ${output_path_prefix} already ends with one and every consumer of
# ${sft_model_dir} appends its own, so resolved paths contain no "//".
sft_model_dir: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp4dp16.v1.2.s42
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
# No dev/test file is configured for this run.
dev_file:
test_file:
# Weight dtype, referenced by both `model` and `ref_model` below.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer is loaded from the same checkpoint path as the model; pads on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# DPO policy model, constructed together with the `ref_model` spec below.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
# presumably the DPO beta coefficient — confirm in models.llama
beta: 0.4
# An SFT loss term is enabled with weight 0.2; how it is combined with the
# DPO loss is defined in models.llama — confirm there.
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
# NOTE(review): hard-coded id; it must agree with the tokenizer's pad token — verify.
pad_token_id: 100001
device_map: ${device_map}
# Reference model: loaded from ${model_name_or_path} with the same dtype,
# attention backend, device map and pad id as the policy.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: 100001
# Chat-template fragments; they are stitched together by
# `template.compositions` below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Zero-shot task prompt, read from a text file.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset of chosen/rejected pairs.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Pulls the "pos"/"neg" fields from `extra_file`; presumably rows are joined
# on problem_id == id — confirm in data.input_aligner.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
# extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output_by_n64.v1.0.dpo_m6_low0.5_min5.json
# Active preference file: the `_p0.7` variant of the file commented out above.
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output_by_n64.v1.0.dpo_m6_low0.5_min5_p0.7.json
# NOTE(review): this aligner is given `extract_field` (singular) while the one
# above is given `extract_fields` — confirm both match the function signatures.
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# `chosen` / `reject` are the shared prompt followed by the pos / neg response
# and the chat suffix.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
# NOTE(review): whether over-length pairs are truncated or dropped is decided
# inside DPOCollator — confirm.
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Starting checkpoint: checkpoint-800 of the run in ${sft_model_dir}. Used by
# `tokenizer_init` and `ref_model` (and presumably by the policy model).
model_name_or_path: ${sft_model_dir}/checkpoint-800
pretrain:
resume: latest
# Parallel layout: dp_size is left unset; tensor and pipeline sizes are 1.
dp_size:
tp_size: 1
pp_size: 1
# Run name; `p0.7` matches the preference file selected in `extra_file`, and
# ${seed} is interpolated from the `seed` key.
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.xcode-apps-oss.n64sc.p0.7.iter2.dpo.A100.dp16.v1.1.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Train-only run: in-training evaluation and standalone eval are both off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# Per-GPU micro-batch of 1 with 8 accumulation steps; the global batch is
# 8 x (number of data-parallel ranks). exp_name suggests dp16 — confirm.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 2e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas is [ 0.9, 0.95 ]; the two
# settings disagree, so check which one the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# max_steps and warmup_steps are 0; presumably the schedule is then derived
# from num_train_epochs and warmup_proportion — confirm in the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed precision is on; fp16_bfloat16 presumably switches it to bf16, in line
# with torch_dtype (bfloat16) — confirm in the trainer.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
# presumably -1 marks the metric as lower-is-better — confirm
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides for the DeepSpeed preset selected in `defaults`; micro-batch size,
# accumulation steps, lr and weight decay are interpolated from the top-level keys.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# Disabled fp16 loss-scaling and CPU optimizer-offload overrides, kept for reference.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Maps model output keys to logging tags (presumably written to Weights & Biases).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left empty here; presumably filled in by the training script at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-dpo-n64sc-v1.2-A100-ps-test.yaml
================================================
# Hydra defaults list. The DeepSpeed preset is mounted under the `ds_cfg`
# package, and `_self_` is listed last so this file's own keys are composed
# after the presets.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots on the mounted blob store. Prefixes that end in "/" are joined
# to the next path segment without an extra separator.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Iter-1 DPO run whose checkpoint seeds this experiment. No leading or trailing
# "/" here: ${output_path_prefix} already ends with one and every consumer of
# ${sft_model_dir} appends its own, so resolved paths contain no "//".
sft_model_dir: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp4dp16.v1.2.s42
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
# No dev/test file is configured for this run.
dev_file:
test_file:
# Weight dtype, referenced by both `model` and `ref_model` below.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer is loaded from the same checkpoint path as the model; pads on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# DPO policy model, constructed together with the `ref_model` spec below.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
# presumably the DPO beta coefficient — confirm in models.llama
beta: 0.4
# An SFT loss term is enabled with weight 0.2; how it is combined with the
# DPO loss is defined in models.llama — confirm there.
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
# NOTE(review): hard-coded id; it must agree with the tokenizer's pad token — verify.
pad_token_id: 100001
device_map: ${device_map}
# Reference model: loaded from ${model_name_or_path} with the same dtype,
# attention backend, device map and pad id as the policy.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: 100001
# Chat-template fragments; they are stitched together by
# `template.compositions` below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Zero-shot task prompt, read from a text file.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset of chosen/rejected pairs.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Pulls the "pos"/"neg" fields from `extra_file`; presumably rows are joined
# on problem_id == id — confirm in data.input_aligner.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
# extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output_by_n64.v1.0.dpo_m6_low0.5_min5.json
# Active preference file: the `_p0.8` variant of the file commented out above.
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output_by_n64.v1.0.dpo_m6_low0.5_min5_p0.8.json
# NOTE(review): this aligner is given `extract_field` (singular) while the one
# above is given `extract_fields` — confirm both match the function signatures.
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# `chosen` / `reject` are the shared prompt followed by the pos / neg response
# and the chat suffix.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
# NOTE(review): whether over-length pairs are truncated or dropped is decided
# inside DPOCollator — confirm.
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Starting checkpoint: checkpoint-800 of the run in ${sft_model_dir}. Used by
# `tokenizer_init` and `ref_model` (and presumably by the policy model).
model_name_or_path: ${sft_model_dir}/checkpoint-800
pretrain:
resume: latest
# Parallel layout: dp_size is left unset; tensor and pipeline sizes are 1.
dp_size:
tp_size: 1
pp_size: 1
# Run name; `p0.8` matches the preference file selected in `extra_file`, and
# ${seed} is interpolated from the `seed` key.
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.xcode-apps-oss.n64sc.p0.8.iter2.dpo.A100.dp16.v1.2.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Train-only run: in-training evaluation and standalone eval are both off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# Per-GPU micro-batch of 1 with 8 accumulation steps; the global batch is
# 8 x (number of data-parallel ranks). exp_name suggests dp16 — confirm.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 2e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas is [ 0.9, 0.95 ]; the two
# settings disagree, so check which one the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# max_steps and warmup_steps are 0; presumably the schedule is then derived
# from num_train_epochs and warmup_proportion — confirm in the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed precision is on; fp16_bfloat16 presumably switches it to bf16, in line
# with torch_dtype (bfloat16) — confirm in the trainer.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
# presumably -1 marks the metric as lower-is-better — confirm
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides for the DeepSpeed preset selected in `defaults`; micro-batch size,
# accumulation steps, lr and weight decay are interpolated from the top-level keys.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# Disabled fp16 loss-scaling and CPU optimizer-offload overrides, kept for reference.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Maps model output keys to logging tags (presumably written to Weights & Biases).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left empty here; presumably filled in by the training script at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-dpo-n64sc-v1.2-V100-ps-test.yaml
================================================
# Hydra defaults list. The DeepSpeed preset (ZeRO-2 with optimizer offload, per
# its name) is mounted under the `ds_cfg` package, and `_self_` is listed last
# so this file's own keys are composed after the presets.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero2_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots on the mounted blob store. Prefixes that end in "/" are joined
# to the next path segment without an extra separator.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Iter-1 DPO run whose checkpoint seeds this experiment. No leading or trailing
# "/" here: ${output_path_prefix} already ends with one and every consumer of
# ${sft_model_dir} appends its own, so resolved paths contain no "//".
sft_model_dir: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp4dp16.v1.2.s42
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
# No dev/test file is configured for this run.
dev_file:
test_file:
# Weight dtype, referenced by both `model` and `ref_model` below. This V100
# variant uses float16 (presumably because V100 lacks bf16 support).
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is loaded from the same checkpoint path as the model; pads on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Defined but not referenced by `model` / `ref_model` in this file (their
# device_map lines are commented out).
device_map:
_target_: models.utils.return_single_device_map
# DPO policy model from the `llama_tp` module (presumably the tensor-parallel
# implementation, matching tp_size: 4), constructed together with `ref_model`.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
# presumably the DPO beta coefficient — confirm in models.llama_tp
beta: 0.4
# An SFT loss term is enabled with weight 0.2; how it is combined with the
# DPO loss is defined in models.llama_tp — confirm there.
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
# SDPA attention is selected here instead of FlashAttention-2 (presumably
# because FlashAttention-2 does not run on V100).
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
# NOTE(review): hard-coded id; it must agree with the tokenizer's pad token — verify.
pad_token_id: 100001
# device_map: ${device_map}
# Reference model: loaded from ${model_name_or_path} with the same dtype,
# attention backend and pad id as the policy.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
# device_map: ${device_map}
pad_token_id: 100001
# Chat-template fragments; they are stitched together by
# `template.compositions` below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Zero-shot task prompt, read from a text file.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset of chosen/rejected pairs.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Pulls the "pos"/"neg" fields from `extra_file`; presumably rows are joined
# on problem_id == id — confirm in data.input_aligner.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
# extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output_by_n64.v1.0.dpo_m6_low0.5_min5.json
# Active preference file: the `_p0.8` variant of the file commented out above.
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output_by_n64.v1.0.dpo_m6_low0.5_min5_p0.8.json
# NOTE(review): this aligner is given `extract_field` (singular) while the one
# above is given `extract_fields` — confirm both match the function signatures.
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# `chosen` / `reject` are the shared prompt followed by the pos / neg response
# and the chat suffix.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
# NOTE(review): whether over-length pairs are truncated or dropped is decided
# inside DPOCollator — confirm.
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Starting checkpoint: checkpoint-800 of the run in ${sft_model_dir}. Used by
# `tokenizer_init` and `ref_model` (and presumably by the policy model).
model_name_or_path: ${sft_model_dir}/checkpoint-800
pretrain:
resume: latest
# Parallel layout: tensor-parallel size 4, pipeline size 1; dp_size is left unset.
dp_size:
tp_size: 4
pp_size: 1
# Run name; `p0.8` matches the preference file selected in `extra_file`, and
# ${seed} is interpolated from the `seed` key.
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.xcode-apps-oss.n64sc.p0.8.iter2.dpo.V100.tp4dp64.v1.2.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Train-only run: in-training evaluation and standalone eval are both off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# Per-GPU micro-batch of 1 with 2 accumulation steps; the global batch is
# 2 x (number of data-parallel ranks). exp_name suggests dp64 — confirm.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 2e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 2
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas is [ 0.9, 0.95 ]; the two
# settings disagree, so check which one the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# max_steps and warmup_steps are 0; presumably the schedule is then derived
# from num_train_epochs and warmup_proportion — confirm in the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed precision is on with fp16_bfloat16 off, i.e. presumably plain fp16, in
# line with torch_dtype (float16) and ds_cfg.fp16 — confirm in the trainer.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
# presumably -1 marks the metric as lower-is-better — confirm
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides for the DeepSpeed preset selected in `defaults`; micro-batch size,
# accumulation steps, lr and weight decay are interpolated from the top-level keys.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# bf16 is explicitly disabled and fp16 enabled for this V100 run.
bf16:
enabled: False
# In DeepSpeed, loss_scale 0 selects dynamic loss scaling, and
# initial_scale_power 16 starts the scale at 2^16.
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Maps model output keys to logging tags (presumably written to Weights & Biases).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left empty here; presumably filled in by the training script at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-dpo-n64sc-v1.3-A100-ps-test.yaml
================================================
# Hydra defaults list. The DeepSpeed preset is mounted under the `ds_cfg`
# package, and `_self_` is listed last so this file's own keys are composed
# after the presets.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots on the mounted blob store. Prefixes that end in "/" are joined
# to the next path segment without an extra separator.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Iter-1 DPO run whose checkpoint seeds this experiment. No leading or trailing
# "/" here: ${output_path_prefix} already ends with one and every consumer of
# ${sft_model_dir} appends its own, so resolved paths contain no "//".
sft_model_dir: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp4dp16.v1.2.s42
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
# No dev/test file is configured for this run.
dev_file:
test_file:
# Weight dtype, referenced by both `model` and `ref_model` below.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer is loaded from the same checkpoint path as the model; pads on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# DPO policy model, constructed together with the `ref_model` spec below.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
# presumably the DPO beta coefficient — confirm in models.llama
beta: 0.4
# An SFT loss term is enabled with weight 0.2; how it is combined with the
# DPO loss is defined in models.llama — confirm there.
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
# NOTE(review): hard-coded id; it must agree with the tokenizer's pad token — verify.
pad_token_id: 100001
device_map: ${device_map}
# Reference model: loaded from ${model_name_or_path} with the same dtype,
# attention backend, device map and pad id as the policy.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: 100001
# Chat-template fragments; they are stitched together by
# `template.compositions` below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Zero-shot task prompt, read from a text file.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset of chosen/rejected pairs.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Pulls the "pos"/"neg" fields from `extra_file`; presumably rows are joined
# on problem_id == id — confirm in data.input_aligner.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
# extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output_by_n64.v1.0.dpo_m6_low0.5_min5.json
# Active preference file: the `_p0.6` variant of the file commented out above.
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output_by_n64.v1.0.dpo_m6_low0.5_min5_p0.6.json
# NOTE(review): this aligner is given `extract_field` (singular) while the one
# above is given `extract_fields` — confirm both match the function signatures.
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# `chosen` / `reject` are the shared prompt followed by the pos / neg response
# and the chat suffix.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
# NOTE(review): whether over-length pairs are truncated or dropped is decided
# inside DPOCollator — confirm.
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Starting checkpoint: checkpoint-800 of the run in ${sft_model_dir}. Used by
# `tokenizer_init` and `ref_model` (and presumably by the policy model).
model_name_or_path: ${sft_model_dir}/checkpoint-800
pretrain:
resume: latest
# Parallel layout: dp_size is left unset; tensor and pipeline sizes are 1.
dp_size:
tp_size: 1
pp_size: 1
# Run name; `p0.6` matches the preference file selected in `extra_file`, and
# ${seed} is interpolated from the `seed` key.
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.xcode-apps-oss.n64sc.p0.6.iter2.dpo.A100.dp16.v1.3.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Train-only run: in-training evaluation and standalone eval are both off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# Per-GPU micro-batch of 1 with 16 accumulation steps; the global batch is
# 16 x (number of data-parallel ranks). exp_name suggests dp16 — confirm.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas is [ 0.9, 0.95 ]; the two
# settings disagree, so check which one the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# max_steps and warmup_steps are 0; presumably the schedule is then derived
# from num_train_epochs and warmup_proportion — confirm in the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed precision is on; fp16_bfloat16 presumably switches it to bf16, in line
# with torch_dtype (bfloat16) — confirm in the trainer.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
# presumably -1 marks the metric as lower-is-better — confirm
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides for the DeepSpeed preset selected in `defaults`; micro-batch size,
# accumulation steps, lr and weight decay are interpolated from the top-level keys.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# Disabled fp16 loss-scaling and CPU optimizer-offload overrides, kept for reference.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Maps model output keys to logging tags (presumably written to Weights & Biases).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left empty here; presumably filled in by the training script at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-dpo-n64sc-v1.4-A100-ps-test.yaml
================================================
# Hydra defaults list. The DeepSpeed preset is mounted under the `ds_cfg`
# package, and `_self_` is listed last so this file's own keys are composed
# after the presets.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots on the mounted blob store. Prefixes that end in "/" are joined
# to the next path segment without an extra separator.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Iter-1 DPO run whose checkpoint seeds this experiment. No leading or trailing
# "/" here: ${output_path_prefix} already ends with one and every consumer of
# ${sft_model_dir} appends its own, so resolved paths contain no "//".
sft_model_dir: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp4dp16.v1.2.s42
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
# No dev/test file is configured for this run.
dev_file:
test_file:
# Weight dtype, referenced by both `model` and `ref_model` below.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer is loaded from the same checkpoint path as the model; pads on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# DPO policy model, constructed together with the `ref_model` spec below.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
# presumably the DPO beta coefficient — confirm in models.llama
beta: 0.4
# An SFT loss term is enabled with weight 0.2; how it is combined with the
# DPO loss is defined in models.llama — confirm there.
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
# NOTE(review): hard-coded id; it must agree with the tokenizer's pad token — verify.
pad_token_id: 100001
device_map: ${device_map}
# Reference model: loaded from ${model_name_or_path} with the same dtype,
# attention backend, device map and pad id as the policy.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: 100001
# Chat-template fragments; they are stitched together by
# `template.compositions` below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Zero-shot task prompt, read from a text file.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset of chosen/rejected pairs.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Pulls the "pos"/"neg" fields from `extra_file`; presumably rows are joined
# on problem_id == id — confirm in data.input_aligner.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
# extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output_by_n64.v1.0.dpo_m6_low0.5_min5.json
# Active preference file: the `_p0.5` variant of the file commented out above.
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output_by_n64.v1.0.dpo_m6_low0.5_min5_p0.5.json
# NOTE(review): this aligner is given `extract_field` (singular) while the one
# above is given `extract_fields` — confirm both match the function signatures.
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# `chosen` / `reject` are the shared prompt followed by the pos / neg response
# and the chat suffix.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
# NOTE(review): whether over-length pairs are truncated or dropped is decided
# inside DPOCollator — confirm.
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Starting checkpoint: checkpoint-800 of the run in ${sft_model_dir}. Used by
# `tokenizer_init` and `ref_model` (and presumably by the policy model).
model_name_or_path: ${sft_model_dir}/checkpoint-800
pretrain:
resume: latest
# Parallel layout: dp_size is left unset; tensor and pipeline sizes are 1.
dp_size:
tp_size: 1
pp_size: 1
# Run name; `p0.5` matches the preference file selected in `extra_file`, and
# ${seed} is interpolated from the `seed` key.
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.xcode-apps-oss.n64sc.p0.5.iter2.dpo.A100.dp16.v1.4.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Train-only run: in-training evaluation and standalone eval are both off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# Per-GPU micro-batch of 1 with 16 accumulation steps; the global batch is
# 16 x (number of data-parallel ranks). exp_name suggests dp16 — confirm.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas is [ 0.9, 0.95 ]; the two
# settings disagree, so check which one the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# max_steps and warmup_steps are 0; presumably the schedule is then derived
# from num_train_epochs and warmup_proportion — confirm in the trainer.
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed precision is on; fp16_bfloat16 presumably switches it to bf16, in line
# with torch_dtype (bfloat16) — confirm in the trainer.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
# presumably -1 marks the metric as lower-is-better — confirm
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides for the DeepSpeed preset selected in `defaults`; micro-batch size,
# accumulation steps, lr and weight decay are interpolated from the top-level keys.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# Disabled fp16 loss-scaling and CPU optimizer-offload overrides, kept for reference.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Maps model output keys to logging tags (presumably written to Weights & Biases).
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left empty here; presumably filled in by the training script at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-pdpo-v1.0-h100-ps-test.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp4dp16.v1.2.s42/
train_file: /mnt/fangkai_blob/share/xCodeEval/xcode_train_4o_test_inputs_v1.json
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.4
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Dataset referenced as `new_dataset_cfg` by `read_tensor`.
# NOTE(review): no file_path is given here (read_tensor_dpo sets one);
# presumably the caller supplies it, e.g. from train_file — TODO confirm.
read_tensor_pdpo:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligner chain (roles inferred from names/arguments — confirm in
# data.input_aligner): pull "pos"/"neg" from extra_file keyed by problem_id,
# then pair `pos` (anchor) with `neg`.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.prefix.upper0.8.r0.3.sample10_per.completion.tem1.0.n3.pseudo_input_output.prefix_pass_num.fix_low0.5_m4.0_avg.json
- _target_: data.input_aligner.dpo_paired_random_choice_aligner
anchor_field: pos
paired_field: neg
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# chosen/reject omit {chat_suffix} here although it is declared in units
# (read_tensor_dpo appends it). Presumably intentional for prefix (partial)
# completions — confirm.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# Presumably maps template/record fields to the names the collator reads — confirm.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Dataset referenced as `old_dataset_cfg` by `read_tensor`; unlike
# read_tensor_pdpo it names its own file_path.
read_tensor_dpo:
_target_: data.combine_dataset.MultiMappingDataset
file_path: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligner chain (roles inferred from names — confirm in data.input_aligner).
# NOTE(review): flat_aligner is given `extract_field` (singular) while
# field_extract_aligner is given `extract_fields`; verify both match the
# function signatures.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min5.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# Full responses: chosen/reject end with {chat_suffix}.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# Presumably maps template/record fields to the names the collator reads — confirm.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Training dataset: ReplayDataset over read_tensor_pdpo (new) and
# read_tensor_dpo (old). `_recursive_: False` stops Hydra from instantiating
# the nested dataset nodes, so they are handed over as configs.
# replay_ratio: presumably the share of old-dataset samples mixed in — confirm
# in ReplayDataset.
read_tensor:
_target_: data.combine_dataset.ReplayDataset
_recursive_: False
new_dataset_cfg: ${read_tensor_pdpo}
old_dataset_cfg: ${read_tensor_dpo}
replay_ratio: 0.5
dist_load_data_barrier: False
# Left empty (null).
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that tokenizer_init and ref_model load via ${model_name_or_path}.
model_name_or_path: ${sft_model_dir}/checkpoint-800
pretrain:
resume: latest
# dp_size is left empty (null); tp_size / pp_size are both 1.
dp_size:
tp_size: 1
pp_size: 1
# exp_name advertises dp8; with per_gpu_train_batch_size 1 and
# gradient_accumulation_steps 16 that would be 128 sequences per optimizer
# step, assuming 8 data-parallel ranks — TODO confirm.
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.H100.dp8.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 2e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
# max_steps / warmup_steps are 0 while num_train_epochs / warmup_proportion are
# set; presumably 0 means "derive from the latter" — confirm in the trainer.
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags; fp16_bfloat16: True presumably selects bf16 — confirm.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings; batch size, accumulation, lr and weight decay are tied to
# the top-level values through interpolation.
# NOTE(review): betas here are [0.9, 0.95] while top-level adam_betas is
# "(0.9, 0.98)"; which one takes effect depends on the trainer — verify.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The bf16 / fp16 overrides below are commented out, so no precision section is
# set from this part of the file.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Logging; outputs_index_or_keys presumably maps logged metric names to model
# output keys — confirm in WandbWriter.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# (left empty here; presumably filled in at runtime)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-pdpo-v1.1-h100-ps-test.yaml
================================================
# Hydra composition: base hydra settings plus a DeepSpeed preset loaded into
# the `ds_cfg` package. `_self_` is listed last so values in this file override
# the preset's.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds conf/ to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots.
# NOTE(review): output_path_prefix ends with '/', and sft_model_dir adds
# another '/', so the resolved path contains '//'.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp4dp16.v1.2.s42/
train_file: /mnt/fangkai_blob/share/xCodeEval/xcode_train_4o_test_inputs_v1.json
# dev_file / test_file are left empty (null).
dev_file:
test_file:
# Model loading. `torch_dtype`, `tokenizer_init` and `device_map` are Hydra
# `_target_` nodes referenced below through ${...} interpolation.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model built through from_pretrained_with_ref_model; the `ref_model`
# entry below is loaded from the same ${model_name_or_path} checkpoint.
# beta / sft_loss / sft_loss_weight: presumably the DPO temperature and an
# auxiliary SFT loss with its weight — confirm in LlamaForCausalLMDPO.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.4
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
# Reference model entry; pad_token_id is set to the same value as for the policy.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: 100001
# Chat template pieces referenced by the dataset templates through
# ${chat_prefix}, ${chat_connect} and ${chat_suffix}.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# `prompt` is produced by data.input_utils.read_text from the file below.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Dataset referenced as `new_dataset_cfg` by `read_tensor`.
# NOTE(review): no file_path is given here (read_tensor_dpo sets one);
# presumably the caller supplies it, e.g. from train_file — TODO confirm.
read_tensor_pdpo:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligner chain (roles inferred from names/arguments — confirm in
# data.input_aligner): pull "pos"/"neg" from extra_file keyed by problem_id,
# then pair `pos` (anchor) with `neg`.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.prefix.upper0.8.r0.3.sample10_per.completion.tem1.0.n3.pseudo_input_output.prefix_pass_num.fix_low0.6_m6.0_avg.json
- _target_: data.input_aligner.dpo_paired_random_choice_aligner
anchor_field: pos
paired_field: neg
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# chosen/reject omit {chat_suffix} here although it is declared in units
# (read_tensor_dpo appends it). Presumably intentional for prefix (partial)
# completions — confirm.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# Presumably maps template/record fields to the names the collator reads — confirm.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Dataset referenced as `old_dataset_cfg` by `read_tensor`; unlike
# read_tensor_pdpo it names its own file_path.
read_tensor_dpo:
_target_: data.combine_dataset.MultiMappingDataset
file_path: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligner chain (roles inferred from names — confirm in data.input_aligner).
# NOTE(review): flat_aligner is given `extract_field` (singular) while
# field_extract_aligner is given `extract_fields`; verify both match the
# function signatures.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min5.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# Full responses: chosen/reject end with {chat_suffix}.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# Presumably maps template/record fields to the names the collator reads — confirm.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Training dataset: ReplayDataset over read_tensor_pdpo (new) and
# read_tensor_dpo (old). `_recursive_: False` stops Hydra from instantiating
# the nested dataset nodes, so they are handed over as configs.
# replay_ratio: presumably the share of old-dataset samples mixed in — confirm
# in ReplayDataset.
read_tensor:
_target_: data.combine_dataset.ReplayDataset
_recursive_: False
new_dataset_cfg: ${read_tensor_pdpo}
old_dataset_cfg: ${read_tensor_dpo}
replay_ratio: 0.3
dist_load_data_barrier: False
# Left empty (null).
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that tokenizer_init and ref_model load via ${model_name_or_path}.
model_name_or_path: ${sft_model_dir}/checkpoint-800
pretrain:
resume: latest
# dp_size is left empty (null); tp_size / pp_size are both 1.
dp_size:
tp_size: 1
pp_size: 1
# exp_name advertises dp8; with per_gpu_train_batch_size 1 and
# gradient_accumulation_steps 16 that would be 128 sequences per optimizer
# step, assuming 8 data-parallel ranks — TODO confirm.
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.H100.dp8.v1.1.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 2e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# max_steps / warmup_steps are 0 while num_train_epochs / warmup_proportion are
# set; presumably 0 means "derive from the latter" — confirm in the trainer.
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags; fp16_bfloat16: True presumably selects bf16 — confirm.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered over the preset chosen in `defaults`; batch size,
# accumulation, lr and weight decay are tied to the top-level values.
# NOTE(review): betas here are [0.9, 0.95] while top-level adam_betas is
# "(0.9, 0.98)"; which one takes effect depends on the trainer — verify.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The bf16 / fp16 overrides below are commented out, so this file does not
# override the preset's precision settings.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Logging; outputs_index_or_keys presumably maps logged metric names to model
# output keys — confirm in WandbWriter.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# (left empty here; presumably filled in at runtime)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-pdpo-v1.1-v100-ps-test.yaml
================================================
# Hydra composition: base hydra settings plus a DeepSpeed preset loaded into
# the `ds_cfg` package. `_self_` is listed last so values in this file override
# the preset's.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero2_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds conf/ to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots.
# NOTE(review): output_path_prefix ends with '/', and sft_model_dir adds
# another '/', so the resolved path contains '//'.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp4dp16.v1.2.s42/
train_file: /mnt/fangkai_blob/share/xCodeEval/xcode_train_4o_test_inputs_v1.json
# dev_file / test_file are left empty (null).
dev_file:
test_file:
# Model loading for the V100 variant: float16, the models.llama_tp
# implementation and SDPA attention. The flash_attention_2 and device_map
# lines are left commented out, so `device_map` is defined but not referenced
# by the model entries here.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model built through from_pretrained_with_ref_model; the `ref_model`
# entry below is loaded from the same ${model_name_or_path} checkpoint.
# beta / sft_loss / sft_loss_weight: presumably the DPO temperature and an
# auxiliary SFT loss with its weight — confirm in LlamaForCausalLMDPO.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.4
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
# Reference model entry; pad_token_id is set to the same value as for the policy.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
# device_map: ${device_map}
pad_token_id: 100001
# Chat template pieces referenced by the dataset templates through
# ${chat_prefix}, ${chat_connect} and ${chat_suffix}.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# `prompt` is produced by data.input_utils.read_text from the file below.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Dataset referenced as `new_dataset_cfg` by `read_tensor`.
# NOTE(review): no file_path is given here (read_tensor_dpo sets one);
# presumably the caller supplies it, e.g. from train_file — TODO confirm.
read_tensor_pdpo:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligner chain (roles inferred from names/arguments — confirm in
# data.input_aligner): pull "pos"/"neg" from extra_file keyed by problem_id,
# then pair `pos` (anchor) with `neg`.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.prefix.upper0.8.r0.3.sample10_per.completion.tem1.0.n3.pseudo_input_output.prefix_pass_num.fix_low0.6_m6.0_avg.json
- _target_: data.input_aligner.dpo_paired_random_choice_aligner
anchor_field: pos
paired_field: neg
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# chosen/reject omit {chat_suffix} here although it is declared in units
# (read_tensor_dpo appends it). Presumably intentional for prefix (partial)
# completions — confirm.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# Presumably maps template/record fields to the names the collator reads — confirm.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Dataset referenced as `old_dataset_cfg` by `read_tensor`; unlike
# read_tensor_pdpo it names its own file_path.
read_tensor_dpo:
_target_: data.combine_dataset.MultiMappingDataset
file_path: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligner chain (roles inferred from names — confirm in data.input_aligner).
# NOTE(review): flat_aligner is given `extract_field` (singular) while
# field_extract_aligner is given `extract_fields`; verify both match the
# function signatures.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min5_rm_large_meta.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# Full responses: chosen/reject end with {chat_suffix}.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# Presumably maps template/record fields to the names the collator reads — confirm.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Training dataset: ReplayDataset over read_tensor_pdpo (new) and
# read_tensor_dpo (old). `_recursive_: False` stops Hydra from instantiating
# the nested dataset nodes, so they are handed over as configs.
# replay_ratio: presumably the share of old-dataset samples mixed in — confirm
# in ReplayDataset.
read_tensor:
_target_: data.combine_dataset.ReplayDataset
_recursive_: False
new_dataset_cfg: ${read_tensor_pdpo}
old_dataset_cfg: ${read_tensor_dpo}
replay_ratio: 0.3
dist_load_data_barrier: False
# Left empty (null).
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that tokenizer_init and ref_model load via ${model_name_or_path}.
model_name_or_path: ${sft_model_dir}/checkpoint-800
pretrain:
resume: latest
# dp_size is left empty (null); tp_size is 4 and pp_size is 1.
dp_size:
tp_size: 4
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.V100.tp4dp64.v1.1.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# global size 128
# (consistent with per_gpu_train_batch_size 1 x gradient_accumulation_steps 2 x
# the dp64 advertised in exp_name — assuming 64 data-parallel ranks; confirm)
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 2e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 2
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# max_steps / warmup_steps are 0 while num_train_epochs / warmup_proportion are
# set; presumably 0 means "derive from the latter" — confirm in the trainer.
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags; fp16_bfloat16: False presumably keeps true fp16 — confirm.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered over the preset chosen in `defaults`; batch size,
# accumulation, lr and weight decay are tied to the top-level values.
# NOTE(review): betas here are [0.9, 0.95] while top-level adam_betas is
# "(0.9, 0.98)"; which one takes effect depends on the trainer — verify.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# Precision: bf16 is disabled and fp16 enabled. Per the DeepSpeed config docs,
# loss_scale 0 selects dynamic loss scaling and initial_scale_power 16 starts
# the scale at 2^16.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Logging; outputs_index_or_keys presumably maps logged metric names to model
# output keys — confirm in WandbWriter.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# (left empty here; presumably filled in at runtime)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-pdpo-v1.2-v100-ps-test.yaml
================================================
# Hydra composition: base hydra settings plus a DeepSpeed preset loaded into
# the `ds_cfg` package. `_self_` is listed last so values in this file override
# the preset's.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero2_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds conf/ to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots.
# NOTE(review): output_path_prefix ends with '/', and sft_model_dir adds
# another '/', so the resolved path contains '//'.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp4dp16.v1.2.s42/
train_file: /mnt/fangkai_blob/share/xCodeEval/xcode_train_4o_test_inputs_v1.json
# dev_file / test_file are left empty (null).
dev_file:
test_file:
# Model loading for the V100 variant: float16, the models.llama_tp
# implementation and SDPA attention. The flash_attention_2 and device_map
# lines are left commented out, so `device_map` is defined but not referenced
# by the model entries here.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model built through from_pretrained_with_ref_model; the `ref_model`
# entry below is loaded from the same ${model_name_or_path} checkpoint.
# beta / sft_loss / sft_loss_weight: presumably the DPO temperature and an
# auxiliary SFT loss with its weight — confirm in LlamaForCausalLMDPO.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.4
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
# Reference model entry; pad_token_id is set to the same value as for the policy.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
# device_map: ${device_map}
pad_token_id: 100001
# Chat template pieces referenced by the dataset templates through
# ${chat_prefix}, ${chat_connect} and ${chat_suffix}.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# `prompt` is produced by data.input_utils.read_text from the file below.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Dataset referenced as `new_dataset_cfg` by `read_tensor`.
# NOTE(review): no file_path is given here (read_tensor_dpo sets one);
# presumably the caller supplies it, e.g. from train_file — TODO confirm.
read_tensor_pdpo:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligner chain (roles inferred from names/arguments — confirm in
# data.input_aligner): pull "pos"/"neg" from extra_file keyed by problem_id,
# then pair `pos` (anchor) with `neg`.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.prefix.upper0.8.r0.3.sample10_per.completion.tem1.0.n3.pseudo_input_output.prefix_pass_num.fix_low0.5_m4.0_avg.json
- _target_: data.input_aligner.dpo_paired_random_choice_aligner
anchor_field: pos
paired_field: neg
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# chosen/reject omit {chat_suffix} here although it is declared in units
# (read_tensor_dpo appends it). Presumably intentional for prefix (partial)
# completions — confirm.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# Presumably maps template/record fields to the names the collator reads — confirm.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Dataset referenced as `old_dataset_cfg` by `read_tensor`; unlike
# read_tensor_pdpo it names its own file_path.
read_tensor_dpo:
_target_: data.combine_dataset.MultiMappingDataset
file_path: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligner chain (roles inferred from names — confirm in data.input_aligner).
# NOTE(review): flat_aligner is given `extract_field` (singular) while
# field_extract_aligner is given `extract_fields`; verify both match the
# function signatures.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min5_rm_large_meta.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# Full responses: chosen/reject end with {chat_suffix}.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# Presumably maps template/record fields to the names the collator reads — confirm.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Training dataset: ReplayDataset over read_tensor_pdpo (new) and
# read_tensor_dpo (old). `_recursive_: False` stops Hydra from instantiating
# the nested dataset nodes, so they are handed over as configs.
# replay_ratio: presumably the share of old-dataset samples mixed in — confirm
# in ReplayDataset.
read_tensor:
_target_: data.combine_dataset.ReplayDataset
_recursive_: False
new_dataset_cfg: ${read_tensor_pdpo}
old_dataset_cfg: ${read_tensor_dpo}
replay_ratio: 0.5
dist_load_data_barrier: False
# Left empty (null).
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that tokenizer_init and ref_model load via ${model_name_or_path}.
model_name_or_path: ${sft_model_dir}/checkpoint-800
pretrain:
resume: latest
# dp_size is left empty (null); tp_size is 4 and pp_size is 1.
dp_size:
tp_size: 4
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.V100.tp4dp32.v1.2.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# global size 128
# (consistent with per_gpu_train_batch_size 1 x gradient_accumulation_steps 4 x
# the dp32 advertised in exp_name — assuming 32 data-parallel ranks; confirm)
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 4
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# max_steps / warmup_steps are 0 while num_train_epochs / warmup_proportion are
# set; presumably 0 means "derive from the latter" — confirm in the trainer.
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags; fp16_bfloat16: False presumably keeps true fp16 — confirm.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings layered over the preset chosen in `defaults`; batch size,
# accumulation, lr and weight decay are tied to the top-level values.
# NOTE(review): betas here are [0.9, 0.95] while top-level adam_betas is
# "(0.9, 0.98)"; which one takes effect depends on the trainer — verify.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# Precision: bf16 is disabled and fp16 enabled. Per the DeepSpeed config docs,
# loss_scale 0 selects dynamic loss scaling and initial_scale_power 16 starts
# the scale at 2^16.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Logging; outputs_index_or_keys presumably maps logged metric names to model
# output keys — confirm in WandbWriter.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# (left empty here; presumably filled in at runtime)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-pdpo-v1.3-h100-fix-ps-test.yaml
================================================
# Hydra composition: base hydra settings plus a DeepSpeed preset loaded into
# the `ds_cfg` package (the zero3 / zero2 alternatives are commented out).
# `_self_` is listed last so values in this file override the preset's.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
# - deepspeed@ds_cfg: train_hybrid_engine_zero2_optim_offload
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds conf/ to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Storage roots.
# NOTE(review): output_path_prefix ends with '/', and sft_model_dir adds
# another '/', so the resolved path contains '//'.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp4dp16.v1.2.s42/
# Same file that read_tensor_dpo.file_path points at in this config.
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
# dev_file / test_file are left empty (null).
dev_file:
test_file:
# Model loading. `torch_dtype`, `tokenizer_init` and `device_map` are Hydra
# `_target_` nodes referenced below through ${...} interpolation.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Policy model built through from_pretrained_with_ref_model; the `ref_model`
# entry below is loaded from the same ${model_name_or_path} checkpoint.
# beta / sft_loss / sft_loss_weight: presumably the DPO temperature and an
# auxiliary SFT loss with its weight — confirm in LlamaForCausalLMDPO.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
# Reference model entry; pad_token_id is set to the same value as for the policy.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: 100001
# Chat template pieces referenced by the dataset templates through
# ${chat_prefix}, ${chat_connect} and ${chat_suffix}.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# `prompt` is produced by data.input_utils.read_text from the file below.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Dataset referenced as `new_dataset_cfg` by `read_tensor`.
# NOTE(review): no file_path is given here (read_tensor_dpo sets one);
# presumably the caller supplies it, e.g. from train_file — TODO confirm.
read_tensor_pdpo:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligner chain (roles inferred from names/arguments — confirm in
# data.input_aligner): pull "pos"/"neg" from extra_file keyed by problem_id,
# then pair `pos` (anchor) with `neg`.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.prefix.upper0.8.r0.3.sample10_per.completion.tem1.0.n3.pseudo_input_output.prefix_pass_num.fix_low0.5_m4.0_avg.json
- _target_: data.input_aligner.dpo_paired_random_choice_aligner
anchor_field: pos
paired_field: neg
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# chosen/reject omit {chat_suffix} here although it is declared in units
# (read_tensor_dpo appends it). Presumably intentional for prefix (partial)
# completions — confirm.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# Presumably maps template/record fields to the names the collator reads — confirm.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Dataset referenced as `old_dataset_cfg` by `read_tensor`; its file_path is
# an absolute path identical to the top-level train_file in this config.
read_tensor_dpo:
_target_: data.combine_dataset.MultiMappingDataset
file_path: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligner chain (roles inferred from names — confirm in data.input_aligner).
# NOTE(review): flat_aligner is given `extract_field` (singular) while
# field_extract_aligner is given `extract_fields`; verify both match the
# function signatures.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min5_rm_large_meta.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
# Full responses: chosen/reject end with {chat_suffix}.
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# Presumably maps template/record fields to the names the collator reads — confirm.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Training dataset: ReplayDataset over read_tensor_pdpo (new) and
# read_tensor_dpo (old). `_recursive_: False` stops Hydra from instantiating
# the nested dataset nodes, so they are handed over as configs.
# replay_ratio: presumably the share of old-dataset samples mixed in — confirm
# in ReplayDataset.
read_tensor:
_target_: data.combine_dataset.ReplayDataset
_recursive_: False
new_dataset_cfg: ${read_tensor_pdpo}
old_dataset_cfg: ${read_tensor_dpo}
replay_ratio: 0.5
dist_load_data_barrier: False
# Left empty (null).
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint that tokenizer_init and ref_model load via ${model_name_or_path}.
model_name_or_path: ${sft_model_dir}/checkpoint-800
pretrain:
resume: latest
# dp_size is left empty (null); tp_size / pp_size are both 1.
dp_size:
tp_size: 1
pp_size: 1
# The commented-out name is the earlier V100 tp4dp64 label for this run.
#exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.V100.tp4dp64.v1.3.s${seed}
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.H100.dp16.v1.3.fix.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# global size 128
# (consistent with per_gpu_train_batch_size 1 x gradient_accumulation_steps 8 x
# the dp16 advertised in exp_name — assuming 16 data-parallel ranks; confirm)
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# Both warmup settings are zero in this variant.
max_steps: 0
warmup_proportion: 0.0
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags; fp16_bfloat16: True presumably selects bf16 — confirm.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
# NOTE(review): `measure: -1` presumably marks the `loss` metric as lower-is-better; confirm in the trainer.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation hooks: forward function and DPO-specific post-processor.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings. Micro batch size, accumulation steps, lr and weight decay
# are interpolated from the top-level keys of this file.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
# NOTE(review): betas here are [0.9, 0.95] while top-level `adam_betas` is "(0.9, 0.98)"; confirm which one is in effect.
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The bf16/fp16 sections are commented out in this file, so no precision
# settings are overridden in this block.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Metric logging through WandbWriter; the two entries pair a "train/..." log
# name with an output key.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables (null in the file; presumably filled in at runtime, to be confirmed)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-pdpo-v1.3-v100-fix-ps-test.yaml
================================================
# Hydra composition. `_self_` is listed last so that keys in this file take
# precedence over the `hydra/default` group and the DeepSpeed preset that is
# merged into `ds_cfg`.
defaults:
  - hydra: default
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  # - deepspeed@ds_cfg: train_hybrid_engine_zero2_optim_offload
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

# Look up config groups in the local conf/ directory.
hydra:
  searchpath:
    - file://conf/

# Storage roots on the mounted blob.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Iter-1 run that provides the starting checkpoint and the sampled pos/neg
# files. `output_path_prefix` already ends with "/", so it is joined the same
# way as `output_dir` (no extra "/"), and there is no trailing "/" so that
# `${sft_model_dir}/...` resolves without a double slash.
sft_model_dir: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp4dp16.v1.2.s42
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
dev_file:
test_file:
# dtype object shared by the policy and the reference model (float16 here).
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is loaded from the same path as the model checkpoint.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Kept for reuse; both `device_map` references below are commented out in this file.
device_map:
_target_: models.utils.return_single_device_map
# DPO policy from `models.llama_tp`, built together with a reference model.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
# An SFT loss term is enabled with weight 0.2.
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
# Reference model: loaded from the same `model_name_or_path`, with the same dtype and attention backend.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
# device_map: ${device_map}
pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Dataset config used as `new_dataset_cfg` by `read_tensor`. No `file_path` is set in this block.
read_tensor_pdpo:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligners are combined through `concat_aligner` (presumably applied in list order; confirm).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Take `pos`/`neg` from `extra_file`, matching `problem_id` on both sides.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.prefix.upper0.8.r0.3.sample10_per.completion.tem1.0.n3.pseudo_input_output.prefix_pass_num.fix_low0.5_m4.0_avg.json
# Pairs `pos` (anchor) with `neg`; going by the name the partner is picked at random (confirm).
- _target_: data.input_aligner.dpo_paired_random_choice_aligner
anchor_field: pos
paired_field: neg
# Prompt assembly. Unlike `read_tensor_dpo`, `chosen`/`reject` here do not
# append `chat_suffix`, although it is declared in `units`.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# NOTE(review): the direction of this mapping (source key -> output key or the reverse) is not visible here.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Dataset config used as `old_dataset_cfg` by `read_tensor`. Its `file_path`
# is the same file as the top-level `train_file`.
read_tensor_dpo:
_target_: data.combine_dataset.MultiMappingDataset
file_path: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Take `pos`/`neg` from `extra_file`; input rows are keyed by `problem_id`, extra rows by `id`.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min5_rm_large_meta.json
# NOTE(review): this aligner is given `extract_field` (singular) while
# field_extract_aligner takes `extract_fields`; confirm against the function signature.
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
# Prompt assembly: `chosen`/`reject` are full responses terminated by `chat_suffix`.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# NOTE(review): the direction of this mapping (source key -> output key or the reverse) is not visible here.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Training dataset: a ReplayDataset built from `read_tensor_pdpo` (new) and
# `read_tensor_dpo` (old).
read_tensor:
_target_: data.combine_dataset.ReplayDataset
# Hydra: nested `_target_` nodes are not instantiated, so the two dataset
# entries below reach ReplayDataset as configs.
_recursive_: False
new_dataset_cfg: ${read_tensor_pdpo}
old_dataset_cfg: ${read_tensor_dpo}
# NOTE(review): presumably the share of old-dataset samples mixed in; confirm in ReplayDataset.
replay_ratio: 0.5
dist_load_data_barrier: False
extended_vocab:
# Data collator: DPOCollator gets the tokenizer config and max_seq_length 3072.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Starting checkpoint: step 800 of the iter-1 run under `sft_model_dir`.
model_name_or_path: ${sft_model_dir}/checkpoint-800
pretrain: null
resume: latest

# Parallel layout (dp_size is left null in this file).
dp_size: null
tp_size: 8  # earlier value, left commented out: 4
pp_size: 1

# Run name and output location.
# earlier name (left commented out): deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.V100.tp4dp64.v1.3.s${seed}
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.V100.tp8dp32.v1.3.fix.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix: joined without an extra "/"

# Phases.
do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: "checkpoint-*"

# Training hyper-parameters.
# Global batch 128 = 1 (per GPU) x 4 (accumulation) x 32 (dp, per exp_name).
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
learning_rate: 1e-6  # commented-out alternatives: 1e-4, 2e-5
gradient_accumulation_steps: 4
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: '(0.9, 0.98)'  # commented-out alternative: "(0.9, 0.999)"
total_dataset_len: -1
max_grad_norm: 1.0  # commented-out alternative: 0.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.0
warmup_steps: 0

# Optimizer switches (all null here).
optimizer: null
use_nvlamb: null
bit_training: null

# Logging, checkpointing and evaluation cadence.
logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true

# Runtime and precision.
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
fp16_bfloat16: false
# Prediction config
# NOTE(review): `measure: -1` presumably marks the `loss` metric as lower-is-better; confirm in the trainer.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation hooks: forward function and DPO-specific post-processor.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides applied on top of the preset selected in `defaults`.
# Micro batch size, accumulation steps, lr and weight decay are interpolated
# from the top-level keys of this file.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
# NOTE(review): betas here are [0.9, 0.95] while top-level `adam_betas` is "(0.9, 0.98)"; confirm which one is in effect.
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# Mixed precision: bf16 off, fp16 on.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
# Per the DeepSpeed config docs, loss_scale 0 selects dynamic loss scaling,
# starting from 2^initial_scale_power.
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Metric logging through WandbWriter; the two entries pair a "train/..." log
# name with an output key.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables (null in the file; presumably filled in at runtime, to be confirmed)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-pdpo-v1.3-v100-ps-test.yaml
================================================
# Hydra composition. `_self_` is listed last so that keys in this file take
# precedence over the `hydra/default` group and the DeepSpeed preset that is
# merged into `ds_cfg`.
defaults:
  - hydra: default
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  # - deepspeed@ds_cfg: train_hybrid_engine_zero2_optim_offload
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

# Look up config groups in the local conf/ directory.
hydra:
  searchpath:
    - file://conf/

# Storage roots on the mounted blob.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Iter-1 run that provides the starting checkpoint and the sampled pos/neg
# files. `output_path_prefix` already ends with "/", so it is joined the same
# way as `output_dir` (no extra "/"), and there is no trailing "/" so that
# `${sft_model_dir}/...` resolves without a double slash.
sft_model_dir: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp4dp16.v1.2.s42
# NOTE(review): the sibling `...-v100-fix-ps-test.yaml` config points this at
# /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json, while the
# pos/neg files used below come from `oss-apps-xcode-combine`. Confirm that the
# xCodeEval-only inputs are intended here.
train_file: /mnt/fangkai_blob/share/xCodeEval/xcode_train_4o_test_inputs_v1.json
dev_file:
test_file:
# dtype object shared by the policy and the reference model (float16 here).
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is loaded from the same path as the model checkpoint.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Kept for reuse; both `device_map` references below are commented out in this file.
device_map:
_target_: models.utils.return_single_device_map
# DPO policy from `models.llama_tp`, built together with a reference model.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
# An SFT loss term is enabled with weight 0.2.
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map: ${device_map}
# Reference model: loaded from the same `model_name_or_path`, with the same dtype and attention backend.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
# device_map: ${device_map}
pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Dataset config used as `new_dataset_cfg` by `read_tensor`. No `file_path` is set in this block.
read_tensor_pdpo:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligners are combined through `concat_aligner` (presumably applied in list order; confirm).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Take `pos`/`neg` from `extra_file`, matching `problem_id` on both sides.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.prefix.upper0.8.r0.3.sample10_per.completion.tem1.0.n3.pseudo_input_output.prefix_pass_num.fix_low0.5_m4.0_avg.json
# Pairs `pos` (anchor) with `neg`; going by the name the partner is picked at random (confirm).
- _target_: data.input_aligner.dpo_paired_random_choice_aligner
anchor_field: pos
paired_field: neg
# Prompt assembly. Unlike `read_tensor_dpo`, `chosen`/`reject` here do not
# append `chat_suffix`, although it is declared in `units`.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# NOTE(review): the direction of this mapping (source key -> output key or the reverse) is not visible here.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Dataset config used as `old_dataset_cfg` by `read_tensor`.
read_tensor_dpo:
_target_: data.combine_dataset.MultiMappingDataset
# `data_path_prefix` already ends with "/", so this resolves with a double
# slash (".../dataset//magicoder/...").
file_path: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Take `pos`/`neg` from `extra_file`; input rows are keyed by `problem_id`, extra rows by `id`.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min5_rm_large_meta.json
# NOTE(review): this aligner is given `extract_field` (singular) while
# field_extract_aligner takes `extract_fields`; confirm against the function signature.
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
# Prompt assembly: `chosen`/`reject` are full responses terminated by `chat_suffix`.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# NOTE(review): the direction of this mapping (source key -> output key or the reverse) is not visible here.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
# Training dataset: a ReplayDataset built from `read_tensor_pdpo` (new) and
# `read_tensor_dpo` (old).
read_tensor:
_target_: data.combine_dataset.ReplayDataset
# Hydra: nested `_target_` nodes are not instantiated, so the two dataset
# entries below reach ReplayDataset as configs.
_recursive_: False
new_dataset_cfg: ${read_tensor_pdpo}
old_dataset_cfg: ${read_tensor_dpo}
# NOTE(review): presumably the share of old-dataset samples mixed in; confirm in ReplayDataset.
replay_ratio: 0.5
dist_load_data_barrier: False
extended_vocab:
# Data collator: DPOCollator gets the tokenizer config and max_seq_length 3072.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Starting checkpoint: step 800 of the iter-1 run under `sft_model_dir`.
model_name_or_path: ${sft_model_dir}/checkpoint-800
pretrain: null
resume: latest

# Parallel layout (dp_size is left null in this file).
dp_size: null
tp_size: 8  # earlier value, left commented out: 4
pp_size: 1

# Run name and output location.
# earlier name (left commented out): deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.V100.tp4dp64.v1.3.s${seed}
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.V100.tp8dp32.v1.3.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix: joined without an extra "/"

# Phases.
do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: "checkpoint-*"

# Training hyper-parameters.
# Global batch 128 = 1 (per GPU) x 4 (accumulation) x 32 (dp, per exp_name).
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
learning_rate: 1e-6  # commented-out alternatives: 1e-4, 2e-5
gradient_accumulation_steps: 4
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: '(0.9, 0.98)'  # commented-out alternative: "(0.9, 0.999)"
total_dataset_len: -1
max_grad_norm: 1.0  # commented-out alternative: 0.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.0
warmup_steps: 0

# Optimizer switches (all null here).
optimizer: null
use_nvlamb: null
bit_training: null

# Logging, checkpointing and evaluation cadence.
logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true

# Runtime and precision.
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
fp16_bfloat16: false
# Prediction config
# NOTE(review): `measure: -1` presumably marks the `loss` metric as lower-is-better; confirm in the trainer.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation hooks: forward function and DPO-specific post-processor.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides applied on top of the preset selected in `defaults`.
# Micro batch size, accumulation steps, lr and weight decay are interpolated
# from the top-level keys of this file.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
# NOTE(review): betas here are [0.9, 0.95] while top-level `adam_betas` is "(0.9, 0.98)"; confirm which one is in effect.
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# Mixed precision: bf16 off, fp16 on.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
# Per the DeepSpeed config docs, loss_scale 0 selects dynamic loss scaling,
# starting from 2^initial_scale_power.
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Metric logging through WandbWriter; the two entries pair a "train/..." log
# name with an output key.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables (null in the file; presumably filled in at runtime, to be confirmed)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-pdpo-v2.0-h100-ps-test.yaml
================================================
# Hydra composition. `_self_` is listed last so that keys in this file take
# precedence over the `hydra/default` group and the DeepSpeed preset that is
# merged into `ds_cfg`.
defaults:
  - hydra: default
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  # - deepspeed@ds_cfg: train_hybrid_engine_zero2_optim_offload
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

# Look up config groups in the local conf/ directory.
hydra:
  searchpath:
    - file://conf/

# Storage roots on the mounted blob.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Iter-1 pDPO run that provides the starting checkpoint and the sampled
# pos/neg files. `output_path_prefix` already ends with "/", so it is joined
# the same way as `output_dir` (no extra "/"), and there is no trailing "/" so
# that `${sft_model_dir}/...` resolves without a double slash.
sft_model_dir: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.pdpo.H100.dp8.v1.2.s42
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
dev_file:
test_file:
# dtype object shared by the policy and the reference model (bfloat16 here).
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer is loaded from the same path as the model checkpoint.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Device map passed to both the policy and the reference model below.
device_map:
_target_: models.utils.return_single_device_map
# DPO policy from `models.llama`, built together with a reference model.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
# An SFT loss term is enabled with weight 0.2.
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
# Reference model: loaded from the same `model_name_or_path`, with the same dtype and attention backend.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset. The replay mix (`read_tensor_dpo` + ReplayDataset) is
# commented out further down in this file, so this dataset is used directly.
#read_tensor_pdpo:
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligners are combined through `concat_aligner` (presumably applied in list order; confirm).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Take `pos`/`neg` from `extra_file`, matching `problem_id` on both sides.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-500/train.0shot.tem1.0.n64.v2.0.prefix.upper0.8.r0.3.sample32_per.completion.tem1.0.n3.pseudo_input_output.prefix_pass_num.fix_low0.5_m4.0_avg.json
# Pairs `pos` (anchor) with `neg`; going by the name the partner is picked at random (confirm).
- _target_: data.input_aligner.dpo_paired_random_choice_aligner
anchor_field: pos
paired_field: neg
# Prompt assembly. `chat_suffix` is declared in `units` but no composition uses it.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# NOTE(review): the direction of this mapping (source key -> output key or the reverse) is not visible here.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
#read_tensor_dpo:
# _target_: data.combine_dataset.MultiMappingDataset
# file_path: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
# read_fn:
# _target_: data.apps.PseudoInputsWithFunctionName
# use_starter_code: True
# train_sub_split: train
# aligner:
# _target_: data.input_aligner.concat_aligner
# aligners:
# - _target_: data.input_aligner.field_extract_aligner
# input_index_field: problem_id
# extract_index_field: id
# extract_fields: [ "pos", "neg" ]
# extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-500/train.0shot.tem1.0.n64.v2.0.pseudo_input_output.v1.0.cp.dpo_m4_low0.3_min5_p0.2.json
# - _target_: data.input_aligner.flat_aligner
# input_index_field: problem_id
# extract_field: [ "pos", "neg" ]
# mode: multi
# template:
# _target_: data.input_utils.recompose_template
# units:
# chat_prefix: ${chat_prefix}
# prompt: ${prompt}
# chat_connect: ${chat_connect}
# pos: "{pos}"
# neg: "{neg}"
# chat_suffix: ${chat_suffix}
# compositions:
# prompt: "{chat_prefix}{prompt}{chat_connect}"
# chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
# reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
# index_field: problem_id
# kv_mapping:
# chosen: chosen
# reject: reject
# problem_id: index
# prompt: prompt
#
#read_tensor:
# _target_: data.combine_dataset.ReplayDataset
# _recursive_: False
# new_dataset_cfg: ${read_tensor_pdpo}
# old_dataset_cfg: ${read_tensor_dpo}
# replay_ratio: 0.5
dist_load_data_barrier: False
extended_vocab:
# Data collator: DPOCollator gets the tokenizer config and max_seq_length 3072.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Starting checkpoint: step 500 of the iter-1 run under `sft_model_dir`.
model_name_or_path: ${sft_model_dir}/checkpoint-500/
pretrain: null
resume: latest

# Parallel layout (dp_size is left null in this file).
dp_size: null
tp_size: 1
pp_size: 1

# Run name and output location.
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.H100.dp16.v2.0.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix: joined without an extra "/"

# Phases.
do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: "checkpoint-*"

# Training hyper-parameters.
# Global batch 128 = 1 (per GPU) x 8 (accumulation) x 16 (dp, per exp_name).
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
learning_rate: 1e-6  # commented-out alternatives: 1e-4, 2e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: '(0.9, 0.98)'  # commented-out alternative: "(0.9, 0.999)"
total_dataset_len: -1
max_grad_norm: 1.0  # commented-out alternative: 0.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.0
warmup_steps: 0

# Optimizer switches (all null here).
optimizer: null
use_nvlamb: null
bit_training: null

# Logging, checkpointing and evaluation cadence.
logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true

# Runtime and precision.
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
# Prediction config
# NOTE(review): `measure: -1` presumably marks the `loss` metric as lower-is-better; confirm in the trainer.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation hooks: forward function and DPO-specific post-processor.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides applied on top of the preset selected in `defaults`.
# Micro batch size, accumulation steps, lr and weight decay are interpolated
# from the top-level keys of this file.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
# NOTE(review): betas here are [0.9, 0.95] while top-level `adam_betas` is "(0.9, 0.98)"; confirm which one is in effect.
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The bf16/fp16 sections are commented out in this file, so precision settings
# come from the preset rather than from this block.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Metric logging through WandbWriter; the two entries pair a "train/..." log
# name with an output key.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables (null in the file; presumably filled in at runtime, to be confirmed)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-pdpo-v2.1-h100-ps-test.yaml
================================================
# Hydra composition. `_self_` is listed last so that keys in this file take
# precedence over the `hydra/default` group and the DeepSpeed preset that is
# merged into `ds_cfg`.
defaults:
  - hydra: default
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  # - deepspeed@ds_cfg: train_hybrid_engine_zero2_optim_offload
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

# Look up config groups in the local conf/ directory.
hydra:
  searchpath:
    - file://conf/

# Storage roots on the mounted blob.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Iter-1 pDPO run that provides the starting checkpoint and the sampled
# pos/neg files. `output_path_prefix` already ends with "/", so it is joined
# the same way as `output_dir` (no extra "/"), and there is no trailing "/" so
# that `${sft_model_dir}/...` resolves without a double slash.
sft_model_dir: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.pdpo.H100.dp8.v1.2.s42
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
dev_file:
test_file:
# dtype object shared by the policy and the reference model (bfloat16 here).
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer is loaded from the same path as the model checkpoint.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Device map passed to both the policy and the reference model below.
device_map:
_target_: models.utils.return_single_device_map
# DPO policy from `models.llama`, built together with a reference model.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
# An SFT loss term is enabled with weight 0.2.
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
# Reference model: loaded from the same `model_name_or_path`, with the same dtype and attention backend.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset. The replay mix (`read_tensor_dpo` + ReplayDataset) is
# commented out further down in this file, so this dataset is used directly.
#read_tensor_pdpo:
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
# Aligners are combined through `concat_aligner` (presumably applied in list order; confirm).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Take `pos`/`neg` from `extra_file`, matching `problem_id` on both sides.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-500/train.0shot.tem1.0.n64.v2.0.prefix.upper0.8.r0.3.sample32_per.completion.tem1.0.n3.pseudo_input_output.prefix_pass_num.p0.2.fix_low0.5_m4.0_avg.json
# Pairs `pos` (anchor) with `neg`; going by the name the partner is picked at random (confirm).
- _target_: data.input_aligner.dpo_paired_random_choice_aligner
anchor_field: pos
paired_field: neg
# Prompt assembly. `chat_suffix` is declared in `units` but no composition uses it.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
index_field: problem_id
# NOTE(review): the direction of this mapping (source key -> output key or the reverse) is not visible here.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
#read_tensor_dpo:
# _target_: data.combine_dataset.MultiMappingDataset
# file_path: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
# read_fn:
# _target_: data.apps.PseudoInputsWithFunctionName
# use_starter_code: True
# train_sub_split: train
# aligner:
# _target_: data.input_aligner.concat_aligner
# aligners:
# - _target_: data.input_aligner.field_extract_aligner
# input_index_field: problem_id
# extract_index_field: id
# extract_fields: [ "pos", "neg" ]
# extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-500/train.0shot.tem1.0.n64.v2.0.pseudo_input_output.v1.0.cp.dpo_m4_low0.3_min5_p0.2.json
# - _target_: data.input_aligner.flat_aligner
# input_index_field: problem_id
# extract_field: [ "pos", "neg" ]
# mode: multi
# template:
# _target_: data.input_utils.recompose_template
# units:
# chat_prefix: ${chat_prefix}
# prompt: ${prompt}
# chat_connect: ${chat_connect}
# pos: "{pos}"
# neg: "{neg}"
# chat_suffix: ${chat_suffix}
# compositions:
# prompt: "{chat_prefix}{prompt}{chat_connect}"
# chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
# reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
# index_field: problem_id
# kv_mapping:
# chosen: chosen
# reject: reject
# problem_id: index
# prompt: prompt
#
#read_tensor:
# _target_: data.combine_dataset.ReplayDataset
# _recursive_: False
# new_dataset_cfg: ${read_tensor_pdpo}
# old_dataset_cfg: ${read_tensor_dpo}
# replay_ratio: 0.5
dist_load_data_barrier: False
extended_vocab:
# Data collator: DPOCollator gets the tokenizer config and max_seq_length 3072.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Starting checkpoint: step 500 of the iter-1 run under `sft_model_dir`.
model_name_or_path: ${sft_model_dir}/checkpoint-500/
pretrain: null
resume: latest

# Parallel layout (dp_size is left null in this file).
dp_size: null
tp_size: 1
pp_size: 1

# Run name and output location.
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.H100.dp16.v2.1.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix: joined without an extra "/"

# Phases.
do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: "checkpoint-*"

# Training hyper-parameters.
# Global batch 128 = 1 (per GPU) x 8 (accumulation) x 16 (dp, per exp_name).
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
learning_rate: 1e-6  # commented-out alternatives: 1e-4, 2e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: '(0.9, 0.98)'  # commented-out alternative: "(0.9, 0.999)"
total_dataset_len: -1
max_grad_norm: 1.0  # commented-out alternative: 0.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.0
warmup_steps: 0

# Optimizer switches (all null here).
optimizer: null
use_nvlamb: null
bit_training: null

# Logging, checkpointing and evaluation cadence.
logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true

# Runtime and precision.
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
# Prediction config
# NOTE(review): `measure: -1` presumably marks the `loss` metric as lower-is-better; confirm in the trainer.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation hooks: forward function and DPO-specific post-processor.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed overrides applied on top of the preset selected in `defaults`.
# Micro batch size, accumulation steps, lr and weight decay are interpolated
# from the top-level keys of this file.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
# NOTE(review): betas here are [0.9, 0.95] while top-level `adam_betas` is "(0.9, 0.98)"; confirm which one is in effect.
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The bf16/fp16 sections are commented out in this file, so precision settings
# come from the preset rather than from this block.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Metric logging through WandbWriter; the two entries pair a "train/..." log
# name with an output key.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables (null in the file; presumably filled in at runtime, to be confirmed)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-v1.0-H100-ps-test.yaml
================================================
# Hydra composition. `_self_` is listed last so that keys in this file take
# precedence over the `hydra/default` group and the DeepSpeed preset that is
# merged into `ds_cfg`.
defaults:
  - hydra: default
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  - deepspeed@ds_cfg: train_hybrid_engine_zero2
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

# Look up config groups in the local conf/ directory.
hydra:
  searchpath:
    - file://conf/

# Storage roots on the mounted blob.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Iter-1 run that provides the starting checkpoint and the sampled pos/neg
# files. `output_path_prefix` already ends with "/", so no extra "/" is
# inserted, and there is no trailing "/" so that `${sft_model_dir}/...`
# resolves without a double slash.
sft_model_dir: ${output_path_prefix}experiments/deepseek-coder-v1.5-ins.7b.apps.r2c.sft_ps_test_case.iter1.dpo.A100.tp4dp16.v1.2.s42
train_file: /mnt/fangkai_blob/share/xCodeEval/xcode_train_4o_test_inputs_v1.json
dev_file:
test_file:
# dtype object shared by the policy and the reference model (bfloat16 here).
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer is loaded from the same path as the model checkpoint.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Device map passed to both the policy and the reference model below.
device_map:
_target_: models.utils.return_single_device_map
# DPO policy from `models.llama`, built together with a reference model.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.4
# An SFT loss term is enabled with weight 0.2.
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
# Reference model: loaded from the same `model_name_or_path`, with the same dtype and attention backend.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# xCode DPO pairs, used as `new_dataset_cfg` by `read_tensor`. No `file_path` is set in this block.
read_tensor_dpo_xcode:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Take `pos`/`neg` from `extra_file`; input rows are keyed by `problem_id`, extra rows by `id`.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/xcode-train/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min5.json
# NOTE(review): this aligner is given `extract_field` (singular) while
# field_extract_aligner takes `extract_fields`; confirm against the function signature.
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
# Prompt assembly: `chosen`/`reject` are full responses terminated by `chat_suffix`.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
# NOTE(review): the direction of this mapping (source key -> output key or the reverse) is not visible here.
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
read_tensor_dpo_oss_apps_combine:
_target_: data.combine_dataset.MultiMappingDataset
file_path: ${data_path_prefix}/magicoder/oss-instruct-apps-train-pseudo-test-inputs.v1.0.json
read_fn:
_target_: data.apps.PseudoInputsWithFunctionName
use_starter_code: True
train_sub_split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: id
extract_fields: [ "pos", "neg" ]
extra_file: ${sft_model_dir}/oss-instruct-apps-train/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min5.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: [ "pos", "neg" ]
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
index_field: problem_id
kv_mapping:
chosen: chosen
reject: reject
problem_id: index
prompt: prompt
read_tensor:
_target_: data.combine_dataset.ReplayDataset
_recursive_: False
new_dataset_cfg: ${read_tensor_dpo_xcode}
old_dataset_cfg: ${read_tensor_dpo_oss_apps_combine}
replay_ratio: 1.0
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Checkpoint referenced by tokenizer_init.tokenizer_path and model.ref_model
model_name_or_path: ${sft_model_dir}/checkpoint-800
pretrain:
resume: latest
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.dpo.H100.dp8.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 2e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-v1.0-mi300x-hybrid.yaml
================================================
# Hydra experiment config: iteration-2 DPO run for a DeepSeek-Coder checkpoint
# (see `exp_name`). Objects are built from the `_target_` entries; top-level
# scalars are shared through `${...}` interpolation.
defaults:
  - hydra: default
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  - deepspeed@ds_cfg: train_hybrid_engine_zero1
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

hydra:
  searchpath:
    - file://conf/

# Storage roots.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# NOTE(review): `output_path_prefix` already ends with "/", so this expands to
# ".../reward_modeling//experiments/..."; `output_dir` below omits the extra
# slash. Left unchanged here - confirm before normalising.
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps-4o.mc-self.iter1.dpo.H100.dp32.v1.0.s42/

#train_file: ${sft_model_dir}/oss-apps-xcode-combine-4o-ps-tests/checkpoint-100/train.0shot.tem1.0.n10.v2.1.s42.prefer_pair.low0.5.m6.4o-non-sc.json
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
dev_file:
test_file:

# Shared dtype object, referenced by `model` and `model.ref_model`.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: float16

tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left

device_map:
  _target_: models.utils.return_single_device_map

# Policy model with DPO hyper-parameters; the reference model is nested below.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.4
  sft_loss: True
  sft_loss_weight: 0.2
  gradient_checkpointing: True
  attn_implementation: "flash_attention_2"
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: "flash_attention_2"
    device_map: ${device_map}
    pad_token_id: 100001

# Chat-format pieces that `read_tensor.template` concatenates around the prompt.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

# Prompt text is read from a file.
prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Training dataset: problems are joined with their "pos"/"neg" responses and
# rendered into prompt / chosen / reject strings.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.PseudoInputsWithFunctionNameFixStarterCode
    use_starter_code: True
    train_sub_split: train
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields: [ "pos", "neg" ]
        extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-300/train.0shot.tem1.0.n10.v2.1.apps.4o.dpo_m4_low0.5.mcoder-xcode.self.dpo_m6_low0.5_min5.json
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field: [ "pos", "neg" ]
        mode: multi
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: "{pos}"
      neg: "{neg}"
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: "{chat_prefix}{prompt}{chat_connect}"
      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
      reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt

dist_load_data_barrier: False

extended_vocab:

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 4096

# Dataloader
num_workers: 8
prefetch_factor: 2

# Checkpoint referenced by tokenizer_init.tokenizer_path and model.ref_model
model_name_or_path: ${sft_model_dir}/checkpoint-300/
pretrain:
resume: latest

# Parallelism sizes (dp_size is left unset).
dp_size:
tp_size: 1
pp_size: 1

exp_name: deepseek-coder-v1.5-ins.7b.apps-4o.mc-xcode-self.iter2.dpo.mi300x.dp32.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix

do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*

# Training hyper-parameters
per_gpu_train_batch_size: 4
per_gpu_eval_batch_size: 4
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 1
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0

# Optimizer
optimizer:
use_nvlamb:
bit_training:

logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False

# Prediction config
prediction_cfg:
  metric: "loss"
  measure: -1
  best_checkpoint:
  best_result:

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# Overrides merged on top of the `deepspeed@ds_cfg` entry from `defaults`.
# NOTE(review): betas here are [0.9, 0.95] while top-level `adam_betas` is
# "(0.9, 0.98)" - confirm which one the trainer actually applies.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas: [ 0.9, 0.95 ]
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1
  # zero_optimization:
  #   offload_optimizer:
  #     device: cpu
  #     pin_memory: True

# Logging: maps model output keys to logged metric names.
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys:
  outputs_index_or_keys:
    "train/chosen_reward": chosen_reward
    "train/rejected_reward": rejected_reward

# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-v1.1-mi300x-hybrid.yaml
================================================
# Hydra experiment config: iteration-2 DPO run for a DeepSeek-Coder checkpoint
# (see `exp_name`), trained for a single epoch. Objects are built from the
# `_target_` entries; top-level scalars are shared through `${...}` interpolation.
defaults:
  - hydra: default
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  - deepspeed@ds_cfg: train_hybrid_engine_zero1
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

hydra:
  searchpath:
    - file://conf/

# Storage roots.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# NOTE(review): `output_path_prefix` already ends with "/", so this expands to
# ".../reward_modeling//experiments/..."; `output_dir` below omits the extra
# slash. Left unchanged here - confirm before normalising.
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps-4o.mc-self.iter1.dpo.H100.dp32.v1.0.s42/

#train_file: ${sft_model_dir}/oss-apps-xcode-combine-4o-ps-tests/checkpoint-100/train.0shot.tem1.0.n10.v2.1.s42.prefer_pair.low0.5.m6.4o-non-sc.json
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
dev_file:
test_file:

# Shared dtype object, referenced by `model` and `model.ref_model`.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: float16

tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left

device_map:
  _target_: models.utils.return_single_device_map

# Policy model with DPO hyper-parameters; the reference model is nested below.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.4
  sft_loss: True
  sft_loss_weight: 0.2
  gradient_checkpointing: True
  attn_implementation: "flash_attention_2"
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: "flash_attention_2"
    device_map: ${device_map}
    pad_token_id: 100001

# Chat-format pieces that `read_tensor.template` concatenates around the prompt.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

# Prompt text is read from a file.
prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Training dataset: problems are joined with their "pos"/"neg" responses and
# rendered into prompt / chosen / reject strings.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.PseudoInputsWithFunctionNameFixStarterCode
    use_starter_code: True
    train_sub_split: train
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields: [ "pos", "neg" ]
        extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-300/train.0shot.tem1.0.n10.v2.1.apps.4o.dpo_m4_low0.5.mcoder-xcode.self.dpo_m6_low0.5_min5.json
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field: [ "pos", "neg" ]
        mode: multi
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: "{pos}"
      neg: "{neg}"
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: "{chat_prefix}{prompt}{chat_connect}"
      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
      reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt

dist_load_data_barrier: False

extended_vocab:

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 4096

# Dataloader
num_workers: 8
prefetch_factor: 2

# Checkpoint referenced by tokenizer_init.tokenizer_path and model.ref_model
model_name_or_path: ${sft_model_dir}/checkpoint-300/
pretrain:
resume: latest

# Parallelism sizes (dp_size is left unset).
dp_size:
tp_size: 1
pp_size: 1

exp_name: deepseek-coder-v1.5-ins.7b.apps-4o.mc-xcode-self.iter2.dpo.mi300x.dp32.v1.1.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix

do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*

# Training hyper-parameters
per_gpu_train_batch_size: 4
per_gpu_eval_batch_size: 4
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 1
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0

# Optimizer
optimizer:
use_nvlamb:
bit_training:

logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False

# Prediction config
prediction_cfg:
  metric: "loss"
  measure: -1
  best_checkpoint:
  best_result:

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# Overrides merged on top of the `deepspeed@ds_cfg` entry from `defaults`.
# NOTE(review): betas here are [0.9, 0.95] while top-level `adam_betas` is
# "(0.9, 0.98)" - confirm which one the trainer actually applies.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas: [ 0.9, 0.95 ]
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1
  # zero_optimization:
  #   offload_optimizer:
  #     device: cpu
  #     pin_memory: True

# Logging: maps model output keys to logged metric names.
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys:
  outputs_index_or_keys:
    "train/chosen_reward": chosen_reward
    "train/rejected_reward": rejected_reward

# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter2/gpt4o-distil-combine-v1.2-mi300x-hybrid.yaml
================================================
# Hydra experiment config: iteration-2 DPO run for a DeepSeek-Coder checkpoint
# (see `exp_name`), trained for a single epoch on the "*.all.json" preference
# file. Objects are built from the `_target_` entries; top-level scalars are
# shared through `${...}` interpolation.
defaults:
  - hydra: default
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  - deepspeed@ds_cfg: train_hybrid_engine_zero1
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

hydra:
  searchpath:
    - file://conf/

# Storage roots.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# NOTE(review): `output_path_prefix` already ends with "/", so this expands to
# ".../reward_modeling//experiments/..."; `output_dir` below omits the extra
# slash. Left unchanged here - confirm before normalising.
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.apps-4o.mc-self.iter1.dpo.H100.dp32.v1.0.s42/

#train_file: ${sft_model_dir}/oss-apps-xcode-combine-4o-ps-tests/checkpoint-100/train.0shot.tem1.0.n10.v2.1.s42.prefer_pair.low0.5.m6.4o-non-sc.json
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
dev_file:
test_file:

# Shared dtype object, referenced by `model` and `model.ref_model`.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: float16

tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left

device_map:
  _target_: models.utils.return_single_device_map

# Policy model with DPO hyper-parameters; the reference model is nested below.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.4
  sft_loss: True
  sft_loss_weight: 0.2
  gradient_checkpointing: True
  attn_implementation: "flash_attention_2"
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: "flash_attention_2"
    device_map: ${device_map}
    pad_token_id: 100001

# Chat-format pieces that `read_tensor.template` concatenates around the prompt.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

# Prompt text is read from a file.
prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Training dataset: problems are joined with their "pos"/"neg" responses and
# rendered into prompt / chosen / reject strings.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.PseudoInputsWithFunctionNameFixStarterCode
    use_starter_code: True
    train_sub_split: train
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields: [ "pos", "neg" ]
        extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-300/train.0shot.tem1.0.n10.v2.1.apps.4o.dpo_m4_low0.5.mcoder-xcode.self.dpo_m6_low0.5_min5.all.json
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field: [ "pos", "neg" ]
        mode: multi
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: "{pos}"
      neg: "{neg}"
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: "{chat_prefix}{prompt}{chat_connect}"
      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
      reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt

dist_load_data_barrier: False

extended_vocab:

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 4096

# Dataloader
num_workers: 8
prefetch_factor: 2

# Checkpoint referenced by tokenizer_init.tokenizer_path and model.ref_model
model_name_or_path: ${sft_model_dir}/checkpoint-300/
pretrain:
resume: latest

# Parallelism sizes (dp_size is left unset).
dp_size:
tp_size: 1
pp_size: 1

exp_name: deepseek-coder-v1.5-ins.7b.apps-4o.mc-xcode-self.iter2.dpo.mi300x.dp32.v1.2.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix

do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*

# Training hyper-parameters
per_gpu_train_batch_size: 4
per_gpu_eval_batch_size: 4
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 1
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0

# Optimizer
optimizer:
use_nvlamb:
bit_training:

logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False

# Prediction config
prediction_cfg:
  metric: "loss"
  measure: -1
  best_checkpoint:
  best_result:

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# Overrides merged on top of the `deepspeed@ds_cfg` entry from `defaults`.
# NOTE(review): betas here are [0.9, 0.95] while top-level `adam_betas` is
# "(0.9, 0.98)" - confirm which one the trainer actually applies.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas: [ 0.9, 0.95 ]
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1
  # zero_optimization:
  #   offload_optimizer:
  #     device: cpu
  #     pin_memory: True

# Logging: maps model output keys to logged metric names.
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys:
  outputs_index_or_keys:
    "train/chosen_reward": chosen_reward
    "train/rejected_reward": rejected_reward

# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter3/gpt4o-distil-combine-pdpo-v1.0-h100-ps-test.yaml
================================================
# Hydra experiment config: iteration-3 "pdpo" run for a DeepSeek-Coder
# checkpoint (see `exp_name`). Objects are built from the `_target_` entries;
# top-level scalars are shared through `${...}` interpolation.
defaults:
  - hydra: default
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  # - deepspeed@ds_cfg: train_hybrid_engine_zero2_optim_offload
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

hydra:
  searchpath:
    - file://conf/

# Storage roots.
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# NOTE(review): `output_path_prefix` already ends with "/", so this expands to
# ".../reward_modeling//experiments/..."; `output_dir` below omits the extra
# slash. Left unchanged here - confirm before normalising.
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.V100.tp8dp32.v1.3.s42/

train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
dev_file:
test_file:

# Shared dtype object, referenced by `model` and `model.ref_model`.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16

tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left

device_map:
  _target_: models.utils.return_single_device_map

# Policy model with DPO hyper-parameters; the reference model is nested below.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.5
  sft_loss: True
  sft_loss_weight: 0.2
  gradient_checkpointing: True
  attn_implementation: "flash_attention_2"
  # attn_implementation: "sdpa"
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: "flash_attention_2"
    # attn_implementation: "sdpa"
    device_map: ${device_map}
    pad_token_id: 100001

# Chat-format pieces that `read_tensor.template` concatenates around the prompt.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"

# Prompt text is read from a file.
prompt:
  _target_: data.input_utils.read_text
  file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt

# Training dataset: problems are joined with their "pos"/"neg" responses and
# rendered into prompt / chosen / reject strings.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.PseudoInputsWithFunctionName
    use_starter_code: True
    train_sub_split: train
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields: [ "pos", "neg" ]
        extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-300/train.0shot.tem1.0.n64.v2.0.prefix.upper0.8.r0.3.sample10_per.completion.tem1.0.n3.pseudo_input_output.prefix_pass_num.fix_low0.5_m4.0_avg.json
      - _target_: data.input_aligner.dpo_paired_random_choice_aligner
        anchor_field: pos
        paired_field: neg
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: "{pos}"
      neg: "{neg}"
      chat_suffix: ${chat_suffix}
    # `chat_suffix` is declared in `units` but not appended to chosen/reject.
    # NOTE(review): presumably intentional because the responses are prefixes
    # (see "prefix" in `extra_file`) - confirm.
    compositions:
      prompt: "{chat_prefix}{prompt}{chat_connect}"
      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}"
      reject: "{chat_prefix}{prompt}{chat_connect}{neg}"
  index_field: problem_id
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt

# Disabled alternative: replay a plain DPO dataset alongside the pdpo one.
# NOTE(review): the disabled `read_tensor` refers to `${read_tensor_pdpo}`,
# which is not defined in this file; the active dataset key would need renaming.
#read_tensor_dpo:
#  _target_: data.combine_dataset.MultiMappingDataset
#  file_path: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
#  read_fn:
#    _target_: data.apps.PseudoInputsWithFunctionName
#    use_starter_code: True
#    train_sub_split: train
#  aligner:
#    _target_: data.input_aligner.concat_aligner
#    aligners:
#      - _target_: data.input_aligner.field_extract_aligner
#        input_index_field: problem_id
#        extract_index_field: id
#        extract_fields: [ "pos", "neg" ]
#        extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min5_rm_large_meta.json
#      - _target_: data.input_aligner.flat_aligner
#        input_index_field: problem_id
#        extract_field: [ "pos", "neg" ]
#        mode: multi
#  template:
#    _target_: data.input_utils.recompose_template
#    units:
#      chat_prefix: ${chat_prefix}
#      prompt: ${prompt}
#      chat_connect: ${chat_connect}
#      pos: "{pos}"
#      neg: "{neg}"
#      chat_suffix: ${chat_suffix}
#    compositions:
#      prompt: "{chat_prefix}{prompt}{chat_connect}"
#      chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
#      reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
#  index_field: problem_id
#  kv_mapping:
#    chosen: chosen
#    reject: reject
#    problem_id: index
#    prompt: prompt

#read_tensor:
#  _target_: data.combine_dataset.ReplayDataset
#  _recursive_: False
#  new_dataset_cfg: ${read_tensor_pdpo}
#  old_dataset_cfg: ${read_tensor_dpo}
#  replay_ratio: 0.5

dist_load_data_barrier: False

extended_vocab:

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 3072

# Dataloader
num_workers: 8
prefetch_factor: 2

# Checkpoint referenced by tokenizer_init.tokenizer_path and model.ref_model
model_name_or_path: ${sft_model_dir}/checkpoint-300
pretrain:
resume: latest

# Parallelism sizes (dp_size is left unset).
dp_size:
tp_size: 1
pp_size: 1

#exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.V100.tp4dp64.v1.3.s${seed}
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter3.pdpo.H100.dp16.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix

do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*

# global size 128
# NOTE(review): per_gpu_train_batch_size (1) x gradient_accumulation_steps (16)
# gives 128 only with 8 data-parallel ranks, while `exp_name` says dp16 - confirm.
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 5
max_steps: 0
warmup_proportion: 0.0
warmup_steps: 0

# Optimizer
optimizer:
use_nvlamb:
bit_training:

logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True

# Prediction config
prediction_cfg:
  metric: "loss"
  measure: -1
  best_checkpoint:
  best_result:

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# Overrides merged on top of the `deepspeed@ds_cfg` entry from `defaults`.
# NOTE(review): betas here are [0.9, 0.95] while top-level `adam_betas` is
# "(0.9, 0.98)" - confirm which one the trainer actually applies.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas: [ 0.9, 0.95 ]
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1

# Logging: maps model output keys to logged metric names.
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys:
  outputs_index_or_keys:
    "train/chosen_reward": chosen_reward
    "train/rejected_reward": rejected_reward

# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter3/gpt4o-distil-combine-pdpo-v1.1-h100-ps-test.yaml
================================================
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
# - deepspeed@ds_cfg: train_hybrid_engine_zero2_optim_offload
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.V100.tp8dp32.v1.3.s42/
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map: ${device_map}
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition: a reader, a chain of aligners that attach the
# "pos"/"neg" fields from `extra_file`, and a template that composes the
# prompt / chosen / reject strings.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.PseudoInputsWithFunctionName
    use_starter_code: true
    train_sub_split: train
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields: ["pos", "neg"]
        extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-300/train.0shot.tem1.0.n64.v2.0.prefix.upper0.8.r0.3.sample10_per.completion.tem1.0.n3.pseudo_input_output.prefix_pass_num.fix_low0.3_m3.0_avg.json
      # - _target_: data.input_aligner.dpo_paired_random_choice_aligner
      #   anchor_field: pos
      #   paired_field: neg
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field: ["pos", "neg"]
        mode: multi
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field: ["neg"]
        mode: multi
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    # NOTE(review): `chat_suffix` is declared in `units` but not used by any
    # composition below - confirm that omitting it is intentional.
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}'
  index_field: problem_id
  # Field renaming; presumably source-name -> output-name, verify against
  # MultiMappingDataset.
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
#read_tensor_dpo:
# _target_: data.combine_dataset.MultiMappingDataset
# file_path: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
# read_fn:
# _target_: data.apps.PseudoInputsWithFunctionName
# use_starter_code: True
# train_sub_split: train
# aligner:
# _target_: data.input_aligner.concat_aligner
# aligners:
# - _target_: data.input_aligner.field_extract_aligner
# input_index_field: problem_id
# extract_index_field: id
# extract_fields: [ "pos", "neg" ]
# extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min5_rm_large_meta.json
# - _target_: data.input_aligner.flat_aligner
# input_index_field: problem_id
# extract_field: [ "pos", "neg" ]
# mode: multi
# template:
# _target_: data.input_utils.recompose_template
# units:
# chat_prefix: ${chat_prefix}
# prompt: ${prompt}
# chat_connect: ${chat_connect}
# pos: "{pos}"
# neg: "{neg}"
# chat_suffix: ${chat_suffix}
# compositions:
# prompt: "{chat_prefix}{prompt}{chat_connect}"
# chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
# reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
# index_field: problem_id
# kv_mapping:
# chosen: chosen
# reject: reject
# problem_id: index
# prompt: prompt
#read_tensor:
# _target_: data.combine_dataset.ReplayDataset
# _recursive_: False
# new_dataset_cfg: ${read_tensor_pdpo}
# old_dataset_cfg: ${read_tensor_dpo}
# replay_ratio: 0.5
dist_load_data_barrier: False
extended_vocab:
# Batch collator; reuses the tokenizer built by `tokenizer_init`.
# `max_seq_length` is presumably the truncation limit in tokens - TODO confirm.
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Pretrained model / checkpoint path
model_name_or_path: ${sft_model_dir}/checkpoint-300
pretrain:
resume: latest
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter3.pdpo.H100.dp16.v1.1.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# global size 128
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-7
#learning_rate: 2e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.0
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings; batch size, accumulation steps, learning rate and weight
# decay are interpolated from this file's top-level hyper-parameters.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level `adam_betas` "(0.9, 0.98)" -
      # confirm which value the trainer actually applies.
      betas: [0.9, 0.95]
      weight_decay: ${weight_decay}
  steps_per_print: 1
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Metrics writer. `outputs_index_or_keys` pairs a logged metric name with a
# model-output key (assumed direction: log name -> output key; TODO confirm).
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    'train/chosen_reward': chosen_reward
    'train/rejected_reward': rejected_reward
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/dpo/iter3/gpt4o-distil-combine-pdpo-v1.2-h100-ps-test.yaml
================================================
# Hydra composition: the shared `hydra` defaults plus a DeepSpeed preset
# mounted at `ds_cfg`. `_self_` comes last so keys in this file override them.
defaults:
  - hydra: default
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  # - deepspeed@ds_cfg: train_hybrid_engine_zero2_optim_offload
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

# Extra config search path so the `deepspeed` group resolves from `conf/`.
hydra:
  searchpath:
    - file://conf/
data_path_prefix: /mnt/fangkai_blob/share/dataset/
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
sft_model_dir: ${output_path_prefix}/experiments/deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.V100.tp8dp32.v1.3.s42/
train_file: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
device_map:
_target_: models.utils.return_single_device_map
# Trainable model, built by the `_target_` factory below. The factory also
# receives the nested `ref_model` config (a DPO setup, per the class name).
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.5
  sft_loss: true
  sft_loss_weight: 0.2
  gradient_checkpointing: true
  attn_implementation: flash_attention_2
  # attn_implementation: "sdpa"
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map: ${device_map}
  # Reference model; loaded from `model_name_or_path`.
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: flash_attention_2
    # attn_implementation: "sdpa"
    device_map: ${device_map}
    pad_token_id: 100001
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition: a reader, a chain of aligners that attach the
# "pos"/"neg" fields from `extra_file`, and a template that composes the
# prompt / chosen / reject strings.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.PseudoInputsWithFunctionName
    use_starter_code: true
    train_sub_split: train
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields: ["pos", "neg"]
        extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-300/train.0shot.tem1.0.n64.v2.0.prefix.upper0.8.r0.3.sample10_per.completion.tem1.0.n3.pseudo_input_output.prefix_pass_num.fix_low0.5_m4.0_avg.json
      # - _target_: data.input_aligner.dpo_paired_random_choice_aligner
      #   anchor_field: pos
      #   paired_field: neg
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field: ["pos", "neg"]
        mode: multi
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field: ["neg"]
        mode: multi
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      pos: '{pos}'
      neg: '{neg}'
      chat_suffix: ${chat_suffix}
    # NOTE(review): `chat_suffix` is declared in `units` but not used by any
    # composition below - confirm that omitting it is intentional.
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{pos}'
      reject: '{chat_prefix}{prompt}{chat_connect}{neg}'
  index_field: problem_id
  # Field renaming; presumably source-name -> output-name, verify against
  # MultiMappingDataset.
  kv_mapping:
    chosen: chosen
    reject: reject
    problem_id: index
    prompt: prompt
#read_tensor_dpo:
# _target_: data.combine_dataset.MultiMappingDataset
# file_path: /mnt/fangkai_blob/share/xcode_4o_oss_apps_test_inputs_v1.json
# read_fn:
# _target_: data.apps.PseudoInputsWithFunctionName
# use_starter_code: True
# train_sub_split: train
# aligner:
# _target_: data.input_aligner.concat_aligner
# aligners:
# - _target_: data.input_aligner.field_extract_aligner
# input_index_field: problem_id
# extract_index_field: id
# extract_fields: [ "pos", "neg" ]
# extra_file: ${sft_model_dir}/oss-apps-xcode-combine/checkpoint-800/train.0shot.tem1.0.n10.v2.0.pseudo_input_output.v1.0.dpo_m6_low0.5_min5_rm_large_meta.json
# - _target_: data.input_aligner.flat_aligner
# input_index_field: problem_id
# extract_field: [ "pos", "neg" ]
# mode: multi
# template:
# _target_: data.input_utils.recompose_template
# units:
# chat_prefix: ${chat_prefix}
# prompt: ${prompt}
# chat_connect: ${chat_connect}
# pos: "{pos}"
# neg: "{neg}"
# chat_suffix: ${chat_suffix}
# compositions:
# prompt: "{chat_prefix}{prompt}{chat_connect}"
# chosen: "{chat_prefix}{prompt}{chat_connect}{pos}{chat_suffix}"
# reject: "{chat_prefix}{prompt}{chat_connect}{neg}{chat_suffix}"
# index_field: problem_id
# kv_mapping:
# chosen: chosen
# reject: reject
# problem_id: index
# prompt: prompt
#read_tensor:
# _target_: data.combine_dataset.ReplayDataset
# _recursive_: False
# new_dataset_cfg: ${read_tensor_pdpo}
# old_dataset_cfg: ${read_tensor_dpo}
# replay_ratio: 0.5
dist_load_data_barrier: False
extended_vocab:
# Batch collator; reuses the tokenizer built by `tokenizer_init`.
# `max_seq_length` is presumably the truncation limit in tokens - TODO confirm.
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Pretrained model / checkpoint path
model_name_or_path: ${sft_model_dir}/checkpoint-300
pretrain:
resume: latest
dp_size:
tp_size: 1
pp_size: 1
#exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter2.pdpo.V100.tp4dp64.v1.3.s${seed}
exp_name: deepseek-coder-v1.5-ins.7b.r2c.sft_ps_test_case.iter3.pdpo.H100.dp16.v1.2.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# global size 128
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-7
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.0
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides layered on the `deepspeed@ds_cfg` preset chosen in `defaults`;
# values are interpolated from this file's top-level hyper-parameters.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level `adam_betas` "(0.9, 0.98)" -
      # confirm which value the trainer actually applies.
      betas: [0.9, 0.95]
      weight_decay: ${weight_decay}
  steps_per_print: 1
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Metrics writer. `outputs_index_or_keys` pairs a logged metric name with a
# model-output key (assumed direction: log name -> output key; TODO confirm).
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    'train/chosen_reward': chosen_reward
    'train/rejected_reward': rejected_reward
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/deprecated/gpt4o-distil-v1.0-v100.yaml
================================================
# Hydra composition: the shared `hydra` defaults plus a DeepSpeed preset
# mounted at `ds_cfg`. `_self_` comes last so keys in this file override them.
defaults:
  - hydra: default
  - deepspeed@ds_cfg: train_hybrid_engine_zero3
  # - deepspeed@ds_cfg: train_hybrid_engine_zero2
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

# Extra config search path so the `deepspeed` group resolves from `conf/`.
hydra:
  searchpath:
    - file://conf/
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
train_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.s42.json
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Causal LM to fine-tune, built by the `_target_` factory. `device_map` is
# itself produced by a helper factory.
model:
  _target_: models.llama.LlamaForCausalLM.from_pretrained
  gradient_checkpointing: true
  attn_implementation: eager
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map:
    _target_: models.utils.return_single_device_map
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# SFT dataset definition: a JSONL reader plus a template that composes the
# `prompt` and `chosen` strings from the chat pieces and each record's
# `completion` field.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.input_utils.jsonl_read_fn
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      suffix: '{completion}'
    # NOTE(review): the top-level `chat_suffix` is not appended to `chosen`
    # here - confirm the target is meant to end without it.
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{suffix}'
      # chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
      # reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
      # prompt: ${chat_prefix}${prompt}${chat_connect}
      # chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
  index_field: problem_id
  # Field renaming; presumably source-name -> output-name, verify against
  # MultiMappingDataset.
  kv_mapping:
    chosen: chosen
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Batch collator; reuses the tokenizer built by `tokenizer_init`.
# `max_seq_length` is presumably the truncation limit in tokens - TODO confirm.
collator:
  _target_: data.general_collator.DPODataSFTCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 4096
# Dataloader
num_workers: 8
prefetch_factor: 2
# Pretrained model / checkpoint path
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v1.0
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: True
do_eval: True
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
#learning_rate: 5e-6
learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: False
save_steps: 400
save_best: False
eval_steps: 400
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "acc"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# Overrides layered on the `deepspeed@ds_cfg` preset chosen in `defaults`;
# values are interpolated from this file's top-level hyper-parameters.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level `adam_betas` "(0.9, 0.98)" -
      # confirm which value the trainer actually applies.
      betas: [0.9, 0.95]
      weight_decay: ${weight_decay}
  # Mixed precision: bf16 off, fp16 on (matches `torch_dtype.dtype: float16`).
  bf16:
    enabled: false
  fp16:
    enabled: true
    auto_cast: false
    # The key must be exactly `loss_scale` (no stray quote) to match the
    # DeepSpeed option; 0 selects dynamic loss scaling, which starts at
    # 2 ** initial_scale_power.
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: false
    min_loss_scale: 1
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/deprecated/gpt4o-distil-v1.1-v100.yaml
================================================
# Hydra composition: the shared `hydra` defaults plus a DeepSpeed preset
# mounted at `ds_cfg`. `_self_` comes last so keys in this file override them.
defaults:
  - hydra: default
  - deepspeed@ds_cfg: train_hybrid_engine_zero3
  # - deepspeed@ds_cfg: train_hybrid_engine_zero2
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

# Extra config search path so the `deepspeed` group resolves from `conf/`.
hydra:
  searchpath:
    - file://conf/
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
train_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.s42.json
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Causal LM to fine-tune, built by the `_target_` factory. `device_map` is
# itself produced by a helper factory.
model:
  _target_: models.llama.LlamaForCausalLM.from_pretrained
  gradient_checkpointing: true
  attn_implementation: eager
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map:
    _target_: models.utils.return_single_device_map
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# SFT dataset definition: a JSONL reader plus a template that composes the
# `prompt` and `chosen` strings from the chat pieces and each record's
# `completion` field.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.input_utils.jsonl_read_fn
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      suffix: '{completion}'
    # NOTE(review): the top-level `chat_suffix` is not appended to `chosen`
    # here - confirm the target is meant to end without it.
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{suffix}'
      # chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
      # reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
      # prompt: ${chat_prefix}${prompt}${chat_connect}
      # chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
  index_field: problem_id
  # Field renaming; presumably source-name -> output-name, verify against
  # MultiMappingDataset.
  kv_mapping:
    chosen: chosen
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Batch collator; reuses the tokenizer built by `tokenizer_init`.
# `max_seq_length` is presumably the truncation limit in tokens - TODO confirm.
collator:
  _target_: data.general_collator.DPODataSFTCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 4096
# Dataloader
num_workers: 8
prefetch_factor: 2
# Pretrained model / checkpoint path
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v1.1
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: True
do_eval: True
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
#learning_rate: 5e-6
learning_rate: 1e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: False
save_steps: 400
save_best: False
eval_steps: 400
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "acc"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# Overrides layered on the `deepspeed@ds_cfg` preset chosen in `defaults`;
# values are interpolated from this file's top-level hyper-parameters.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level `adam_betas` "(0.9, 0.98)" -
      # confirm which value the trainer actually applies.
      betas: [0.9, 0.95]
      weight_decay: ${weight_decay}
  # Mixed precision: bf16 off, fp16 on (matches `torch_dtype.dtype: float16`).
  bf16:
    enabled: false
  fp16:
    enabled: true
    auto_cast: false
    # The key must be exactly `loss_scale` (no stray quote) to match the
    # DeepSpeed option; 0 selects dynamic loss scaling, which starts at
    # 2 ** initial_scale_power.
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: false
    min_loss_scale: 1
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/deprecated/gpt4o-distil-v2.0-v100.yaml
================================================
# Hydra composition: the shared `hydra` defaults plus a DeepSpeed preset
# mounted at `ds_cfg`. `_self_` comes last so keys in this file override them.
defaults:
  - hydra: default
  - deepspeed@ds_cfg: train_hybrid_engine_zero3
  # - deepspeed@ds_cfg: train_hybrid_engine_zero2
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

# Extra config search path so the `deepspeed` group resolves from `conf/`.
hydra:
  searchpath:
    - file://conf/
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
#train_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.s42.json
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Causal LM to fine-tune, built by the `_target_` factory. `device_map` is
# itself produced by a helper factory.
model:
  _target_: models.llama.LlamaForCausalLM.from_pretrained
  gradient_checkpointing: true
  attn_implementation: eager
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  device_map:
    _target_: models.utils.return_single_device_map
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# SFT dataset definition: the APPs reader (train split), aligners that attach
# the "pos" field from `extra_file` and flatten it, and a template that
# composes the `prompt` and `chosen` strings.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsReader
    split: train
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields: ["pos"]
        extra_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.dpo.exec.dpo_v1.0.json
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field: pos
        mode: multi
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      suffix: '{pos}'
    # NOTE(review): the top-level `chat_suffix` is not appended to `chosen`
    # here - confirm the target is meant to end without it.
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{suffix}'
      # chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
      # reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
      # prompt: ${chat_prefix}${prompt}${chat_connect}
      # chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
  index_field: problem_id
  # Field renaming; presumably source-name -> output-name, verify against
  # MultiMappingDataset.
  kv_mapping:
    chosen: chosen
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Batch collator; reuses the tokenizer built by `tokenizer_init`.
# `max_seq_length` is presumably the truncation limit in tokens - TODO confirm.
collator:
  _target_: data.general_collator.DPODataSFTCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Pretrained model / checkpoint path
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v2.0
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
#learning_rate: 5e-6
learning_rate: 1e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 5
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "acc"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# Overrides layered on the `deepspeed@ds_cfg` preset chosen in `defaults`;
# values are interpolated from this file's top-level hyper-parameters.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level `adam_betas` "(0.9, 0.98)" -
      # confirm which value the trainer actually applies.
      betas: [0.9, 0.95]
      weight_decay: ${weight_decay}
  # Mixed precision: bf16 off, fp16 on (matches `torch_dtype.dtype: float16`).
  bf16:
    enabled: false
  fp16:
    enabled: true
    auto_cast: false
    # The key must be exactly `loss_scale` (no stray quote) to match the
    # DeepSpeed option; 0 selects dynamic loss scaling, which starts at
    # 2 ** initial_scale_power.
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: false
    min_loss_scale: 1
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/deprecated/gpt4o-distil-v2.1-v100-tp.yaml
================================================
# Hydra composition: the shared `hydra` defaults plus a DeepSpeed preset
# mounted at `ds_cfg`. `_self_` comes last so keys in this file override them.
defaults:
  - hydra: default
  # - deepspeed@ds_cfg: train_hybrid_engine_zero3
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

# Extra config search path so the `deepspeed` group resolves from `conf/`.
hydra:
  searchpath:
    - file://conf/
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models # ../pretrained-models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
#train_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.s42.json
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Causal LM to fine-tune, built by the `models.llama_tp` factory. No
# `device_map` is passed in this variant (left commented out below).
model:
  _target_: models.llama_tp.LlamaForCausalLM.from_pretrained
  gradient_checkpointing: true
  attn_implementation: eager
  torch_dtype: ${torch_dtype}
  pad_token_id: 100001
  # device_map:
  #   _target_: models.utils.return_single_device_map
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# SFT dataset definition: the APPs reader (train split), aligners that attach
# the "pos" field from `extra_file` and flatten it, and a template that
# composes the `prompt` and `chosen` strings (`chosen` ends with `chat_suffix`).
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.apps.APPsReader
    split: train
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: problem_id
        extract_index_field: problem_id
        extract_fields: ["pos"]
        extra_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.dpo.exec.dpo_v1.0.json
      - _target_: data.input_aligner.flat_aligner
        input_index_field: problem_id
        extract_field: pos
        mode: multi
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: ${chat_prefix}
      prompt: ${prompt}
      chat_connect: ${chat_connect}
      suffix: '{pos}'
      chat_suffix: ${chat_suffix}
    compositions:
      prompt: '{chat_prefix}{prompt}{chat_connect}'
      chosen: '{chat_prefix}{prompt}{chat_connect}{suffix}{chat_suffix}'
      # chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
      # reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
      # prompt: ${chat_prefix}${prompt}${chat_connect}
      # chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
  index_field: problem_id
  # Field renaming; presumably source-name -> output-name, verify against
  # MultiMappingDataset.
  kv_mapping:
    chosen: chosen
    problem_id: index
    prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Batch collator; reuses the tokenizer built by `tokenizer_init`.
# `max_seq_length` is presumably the truncation limit in tokens - TODO confirm.
collator:
  _target_: data.general_collator.DPODataSFTCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Pretrained model / checkpoint path
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
dp_size:
tp_size: 2
pp_size: 1
#exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v2.1
#exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v2.1.rerun # Add float32 computation for LM loss.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.tp2.dp4.v2.1.rerun # Add float32 computation for LM loss.
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
#learning_rate: 5e-6
learning_rate: 1e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 5
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# Overrides layered on the `deepspeed@ds_cfg` preset chosen in `defaults`;
# values are interpolated from this file's top-level hyper-parameters.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level `adam_betas` "(0.9, 0.98)" -
      # confirm which value the trainer actually applies.
      betas: [0.9, 0.95]
      weight_decay: ${weight_decay}
  # Mixed precision: bf16 off, fp16 on (matches `torch_dtype.dtype: float16`).
  bf16:
    enabled: false
  fp16:
    enabled: true
    auto_cast: false
    # The key must be exactly `loss_scale` (no stray quote) to match the
    # DeepSpeed option; 0 selects dynamic loss scaling, which starts at
    # 2 ** initial_scale_power.
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: false
    min_loss_scale: 1
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/deprecated/gpt4o-distil-v2.1-v100.yaml
================================================
# Hydra composition: the shared `hydra` defaults plus a DeepSpeed preset
# mounted at `ds_cfg`. `_self_` comes last so keys in this file override them.
defaults:
  - hydra: default
  - deepspeed@ds_cfg: train_hybrid_engine_zero3
  # - deepspeed@ds_cfg: train_hybrid_engine_zero2
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

# Extra config search path so the `deepspeed` group resolves from `conf/`.
hydra:
  searchpath:
    - file://conf/
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
#train_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.s42.json
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
model:
_target_: models.llama.LlamaForCausalLM.from_pretrained
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map:
_target_: models.utils.return_single_device_map
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsReader
split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos" ]
extra_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.dpo.exec.dpo_v1.0.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: pos
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
suffix: "{pos}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{suffix}{chat_suffix}"
# chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
# reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
# prompt: ${chat_prefix}${prompt}${chat_connect}
# chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
index_field: problem_id
kv_mapping:
chosen: chosen
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPODataSFTCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Pretrained model to load (resolved under model_path_prefix)
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
#exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v2.1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v2.1.rerun # Add float32 computation for LM loss.
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
#learning_rate: 5e-6
learning_rate: 1e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 5
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
# NOTE(review): metric is "acc" here, while post_process below is a loss-only
# post-processor by name — confirm an "acc" value is actually reported.
# do_eval and evaluate_during_training are both False in this file, so this
# section may be unused for this run.
prediction_cfg:
metric: "acc"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# DeepSpeed settings. `_self_` is listed last in `defaults`, so the values here
# are merged on top of the `deepspeed@ds_cfg` preset selected there.
ds_cfg:
  # Both values are interpolated from the top-level training hyper-parameters.
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level adam_betas "(0.9, 0.98)";
      # confirm which of the two the trainer actually applies.
      betas: [ 0.9, 0.95 ]
      weight_decay: ${weight_decay}
  bf16:
    enabled: False
  fp16:
    enabled: True
    auto_cast: False
    # 0 selects DeepSpeed's dynamic loss scaling. The key must be spelled
    # exactly `loss_scale` (no stray quote) to be recognized as that option.
    loss_scale: 0
    # Dynamic scaling starts at 2 ** initial_scale_power.
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: False
    min_loss_scale: 1
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/deprecated/gpt4o-distil-v2.2-v100.yaml
================================================
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero3
# - deepspeed@ds_cfg: train_hybrid_engine_zero2
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
#train_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.s42.json
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
model:
_target_: models.llama.LlamaForCausalLM.from_pretrained
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
device_map:
_target_: models.utils.return_single_device_map
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsReader
split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos" ]
extra_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.dpo.exec.dpo_v1.0.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: pos
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
suffix: "{pos}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{suffix}{chat_suffix}"
# chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
# reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
# prompt: ${chat_prefix}${prompt}${chat_connect}
# chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
index_field: problem_id
kv_mapping:
chosen: chosen
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPODataSFTCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Pretrained model to load (resolved under model_path_prefix)
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v2.2
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 1e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 5
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# DeepSpeed settings. `_self_` is listed last in `defaults`, so the values here
# are merged on top of the `deepspeed@ds_cfg` preset selected there.
ds_cfg:
  # Both values are interpolated from the top-level training hyper-parameters.
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level adam_betas "(0.9, 0.98)";
      # confirm which of the two the trainer actually applies.
      betas: [ 0.9, 0.95 ]
      weight_decay: ${weight_decay}
  bf16:
    enabled: False
  fp16:
    enabled: True
    auto_cast: False
    # 0 selects DeepSpeed's dynamic loss scaling. The key must be spelled
    # exactly `loss_scale` (no stray quote) to be recognized as that option.
    loss_scale: 0
    # Dynamic scaling starts at 2 ** initial_scale_power.
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: False
    min_loss_scale: 1
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/deprecated/gpt4o-distil-v2.3-v100.yaml
================================================
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
# - deepspeed@ds_cfg: train_hybrid_engine_zero2
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
#train_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.s42.json
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
model:
_target_: models.llama_tp.LlamaForCausalLM.from_pretrained
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map:
# _target_: models.utils.return_single_device_map
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsReader
split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos" ]
extra_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.n11.exec.dpo_v1.0.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: pos
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
suffix: "{pos}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{suffix}{chat_suffix}"
# chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
# reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
# prompt: ${chat_prefix}${prompt}${chat_connect}
# chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
index_field: problem_id
kv_mapping:
chosen: chosen
problem_id: index
prompt: prompt
dist_load_data_barrier: True
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPODataSFTCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Pretrained model to load (resolved under model_path_prefix)
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
dp_size:
tp_size: 4
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.tp4.dp2.v2.3.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 1e-5
gradient_accumulation_steps: 64
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# DeepSpeed settings. `_self_` is listed last in `defaults`, so the values here
# are merged on top of the `deepspeed@ds_cfg` preset selected there.
ds_cfg:
  # Both values are interpolated from the top-level training hyper-parameters.
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level adam_betas "(0.9, 0.98)";
      # confirm which of the two the trainer actually applies.
      betas: [ 0.9, 0.95 ]
      weight_decay: ${weight_decay}
  bf16:
    enabled: False
  fp16:
    enabled: True
    auto_cast: False
    # 0 selects DeepSpeed's dynamic loss scaling. The key must be spelled
    # exactly `loss_scale` (no stray quote) to be recognized as that option.
    loss_scale: 0
    # Dynamic scaling starts at 2 ** initial_scale_power.
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: False
    min_loss_scale: 1
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/deprecated/gpt4o-distil-v2.5-v100.yaml
================================================
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
# - deepspeed@ds_cfg: train_hybrid_engine_zero2
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
#train_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.s42.json
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
model:
_target_: models.llama_tp.LlamaForCausalLM.from_pretrained
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map:
# _target_: models.utils.return_single_device_map
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsReader
split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos" ]
extra_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.n11.exec.dpo_v1.0.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: pos
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
suffix: "{pos}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{suffix}{chat_suffix}"
# chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
# reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
# prompt: ${chat_prefix}${prompt}${chat_connect}
# chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
index_field: problem_id
kv_mapping:
chosen: chosen
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPODataSFTCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# Pretrained model to load (resolved under model_path_prefix)
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
# Parallelism layout; dp_size is left empty in this file.
dp_size:
tp_size: 4
pp_size: 1
# NOTE(review): the run name below encodes "tp2.dp4" while tp_size above is 4 —
# confirm which is intended. The name feeds output_dir via ${exp_name}.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.tp2.dp4.v2.5.s${seed} # Fix loss scale
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 1e-5
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# DeepSpeed settings. `_self_` is listed last in `defaults`, so the values here
# are merged on top of the `deepspeed@ds_cfg` preset selected there.
ds_cfg:
# Both values are interpolated from the top-level training hyper-parameters.
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
# NOTE(review): differs from the top-level adam_betas "(0.9, 0.98)" — confirm
# which of the two the trainer actually applies.
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
# Previous, mistyped key (stray trailing quote), kept for reference:
# loss_scale": 0
# 0 selects DeepSpeed's dynamic loss scaling.
loss_scale: 0
# Previous value, kept for reference:
# initial_scale_power: 16
# Dynamic scaling starts at 2 ** initial_scale_power.
initial_scale_power: 32
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Same value as the top-level max_grad_norm (1.0).
# NOTE(review): presumably a direct child of ds_cfg rather than of fp16 — confirm nesting.
gradient_clipping: 1.0
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/deprecated/gpt4o-distil-v2.6-v100.yaml
================================================
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
# - deepspeed@ds_cfg: train_hybrid_engine_zero2
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
#train_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.s42.json
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
model:
_target_: models.llama_tp.LlamaForCausalLM.from_pretrained
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map:
# _target_: models.utils.return_single_device_map
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsReader
split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos" ]
extra_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.n11.exec.dpo_v1.0.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: pos
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
suffix: "{pos}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{suffix}{chat_suffix}"
# chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
# reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
# prompt: ${chat_prefix}${prompt}${chat_connect}
# chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
index_field: problem_id
kv_mapping:
chosen: chosen
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPODataSFTCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Pretrained model to load (resolved under model_path_prefix)
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
# Parallelism layout; dp_size is left empty in this file.
dp_size:
tp_size: 4
pp_size: 1
# NOTE(review): the run name below encodes "tp2.dp4" while tp_size above is 4 —
# confirm which is intended. The name feeds output_dir via ${exp_name}.
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.tp2.dp4.v2.6.s${seed} # Fix loss scale
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 1e-5
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 0.5
num_train_epochs: 1
max_steps: 500
warmup_proportion: 0.2
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# DeepSpeed settings. `_self_` is listed last in `defaults`, so the values here
# are merged on top of the `deepspeed@ds_cfg` preset selected there.
ds_cfg:
# Both values are interpolated from the top-level training hyper-parameters.
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
# NOTE(review): differs from the top-level adam_betas "(0.9, 0.98)" — confirm
# which of the two the trainer actually applies.
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
# Previous, mistyped key (stray trailing quote), kept for reference:
# loss_scale": 0
# 0 selects DeepSpeed's dynamic loss scaling.
loss_scale: 0
# Previous value, kept for reference:
# initial_scale_power: 16
# Dynamic scaling starts at 2 ** initial_scale_power.
initial_scale_power: 36
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Same value as the top-level max_grad_norm (0.5).
# NOTE(review): presumably direct children of ds_cfg rather than of fp16 — confirm nesting.
gradient_clipping: 0.5
# Print DeepSpeed progress every step; the top-level logging_steps is also 1.
steps_per_print: 1
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/gpt4o-distil-v2.4-a100.yaml
================================================
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
# - deepspeed@ds_cfg: train_hybrid_engine_zero2
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
train_file: "hf:codeparrot/apps"
dev_file:
test_file:
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
model:
_target_: models.llama.LlamaForCausalLM.from_pretrained
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map:
# _target_: models.utils.return_single_device_map
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsReader
split: train
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos" ]
extra_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.n11.exec.dpo_v1.0.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: pos
mode: multi
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
suffix: "{pos}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{suffix}{chat_suffix}"
# chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
# reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
# prompt: ${chat_prefix}${prompt}${chat_connect}
# chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
index_field: problem_id
kv_mapping:
chosen: chosen
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPODataSFTCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Pretrained model to load (resolved under model_path_prefix)
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.A100.w4.v2.4.s${seed} # Fix loss scale
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 1e-5
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# DeepSpeed settings. `_self_` is listed last in `defaults`, so the values here
# are merged on top of the `deepspeed@ds_cfg` preset selected there.
ds_cfg:
# Both values are interpolated from the top-level training hyper-parameters.
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
# NOTE(review): differs from the top-level adam_betas "(0.9, 0.98)" — confirm
# which of the two the trainer actually applies.
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# The bf16/fp16 overrides below are all commented out, so this file sets no
# precision options under ds_cfg. This run uses dtype bfloat16 and
# fp16_bfloat16: True at the top level.
# NOTE(review): precision settings presumably come from the composed preset — confirm.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
## loss_scale": 0
# loss_scale: 0
## initial_scale_power: 16
# initial_scale_power: 32
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/gpt4o-distil-v2.4-v100-fix-2node-test.yaml
================================================
# Hydra experiment config: gpt4o-distil-v2.4-v100-fix-2node-test.
# Defaults list: composes `hydra: default` and loads the DeepSpeed config
# `train_hybrid_engine_zero1` into the `ds_cfg` package. `_self_` is listed
# last, so the values in this file override the composed defaults.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1
# - deepspeed@ds_cfg: train_hybrid_engine_zero2
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Prefixes interpolated into the data, model and output paths further down.
# The trailing comments record alternative (blob-storage) values.
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
#train_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.s42.json
# NOTE(review): the "hf:" prefix presumably makes the reader load
# codeparrot/apps from the Hugging Face hub — confirm in data.apps.
train_file: "hf:codeparrot/apps"
# No dev/test files are configured (both values are null).
dev_file:
test_file:
# dtype node, referenced below as ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is initialised from the same path as the model, padding on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Model constructor target and the keyword arguments passed to it.
# NOTE(review): pad_token_id is hard-coded to 100001 — confirm it matches the
# tokenizer found at model_name_or_path.
model:
_target_: models.llama_tp.LlamaForCausalLM.from_pretrained
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map:
# _target_: models.utils.return_single_device_map
# Chat-template fragments (system text + "### Instruction:" header, the
# "### Response:" connector, and the end-of-turn marker). They are referenced
# by name from the `template` section below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, produced by data.input_utils.read_text from the given file.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition; `_target_` names the dataset class.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# Reader for the APPS `train` split.
read_fn:
_target_: data.apps.APPsReader
split: train
# Aligners combined through concat_aligner, in list order:
#   1. field_extract_aligner, configured with the "pos" field of extra_file
#      and problem_id as the index on both sides;
#   2. flat_aligner over "pos" with mode "multi".
# NOTE(review): presumably (1) joins the GPT-4o outputs onto each problem and
# (2) emits one sample per "pos" entry — confirm in data.input_aligner.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos" ]
extra_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.n11.exec.dpo_v1.0.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: pos
mode: multi
# Template: `units` binds named fragments (the top-level chat/prompt values and
# the per-sample "{pos}" placeholder); `compositions` lays them out into the
# "prompt" and "chosen" strings.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
suffix: "{pos}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{suffix}{chat_suffix}"
# chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
# reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
# prompt: ${chat_prefix}${prompt}${chat_connect}
# chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
# Index field plus a field-name mapping.
# NOTE(review): mapping direction (source -> target) is not visible here — confirm.
index_field: problem_id
kv_mapping:
chosen: chosen
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator: reuses the tokenizer_init node through interpolation.
collator:
_target_: data.general_collator.DPODataSFTCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader worker settings.
num_workers: 8
prefetch_factor: 2
# Base model checkpoint directory, built from model_path_prefix.
# (The previous header here, "Wiki path pretrain v8.2", did not describe this key.)
# NOTE(review): model_path_prefix already ends with "/", so this resolves to
# "../pretrained-models//deepseek-coder-7b-instruct-v1.5" (doubled slash).
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
# Parallelism sizes; dp_size is left null.
# NOTE(review): exp_name below says "tp2" while tp_size is 4 — confirm which is
# intended. exp_name feeds output_dir, so renaming moves the checkpoints.
dp_size:
tp_size: 4
pp_size: 1
#exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.tp2.dp4.v2.4.s${seed} # Fix loss scale
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.tp2.dp8.v2.4.s${seed}.fix_all_padding.multi-node-testing # Fix all padding problem
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Run mode: training only; evaluation is switched off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
#learning_rate: 5e-6
learning_rate: 1e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas is [ 0.9, 0.95 ]; confirm which
# of the two beta settings the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: 500
# NOTE(review): with warmup_steps at 0, warmup is presumably derived from
# warmup_proportion — confirm in the trainer.
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer (all three keys are left null here)
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint / evaluation intervals, in steps.
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags: fp16 on, bfloat16 variant off.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config: tracks "loss"; best_checkpoint/best_result start as null.
# NOTE(review): measure -1 presumably means "lower is better" — confirm.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and the post-processor applied to its outputs.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# DeepSpeed settings layered on top of the `deepspeed@ds_cfg` defaults entry.
# Batch size, accumulation, learning rate and weight decay are interpolated
# from the top-level hyper-parameters so the two stay in sync.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# fp16 training with bf16 disabled. In DeepSpeed, loss_scale 0 selects dynamic
# loss scaling, starting from 2^initial_scale_power.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Metrics writer; both key selections are left null.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables (null placeholders; presumably filled in at runtime)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/gpt4o-distil-v2.4-v100-fix.yaml
================================================
# Hydra experiment config: gpt4o-distil-v2.4-v100-fix.
# Defaults list: composes `hydra: default` and loads the DeepSpeed config
# `train_hybrid_engine_zero1_optim_offload` into the `ds_cfg` package.
# `_self_` is listed last, so this file's values override the composed defaults.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
# - deepspeed@ds_cfg: train_hybrid_engine_zero2
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Prefixes interpolated into the data, model and output paths further down.
# The trailing comments record alternative (blob-storage) values.
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
#train_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.s42.json
# NOTE(review): the "hf:" prefix presumably makes the reader load
# codeparrot/apps from the Hugging Face hub — confirm in data.apps.
train_file: "hf:codeparrot/apps"
# No dev/test files are configured (both values are null).
dev_file:
test_file:
# dtype node, referenced below as ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is initialised from the same path as the model, padding on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Model constructor target and the keyword arguments passed to it.
# NOTE(review): pad_token_id is hard-coded to 100001 — confirm it matches the
# tokenizer found at model_name_or_path.
model:
_target_: models.llama_tp.LlamaForCausalLM.from_pretrained
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map:
# _target_: models.utils.return_single_device_map
# Chat-template fragments (system text + "### Instruction:" header, the
# "### Response:" connector, and the end-of-turn marker). They are referenced
# by name from the `template` section below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, produced by data.input_utils.read_text from the given file.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition; `_target_` names the dataset class.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# Reader for the APPS `train` split.
read_fn:
_target_: data.apps.APPsReader
split: train
# Aligners combined through concat_aligner, in list order:
#   1. field_extract_aligner, configured with the "pos" field of extra_file
#      and problem_id as the index on both sides;
#   2. flat_aligner over "pos" with mode "multi".
# NOTE(review): presumably (1) joins the GPT-4o outputs onto each problem and
# (2) emits one sample per "pos" entry — confirm in data.input_aligner.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos" ]
extra_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.n11.exec.dpo_v1.0.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: pos
mode: multi
# Template: `units` binds named fragments (the top-level chat/prompt values and
# the per-sample "{pos}" placeholder); `compositions` lays them out into the
# "prompt" and "chosen" strings.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
suffix: "{pos}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{suffix}{chat_suffix}"
# chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
# reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
# prompt: ${chat_prefix}${prompt}${chat_connect}
# chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
# Index field plus a field-name mapping.
# NOTE(review): mapping direction (source -> target) is not visible here — confirm.
index_field: problem_id
kv_mapping:
chosen: chosen
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator: reuses the tokenizer_init node through interpolation.
collator:
_target_: data.general_collator.DPODataSFTCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader worker settings.
num_workers: 8
prefetch_factor: 2
# Base model checkpoint directory, built from model_path_prefix.
# (The previous header here, "Wiki path pretrain v8.2", did not describe this key.)
# NOTE(review): model_path_prefix already ends with "/", so this resolves to
# "../pretrained-models//deepseek-coder-7b-instruct-v1.5" (doubled slash).
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
# Parallelism sizes; dp_size is left null.
# NOTE(review): exp_name below says "tp2" while tp_size is 4 — confirm which is
# intended. exp_name feeds output_dir, so renaming moves the checkpoints.
dp_size:
tp_size: 4
pp_size: 1
#exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.tp2.dp4.v2.4.s${seed} # Fix loss scale
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.tp2.dp4.v2.4.s${seed}.fix_all_padding # Fix all padding problem
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Run mode: training only; evaluation is switched off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
#learning_rate: 5e-6
learning_rate: 1e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas is [ 0.9, 0.95 ]; confirm which
# of the two beta settings the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: 500
# NOTE(review): with warmup_steps at 0, warmup is presumably derived from
# warmup_proportion — confirm in the trainer.
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer (all three keys are left null here)
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint / evaluation intervals, in steps.
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags: fp16 on, bfloat16 variant off.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config: tracks "loss"; best_checkpoint/best_result start as null.
# NOTE(review): measure -1 presumably means "lower is better" — confirm.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and the post-processor applied to its outputs.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# DeepSpeed settings layered on top of the `deepspeed@ds_cfg` defaults entry.
# Batch size, accumulation, learning rate and weight decay are interpolated
# from the top-level hyper-parameters so the two stay in sync.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# fp16 training with bf16 disabled. In DeepSpeed, loss_scale 0 selects dynamic
# loss scaling, starting from 2^initial_scale_power.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Metrics writer; both key selections are left null.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables (null placeholders; presumably filled in at runtime)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/gpt4o-distil-v2.4-v100.yaml
================================================
# Hydra experiment config: gpt4o-distil-v2.4-v100.
# Defaults list: composes `hydra: default` and loads the DeepSpeed config
# `train_hybrid_engine_zero1_optim_offload` into the `ds_cfg` package.
# `_self_` is listed last, so this file's values override the composed defaults.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
# - deepspeed@ds_cfg: train_hybrid_engine_zero2
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Prefixes interpolated into the data, model and output paths further down.
# The trailing comments record alternative (blob-storage) values.
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
#train_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.s42.json
# NOTE(review): the "hf:" prefix presumably makes the reader load
# codeparrot/apps from the Hugging Face hub — confirm in data.apps.
train_file: "hf:codeparrot/apps"
# No dev/test files are configured (both values are null).
dev_file:
test_file:
# dtype node, referenced below as ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is initialised from the same path as the model, padding on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Model constructor target and the keyword arguments passed to it.
# NOTE(review): pad_token_id is hard-coded to 100001 — confirm it matches the
# tokenizer found at model_name_or_path.
model:
_target_: models.llama_tp.LlamaForCausalLM.from_pretrained
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map:
# _target_: models.utils.return_single_device_map
# Chat-template fragments (system text + "### Instruction:" header, the
# "### Response:" connector, and the end-of-turn marker). They are referenced
# by name from the `template` section below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, produced by data.input_utils.read_text from the given file.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition; `_target_` names the dataset class.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# Reader for the APPS `train` split.
read_fn:
_target_: data.apps.APPsReader
split: train
# Aligners combined through concat_aligner, in list order:
#   1. field_extract_aligner, configured with the "pos" field of extra_file
#      and problem_id as the index on both sides;
#   2. flat_aligner over "pos" with mode "multi".
# NOTE(review): presumably (1) joins the GPT-4o outputs onto each problem and
# (2) emits one sample per "pos" entry — confirm in data.input_aligner.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos" ]
extra_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.n11.exec.dpo_v1.0.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: pos
mode: multi
# Template: `units` binds named fragments (the top-level chat/prompt values and
# the per-sample "{pos}" placeholder); `compositions` lays them out into the
# "prompt" and "chosen" strings.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
suffix: "{pos}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{suffix}{chat_suffix}"
# chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
# reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
# prompt: ${chat_prefix}${prompt}${chat_connect}
# chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
# Index field plus a field-name mapping.
# NOTE(review): mapping direction (source -> target) is not visible here — confirm.
index_field: problem_id
kv_mapping:
chosen: chosen
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator: reuses the tokenizer_init node through interpolation.
collator:
_target_: data.general_collator.DPODataSFTCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader worker settings.
num_workers: 8
prefetch_factor: 2
# Base model checkpoint directory, built from model_path_prefix.
# (The previous header here, "Wiki path pretrain v8.2", did not describe this key.)
# NOTE(review): model_path_prefix already ends with "/", so this resolves to
# "../pretrained-models//deepseek-coder-7b-instruct-v1.5" (doubled slash).
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
# Parallelism sizes; dp_size is left null.
# NOTE(review): exp_name below says "tp2" while tp_size is 4 — confirm which is
# intended. exp_name feeds output_dir, so renaming moves the checkpoints.
dp_size:
tp_size: 4
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.tp2.dp4.v2.4.s${seed} # Fix loss scale
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Run mode: training only; evaluation is switched off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 5e-6
#learning_rate: 1e-5
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas is [ 0.9, 0.95 ]; confirm which
# of the two beta settings the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: 500
# NOTE(review): with warmup_steps at 0, warmup is presumably derived from
# warmup_proportion — confirm in the trainer.
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer (all three keys are left null here)
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint / evaluation intervals, in steps.
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags: fp16 on, bfloat16 variant off.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config: tracks "loss"; best_checkpoint/best_result start as null.
# NOTE(review): measure -1 presumably means "lower is better" — confirm.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and the post-processor applied to its outputs.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# DeepSpeed settings layered on top of the `deepspeed@ds_cfg` defaults entry.
# Batch size, accumulation, learning rate and weight decay are interpolated
# from the top-level hyper-parameters so the two stay in sync.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# fp16 training with bf16 disabled. In DeepSpeed, loss_scale 0 selects dynamic
# loss scaling, starting from 2^initial_scale_power.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
# loss_scale: 0
loss_scale: 0
# NOTE(review): an initial scale of 2^32 is far above the fp16 range, so the
# first steps will likely overflow and be skipped until the scale backs off.
# The other configs in this directory use 16 — confirm 32 is intended.
# initial_scale_power: 16
initial_scale_power: 32
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Metrics writer; both key selections are left null.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables (null placeholders; presumably filled in at runtime)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/gpt4o-distil-v3.0-a100.yaml
================================================
# Hydra experiment config: gpt4o-distil-v3.0-a100.
# Defaults list: composes `hydra: default` and loads the DeepSpeed config
# `train_hybrid_engine_zero1` into the `ds_cfg` package. `_self_` is listed
# last, so the values in this file override the composed defaults.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1
# - deepspeed@ds_cfg: train_hybrid_engine_zero2
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Prefixes interpolated into the data, model and output paths further down.
# The trailing comments record alternative (blob-storage) values.
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
#train_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.s42.json
# NOTE(review): the "hf:" prefix presumably makes the reader load
# codeparrot/apps from the Hugging Face hub — confirm in data.apps.
train_file: "hf:codeparrot/apps"
# No dev/test files are configured (both values are null).
dev_file:
test_file:
# dtype node, referenced below as ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is initialised from the same path as the model, padding on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Model constructor target and the keyword arguments passed to it. This config
# uses models.llama (not models.llama_tp) together with flash_attention_2.
# NOTE(review): pad_token_id is hard-coded to 100001 — confirm it matches the
# tokenizer found at model_name_or_path.
model:
_target_: models.llama.LlamaForCausalLM.from_pretrained
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map:
# _target_: models.utils.return_single_device_map
# Chat-template fragments (system text + "### Instruction:" header, the
# "### Response:" connector, and the end-of-turn marker). They are referenced
# by name from the `template` section below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, produced by data.input_utils.read_text from the given file.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition; `_target_` names the dataset class.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# Reader for the APPS `train` split, additionally given train_sub_split: train.
# NOTE(review): train_sub_split presumably selects a sub-partition of the
# training split — confirm in data.apps.APPsReader.
read_fn:
_target_: data.apps.APPsReader
split: train
train_sub_split: train
# Aligners combined through concat_aligner, in list order:
#   1. field_extract_aligner, configured with the "pos" field of extra_file
#      and problem_id as the index on both sides;
#   2. flat_aligner over "pos" with mode "multi".
# NOTE(review): presumably (1) joins the GPT-4o outputs onto each problem and
# (2) emits one sample per "pos" entry — confirm in data.input_aligner.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos" ]
extra_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.n11.exec.dpo_v1.0.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: pos
mode: multi
# Template: `units` binds named fragments (the top-level chat/prompt values and
# the per-sample "{pos}" placeholder); `compositions` lays them out into the
# "prompt" and "chosen" strings.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
suffix: "{pos}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{suffix}{chat_suffix}"
# chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
# reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
# prompt: ${chat_prefix}${prompt}${chat_connect}
# chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
# Index field plus a field-name mapping.
# NOTE(review): mapping direction (source -> target) is not visible here — confirm.
index_field: problem_id
kv_mapping:
chosen: chosen
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator: reuses the tokenizer_init node through interpolation.
collator:
_target_: data.general_collator.DPODataSFTCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader worker settings.
num_workers: 8
prefetch_factor: 2
# Base model checkpoint directory, built from model_path_prefix.
# (The previous header here, "Wiki path pretrain v8.2", did not describe this key.)
# NOTE(review): model_path_prefix already ends with "/", so this resolves to
# "../pretrained-models//deepseek-coder-7b-instruct-v1.5" (doubled slash).
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
# Parallelism sizes: no tensor or pipeline parallelism; dp_size is left null.
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.A100.w8.v3.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Run mode: training only; evaluation is switched off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
#learning_rate: 5e-6
learning_rate: 1e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas is [ 0.9, 0.95 ]; confirm which
# of the two beta settings the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: 500
# NOTE(review): with warmup_steps at 0, warmup is presumably derived from
# warmup_proportion — confirm in the trainer.
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer (all three keys are left null here)
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint / evaluation intervals, in steps.
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags: fp16 on, bfloat16 variant off.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config: tracks "loss"; best_checkpoint/best_result start as null.
# NOTE(review): measure -1 presumably means "lower is better" — confirm.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and the post-processor applied to its outputs.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# DeepSpeed settings layered on top of the `deepspeed@ds_cfg` defaults entry.
# Batch size, accumulation, learning rate and weight decay are interpolated
# from the top-level hyper-parameters so the two stay in sync.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# fp16 training with bf16 disabled. In DeepSpeed, loss_scale 0 selects dynamic
# loss scaling, starting from 2^initial_scale_power.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Metrics writer; both key selections are left null.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables (null placeholders; presumably filled in at runtime)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/gpt4o-distil-v3.0-v100.yaml
================================================
# Hydra experiment config: gpt4o-distil-v3.0-v100.
# Defaults list: composes `hydra: default` and loads the DeepSpeed config
# `train_hybrid_engine_zero1` into the `ds_cfg` package. `_self_` is listed
# last, so the values in this file override the composed defaults.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1
# - deepspeed@ds_cfg: train_hybrid_engine_zero2
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Prefixes interpolated into the data, model and output paths further down.
# The trailing comments record alternative (blob-storage) values.
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
#train_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.s42.json
# NOTE(review): the "hf:" prefix presumably makes the reader load
# codeparrot/apps from the Hugging Face hub — confirm in data.apps.
train_file: "hf:codeparrot/apps"
# No dev/test files are configured (both values are null).
dev_file:
test_file:
# dtype node, referenced below as ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is initialised from the same path as the model, padding on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Model constructor target and the keyword arguments passed to it.
# NOTE(review): pad_token_id is hard-coded to 100001 — confirm it matches the
# tokenizer found at model_name_or_path.
model:
_target_: models.llama_tp.LlamaForCausalLM.from_pretrained
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map:
# _target_: models.utils.return_single_device_map
# Chat-template fragments (system text + "### Instruction:" header, the
# "### Response:" connector, and the end-of-turn marker). They are referenced
# by name from the `template` section below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, produced by data.input_utils.read_text from the given file.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition; `_target_` names the dataset class.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# Reader for the APPS `train` split, additionally given train_sub_split: train.
# NOTE(review): train_sub_split presumably selects a sub-partition of the
# training split — confirm in data.apps.APPsReader.
read_fn:
_target_: data.apps.APPsReader
split: train
train_sub_split: train
# Aligners combined through concat_aligner, in list order:
#   1. field_extract_aligner, configured with the "pos" field of extra_file
#      and problem_id as the index on both sides;
#   2. flat_aligner over "pos" with mode "multi".
# NOTE(review): presumably (1) joins the GPT-4o outputs onto each problem and
# (2) emits one sample per "pos" entry — confirm in data.input_aligner.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos" ]
extra_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.n11.exec.dpo_v1.0.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: pos
mode: multi
# Template: `units` binds named fragments (the top-level chat/prompt values and
# the per-sample "{pos}" placeholder); `compositions` lays them out into the
# "prompt" and "chosen" strings.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
suffix: "{pos}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{suffix}{chat_suffix}"
# chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
# reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
# prompt: ${chat_prefix}${prompt}${chat_connect}
# chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
# Index field plus a field-name mapping.
# NOTE(review): mapping direction (source -> target) is not visible here — confirm.
index_field: problem_id
kv_mapping:
chosen: chosen
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator: reuses the tokenizer_init node through interpolation.
collator:
_target_: data.general_collator.DPODataSFTCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader worker settings.
num_workers: 8
prefetch_factor: 2
# Base model checkpoint directory, built from model_path_prefix.
# (The previous header here, "Wiki path pretrain v8.2", did not describe this key.)
# NOTE(review): model_path_prefix already ends with "/", so this resolves to
# "../pretrained-models//deepseek-coder-7b-instruct-v1.5" (doubled slash).
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
# Parallelism sizes; dp_size is left null. tp_size matches the "tp4" tag in exp_name.
dp_size:
tp_size: 4
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.tp4.dp4.v3.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Run mode: training only; evaluation is switched off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
#learning_rate: 5e-6
learning_rate: 1e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas is [ 0.9, 0.95 ]; confirm which
# of the two beta settings the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: 500
# NOTE(review): with warmup_steps at 0, warmup is presumably derived from
# warmup_proportion — confirm in the trainer.
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer (all three keys are left null here)
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint / evaluation intervals, in steps.
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags: fp16 on, bfloat16 variant off.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config: tracks "loss"; best_checkpoint/best_result start as null.
# NOTE(review): measure -1 presumably means "lower is better" — confirm.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and the post-processor applied to its outputs.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# DeepSpeed settings layered on top of the `deepspeed@ds_cfg` defaults entry.
# Batch size, accumulation, learning rate and weight decay are interpolated
# from the top-level hyper-parameters so the two stay in sync.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# fp16 training with bf16 disabled. In DeepSpeed, loss_scale 0 selects dynamic
# loss scaling, starting from 2^initial_scale_power.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Metrics writer; both key selections are left null.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables (null placeholders; presumably filled in at runtime)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/gpt4o-distil-v3.1-v100-test.yaml
================================================
# Hydra experiment config: gpt4o-distil-v3.1-v100-test.
# Defaults list: composes `hydra: default` and loads the DeepSpeed config
# `train_hybrid_engine_zero1_optim_offload` into the `ds_cfg` package.
# `_self_` is listed last, so this file's values override the composed defaults.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload
# - deepspeed@ds_cfg: train_hybrid_engine_zero2
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Adds the local conf/ directory to Hydra's config search path.
hydra:
searchpath:
- file://conf/
# Prefixes interpolated into the data, model and output paths further down.
# The trailing comments record alternative (blob-storage) values.
data_path_prefix: ""
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
#train_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.s42.json
# NOTE(review): the "hf:" prefix presumably makes the reader load
# codeparrot/apps from the Hugging Face hub — confirm in data.apps.
train_file: "hf:codeparrot/apps"
# No dev/test files are configured (both values are null).
dev_file:
test_file:
# dtype node, referenced below as ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is initialised from the same path as the model, padding on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Model constructor target and the keyword arguments passed to it.
# Gradient checkpointing is switched off in this test config.
# NOTE(review): pad_token_id is hard-coded to 100001 — confirm it matches the
# tokenizer found at model_name_or_path.
model:
_target_: models.llama_tp.LlamaForCausalLM.from_pretrained
gradient_checkpointing: False
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: 100001
# device_map:
# _target_: models.utils.return_single_device_map
# Chat-template fragments (system text + "### Instruction:" header, the
# "### Response:" connector, and the end-of-turn marker). They are referenced
# by name from the `template` section below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# Task prompt text, produced by data.input_utils.read_text from the given file.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition; `_target_` names the dataset class.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# Reader for the APPS `train` split. This config uses APPsWithFunctionName with
# train_sub_split: train and use_starter_code enabled.
# NOTE(review): the exact effect of train_sub_split / use_starter_code is not
# visible here — confirm in data.apps.APPsWithFunctionName.
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
# Aligners combined through concat_aligner, in list order:
#   1. field_extract_aligner, configured with the "pos" field of extra_file
#      and problem_id as the index on both sides;
#   2. flat_aligner over "pos" with mode "multi".
# NOTE(review): presumably (1) joins the GPT-4o outputs onto each problem and
# (2) emits one sample per "pos" entry — confirm in data.input_aligner.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos" ]
extra_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.n11.exec.dpo_v1.0.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: pos
mode: multi
# Template: `units` binds named fragments (the top-level chat/prompt values and
# the per-sample "{pos}" placeholder); `compositions` lays them out into the
# "prompt" and "chosen" strings.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
suffix: "{pos}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{suffix}{chat_suffix}"
# chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
# reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
# prompt: ${chat_prefix}${prompt}${chat_connect}
# chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
# Index field plus a field-name mapping.
# NOTE(review): mapping direction (source -> target) is not visible here — confirm.
index_field: problem_id
kv_mapping:
chosen: chosen
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator: reuses the tokenizer_init node through interpolation.
collator:
_target_: data.general_collator.DPODataSFTCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader worker settings.
num_workers: 8
prefetch_factor: 2
# Base model checkpoint directory, built from model_path_prefix.
# (The previous header here, "Wiki path pretrain v8.2", did not describe this key.)
# NOTE(review): model_path_prefix already ends with "/", so this resolves to
# "../pretrained-models//deepseek-coder-7b-instruct-v1.5" (doubled slash).
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
# Parallelism sizes; dp_size is left null. tp_size matches the "tp4" tag in exp_name.
dp_size:
tp_size: 4
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.test.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Run mode: training only; evaluation is switched off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
#learning_rate: 5e-6
learning_rate: 1e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas is [ 0.9, 0.95 ]; confirm which
# of the two beta settings the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: 500
# NOTE(review): with warmup_steps at 0, warmup is presumably derived from
# warmup_proportion — confirm in the trainer.
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer (all three keys are left null here)
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint / evaluation intervals, in steps.
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed-precision flags: fp16 on, bfloat16 variant off.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config: tracks "loss"; best_checkpoint/best_result start as null.
# NOTE(review): measure -1 presumably means "lower is better" — confirm.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and the post-processor applied to its outputs.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# DeepSpeed settings layered on top of the `deepspeed@ds_cfg` defaults entry.
# Batch size, accumulation, learning rate and weight decay are interpolated
# from the top-level hyper-parameters so the two stay in sync.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# fp16 training with bf16 disabled. In DeepSpeed, loss_scale 0 selects dynamic
# loss scaling, starting from 2^initial_scale_power.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Metrics writer; both key selections are left null.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables (null placeholders; presumably filled in at runtime)
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/r2c_generation/deepseek_coder/sft/gpt4o-distil-v3.1-v100.yaml
================================================
# Hydra defaults list. `deepspeed@ds_cfg` places the chosen DeepSpeed preset
# under the `ds_cfg` key of this config.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1
# - deepspeed@ds_cfg: train_hybrid_engine_zero2
# `_self_` comes last, so values written in this file are composed after the entries above.
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Extra location Hydra searches for config groups.
hydra:
searchpath:
- file://conf/
# Path prefixes spliced into other values through ${...} interpolation.
# `output_path_prefix` is concatenated without a separator (see `output_dir`),
# so a non-empty value needs its own trailing "/".
data_path_prefix: ""
# NOTE(review): `model_name_or_path` already inserts "/" after this prefix, so the
# trailing slash here yields "//" in the resolved path.
model_path_prefix: ../pretrained-models/ # /mnt/fangkai_blob/share/models
output_path_prefix: "" # /mnt/fangkai_blob/reward_modeling/
#train_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.s42.json
# NOTE(review): the "hf:" prefix presumably selects a Hugging Face Hub dataset — confirm against the reader.
train_file: "hf:codeparrot/apps"
# No dev/test files are configured (both null).
dev_file:
test_file:
# dtype object produced by `return_torch_dtype`; reused below as ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# The tokenizer is loaded from the same location as the model and pads on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Model loader from `models.llama_tp`, with gradient checkpointing on and eager attention.
model:
_target_: models.llama_tp.LlamaForCausalLM.from_pretrained
gradient_checkpointing: True
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
# NOTE(review): hard-coded pad id — confirm it matches the tokenizer loaded from ${model_name_or_path}.
pad_token_id: 100001
# device_map:
# _target_: models.utils.return_single_device_map
# Prompt pieces; they are referenced from the template `units` below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# The prompt text is read from a file via `read_text`.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/r2c_prompt_0shot_v1.0.txt
# Training dataset definition.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsWithFunctionName
split: train
train_sub_split: train
use_starter_code: True
# Two aligners wrapped by `concat_aligner`: `field_extract_aligner` pulls the `pos`
# field from `extra_file`, matching on `problem_id`; `flat_aligner` then works on `pos`.
# NOTE(review): presumably applied in the listed order — confirm in `concat_aligner`.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: problem_id
extract_index_field: problem_id
extract_fields: [ "pos" ]
extra_file: ${data_path_prefix}outputs/apps/apps.train.r2c.vanilla.gpt-4o.tem1.0.n11.exec.dpo_v1.0.json
- _target_: data.input_aligner.flat_aligner
input_index_field: problem_id
extract_field: pos
mode: multi
# `units` are named fragments; `compositions` assemble them into the final
# `prompt` and `chosen` strings.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
suffix: "{pos}"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{suffix}{chat_suffix}"
# chosen: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{pos}<|im_end|>"
# reject: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{neg}<|im_end|>"
# prompt: ${chat_prefix}${prompt}${chat_connect}
# chosen: ${chat_prefix}${prompt}${chat_connect}{completion}
index_field: problem_id
# NOTE(review): key renaming applied to each example; confirm the direction (source -> target) in the dataset class.
kv_mapping:
chosen: chosen
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# Receives the tokenizer defined under `tokenizer_init`.
collator:
_target_: data.general_collator.DPODataSFTCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Base checkpoint to fine-tune; note the explicit "/" after ${model_path_prefix}.
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
# NOTE(review): presumably data-/tensor-/pipeline-parallel sizes; dp_size is left null here.
dp_size:
tp_size: 4
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.r2c.gpt4o.distil.V100.w8.v3.1.dp4.tp4.s${seed}
exp_notes:
# ${output_path_prefix} is concatenated without a separator.
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Exponent-form floats carry an explicit decimal point ("1.0e-5", not "1e-5") so that
# YAML 1.1 loaders such as PyYAML also resolve them as floats instead of strings.
# learning_rate: 1.0e-4
# learning_rate: 5.0e-6
learning_rate: 1.0e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1.0e-6
# Kept as a quoted string, not a YAML list.
adam_betas: "(0.9, 0.98)"
# adam_betas: "(0.9, 0.999)"
# max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
# NOTE(review): both an epoch count and a step cap are set; confirm which one ends training.
num_train_epochs: 1
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Half-precision flags; `fp16_bfloat16: False` goes with the float16 `torch_dtype` of this file.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
# The tracked metric is "loss"; `best_checkpoint` and `best_result` start out empty (null).
# NOTE(review): `measure: -1` presumably means lower is better — confirm against the evaluator.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and post-processor, each built from its `_target_` path.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# DeepSpeed settings written in this file; the preset chosen in `defaults` is also
# mounted at `ds_cfg`. Batch size, accumulation steps, lr and weight decay are
# interpolated from the top-level keys so each value is defined only once.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
# NOTE(review): differs from the top-level `adam_betas` string "(0.9, 0.98)" — confirm which one takes effect.
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# bf16 is switched off and fp16 mixed precision is switched on.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
# A loss_scale of 0 selects DeepSpeed's dynamic loss scaling; the keys below tune it
# (the initial scale is 2^initial_scale_power).
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Metric writer; both key selections are left empty (null) in this file.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# All left empty here. NOTE(review): presumably populated by the training script at runtime — confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/apps/test_input_gen/deepseek_coder/sft/v1.0-a100.yaml
================================================
# Hydra defaults list. `deepspeed@ds_cfg` places the chosen DeepSpeed preset
# under the `ds_cfg` key of this config.
defaults:
- hydra: default
# - deepspeed@ds_cfg: train_hybrid_engine_zero3
- deepspeed@ds_cfg: train_hybrid_engine_zero1
# `_self_` comes last, so values written in this file are composed after the entries above.
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Extra location Hydra searches for config groups.
hydra:
searchpath:
- file://conf/
# Path prefixes spliced into other values through ${...} interpolation.
# `model_path_prefix` is followed by an explicit "/" in `model_name_or_path`, while
# `output_path_prefix` is concatenated directly in `output_dir` and so keeps its trailing "/".
data_path_prefix: ""
model_path_prefix: /mnt/fangkai_blob/share/models
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# NOTE(review): the "hf:" prefix presumably selects a Hugging Face Hub dataset — confirm against the reader.
train_file: "hf:codeparrot/apps"
# No dev/test files are configured (both null).
dev_file:
test_file:
# dtype object produced by `return_torch_dtype`; reused below as ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# The tokenizer is loaded from the same location as the model and pads on the left.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
# Model loader from `models.llama`, with gradient checkpointing on and FlashAttention-2.
model:
_target_: models.llama.LlamaForCausalLM.from_pretrained
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
torch_dtype: ${torch_dtype}
# NOTE(review): hard-coded pad id — confirm it matches the tokenizer loaded from ${model_name_or_path}.
pad_token_id: 100001
# device_map:
# _target_: models.utils.return_single_device_map
# Prompt pieces; they are referenced from the template `units` below.
chat_prefix: "<|begin▁of▁sentence|>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\n"
chat_connect: "\n### Response:\n"
chat_suffix: "\n<|EOT|>"
# The prompt text is read from a file via `read_text`.
prompt:
_target_: data.input_utils.read_text
file_path: prompts/apps/test_input_gen_0shot_v1.0.txt
# Training dataset definition.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
read_fn:
_target_: data.apps.APPsFlatTestCasesReader
split: train
train_sub_split: train
aligner:
_target_: data.input_aligner.flat_aligner
input_index_field: problem_id
# NOTE(review): given as a one-element list here — confirm `flat_aligner` accepts a list as well as a plain string.
extract_field: [ "test_inputs" ]
mode: "multi"
# `units` are named fragments; `compositions` assemble them into the final
# `prompt` and `chosen` strings. The completion wraps `{test_inputs}` in newlines.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: ${chat_prefix}
prompt: ${prompt}
chat_connect: ${chat_connect}
completion: "\n{test_inputs}\n"
chat_suffix: ${chat_suffix}
compositions:
prompt: "{chat_prefix}{prompt}{chat_connect}"
chosen: "{chat_prefix}{prompt}{chat_connect}{completion}{chat_suffix}"
index_field: problem_id
# NOTE(review): key renaming applied to each example; confirm the direction (source -> target) in the dataset class.
kv_mapping:
chosen: chosen
problem_id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# Receives the tokenizer defined under `tokenizer_init`.
collator:
_target_: data.general_collator.DPODataSFTCollator
tokenizer: ${tokenizer_init}
max_seq_length: 3072
# Dataloader
num_workers: 8
prefetch_factor: 2
# Base checkpoint to fine-tune; ${model_path_prefix} is joined with an explicit "/".
model_name_or_path: ${model_path_prefix}/deepseek-coder-7b-instruct-v1.5
pretrain:
resume:
# NOTE(review): presumably data-/tensor-/pipeline-parallel sizes; dp_size is left null here.
dp_size:
tp_size: 1
pp_size: 1
exp_name: deepseek-coder-v1.5-ins.7b.apps.test_inputs_gen.a100.w8.v1.0.s${seed}
exp_notes:
# ${output_path_prefix} is concatenated without a separator, so it must end with "/".
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Exponent-form floats carry an explicit decimal point ("1.0e-5", not "1e-5") so that
# YAML 1.1 loaders such as PyYAML also resolve them as floats instead of strings.
# learning_rate: 1.0e-4
# learning_rate: 5.0e-6
learning_rate: 1.0e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1.0e-6
# Kept as a quoted string, not a YAML list.
adam_betas: "(0.9, 0.98)"
# adam_betas: "(0.9, 0.999)"
# max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
# NOTE(review): both an epoch count and a step cap are set; confirm which one ends training.
num_train_epochs: 1
max_steps: 300
warmup_proportion: 0.1
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 5
save_ds_state: False
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Half-precision flags; `fp16_bfloat16: True` goes with the bfloat16 `torch_dtype` of this file.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
# The tracked metric is "loss"; `best_checkpoint` and `best_result` start out empty (null).
# NOTE(review): `measure: -1` presumably means lower is better — confirm against the evaluator.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and post-processor, each built from its `_target_` path.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.SFTLossOnlyPostProcessor
# DeepSpeed settings written in this file; the preset chosen in `defaults` is also
# mounted at `ds_cfg`. Batch size, accumulation steps, lr and weight decay are
# interpolated from the top-level keys so each value is defined only once.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
# NOTE(review): differs from the top-level `adam_betas` string "(0.9, 0.98)" — confirm which one takes effect.
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
# The precision and optimizer-offload overrides below are commented out, so this
# file does not set them itself.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# zero_optimization:
# offload_optimizer:
# device: cpu
# pin_memory: True
# Metric writer; both key selections are left empty (null) in this file.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
# Temporary variables
# All left empty here. NOTE(review): presumably populated by the training script at runtime — confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/iter1/llama3.1-dpo-4o-iter0-v1.0-H100.yaml
================================================
# Hydra defaults list. `deepspeed@ds_cfg` places the chosen DeepSpeed preset
# under the `ds_cfg` key of this config.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
# `_self_` comes last, so values written in this file are composed after the entries above.
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Extra location Hydra searches for config groups.
hydra:
searchpath:
- file://conf/
# Path prefixes. `data_path_prefix` and `model_path_prefix` are not referenced by any
# interpolation in this file; `output_path_prefix` feeds `sft_model_dir` and `output_dir`.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Checkpoint directory that supplies both the initial weights (`model_name_or_path`)
# and the training pairs (`train_file`).
# NOTE(review): the prefix ends with "/" and the value ends with "/", so the resolved
# paths here and in `train_file` contain "//".
sft_model_dir: ${output_path_prefix}/experiments/llama3.1.8b.mathscale4o.process-dpo.iter0.A100.dp8.v2.2.s42/checkpoint-1200/
train_file: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.prefer_pair.json
# No dev/test files are configured (both null).
dev_file:
test_file:
# EOS token text and id are maintained by hand and must refer to the same token;
# both are reused for padding further down.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# dtype object produced by `return_torch_dtype`; reused below as ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# The tokenizer is loaded from the model location, pads on the left and uses the EOS token as pad token.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
# Device map built by `return_single_device_map`; reused below as ${device_map}.
device_map:
_target_: models.utils.return_single_device_map
# DPO model loader: the constructor also receives a `ref_model` config (see below).
# `beta`, `sft_loss` and `sft_loss_weight` are passed through to it.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
device_map: ${device_map}
# Second model instance, loaded from the same ${model_name_or_path} as the trained model.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
device_map: ${device_map}
pad_token_id: ${eos_token_id}
# Training dataset definition for preference pairs.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# A single aligner wrapped by `concat_aligner`; it works on the `pos` and `neg` fields.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos
neg_field: neg
# `units` are named fragments; `compositions` assemble them into `prompt`,
# `chosen` and `reject`. Both responses are terminated with the EOS token.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}{chat_suffix}"
reject: "{chat_prefix}{neg}{chat_suffix}"
# Text substituted for the `{instruction}` placeholder used in `chat_prefix`.
instruction: "\\boxed{}"
index_field: id
# NOTE(review): key renaming applied to each example; confirm the direction (source -> target) in the dataset class.
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# Receives the tokenizer defined under `tokenizer_init`.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# Initial weights come from the checkpoint directory named by `sft_model_dir`.
model_name_or_path: ${sft_model_dir}
pretrain:
# NOTE(review): presumably resumes from the newest checkpoint in `output_dir` when one exists — confirm.
resume: latest
# NOTE(review): presumably data-/tensor-/pipeline-parallel sizes; dp_size is left null here.
dp_size:
tp_size: 1
pp_size: 1
exp_name: llama3.1.8b.mathscale4o.dpo.iter1.H100.dp8.v1.0.s${seed}
exp_notes:
# ${output_path_prefix} is concatenated without a separator, so it must end with "/".
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Exponent-form floats carry an explicit decimal point ("8.0e-7", not "8e-7") so that
# YAML 1.1 loaders such as PyYAML also resolve them as floats instead of strings.
# learning_rate: 1.0e-6
learning_rate: 8.0e-7
gradient_accumulation_steps: 64
weight_decay: 0.1
adam_epsilon: 1.0e-6
# Kept as a quoted string, not a YAML list.
adam_betas: "(0.9, 0.98)"
# adam_betas: "(0.9, 0.999)"
# max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
# NOTE(review): `max_steps: 0` presumably means no step cap, so the epoch count governs — confirm.
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 200
save_best: False
eval_steps: 200
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Half-precision flags; `fp16_bfloat16: True` goes with the bfloat16 `torch_dtype` of this file.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
# The tracked metric is "loss"; `best_checkpoint` and `best_result` start out empty (null).
# NOTE(review): `measure: -1` presumably means lower is better — confirm against the evaluator.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and post-processor, each built from its `_target_` path.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings written in this file; the preset chosen in `defaults` is also
# mounted at `ds_cfg`. Batch size, accumulation steps, lr and weight decay are
# interpolated from the top-level keys so each value is defined only once.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
# NOTE(review): differs from the top-level `adam_betas` string "(0.9, 0.98)" — confirm which one takes effect.
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The precision overrides below are commented out, so this file does not set them itself.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Metric writer. NOTE(review): the two quoted names are presumably the logged metric
# names and the bare values the model-output keys they are read from — confirm.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# All left empty here. NOTE(review): presumably populated by the training script at runtime — confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/iter1/llama3.1-dpo-4o-iter0-v1.1-A100-40.yaml
================================================
# Hydra defaults list. `deepspeed@ds_cfg` places the chosen DeepSpeed preset
# under the `ds_cfg` key of this config.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
# `_self_` comes last, so values written in this file are composed after the entries above.
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Extra location Hydra searches for config groups.
hydra:
searchpath:
- file://conf/
# Path prefixes. `data_path_prefix` and `model_path_prefix` are not referenced by any
# interpolation in this file; `output_path_prefix` feeds `sft_model_dir` and `output_dir`.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Checkpoint directory that supplies both the initial weights (`model_name_or_path`)
# and the training pairs (`train_file`).
# NOTE(review): the prefix ends with "/" and the value ends with "/", so the resolved
# paths here and in `train_file` contain "//".
sft_model_dir: ${output_path_prefix}/experiments/llama3.1.8b.mathscale4o.process-dpo.iter0.A100.dp8.v2.2.s42/checkpoint-1200/
train_file: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.prefer_pair.json
# No dev/test files are configured (both null).
dev_file:
test_file:
# EOS token text and id are maintained by hand and must refer to the same token;
# both are reused for padding further down.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# dtype object produced by `return_torch_dtype`; reused below as ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# The tokenizer is loaded from the model location, pads on the left and uses the EOS token as pad token.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
# Device map definition; its only uses in this file (under `model`) are commented out.
device_map:
_target_: models.utils.return_single_device_map
# DPO model loader from `models.llama_tp`: the constructor also receives a `ref_model`
# config (see below). `beta`, `sft_loss` and `sft_loss_weight` are passed through to it.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
# device_map: ${device_map}
# Second model instance, loaded from the same ${model_name_or_path} as the trained model.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: ${eos_token_id}
# Training dataset definition for preference pairs.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# A single aligner wrapped by `concat_aligner`; it works on the `pos` and `neg` fields.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos
neg_field: neg
# `units` are named fragments; `compositions` assemble them into `prompt`,
# `chosen` and `reject`. Both responses are terminated with the EOS token.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}{chat_suffix}"
reject: "{chat_prefix}{neg}{chat_suffix}"
# Text substituted for the `{instruction}` placeholder used in `chat_prefix`.
instruction: "\\boxed{}"
index_field: id
# NOTE(review): key renaming applied to each example; confirm the direction (source -> target) in the dataset class.
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# Receives the tokenizer defined under `tokenizer_init`.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# Initial weights come from the checkpoint directory named by `sft_model_dir`.
model_name_or_path: ${sft_model_dir}
pretrain:
# NOTE(review): presumably resumes from the newest checkpoint in `output_dir` when one exists — confirm.
resume: latest
# NOTE(review): presumably data-/tensor-/pipeline-parallel sizes; dp_size is left null here.
dp_size:
tp_size: 4
pp_size: 1
exp_name: llama3.1.8b.mathscale4o.dpo.iter1.A100-40.tp4dp32.v1.1.s${seed}
exp_notes:
# ${output_path_prefix} is concatenated without a separator, so it must end with "/".
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Exponent-form floats carry an explicit decimal point ("2.0e-6", not "2e-6") so that
# YAML 1.1 loaders such as PyYAML also resolve them as floats instead of strings.
learning_rate: 2.0e-6
# learning_rate: 8.0e-7
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1.0e-6
# Kept as a quoted string, not a YAML list.
adam_betas: "(0.9, 0.98)"
# adam_betas: "(0.9, 0.999)"
# max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
# NOTE(review): `max_steps: 0` presumably means no step cap, so the epoch count governs — confirm.
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Half-precision flags; `fp16_bfloat16: True` goes with the bfloat16 `torch_dtype` of this file.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
# The tracked metric is "loss"; `best_checkpoint` and `best_result` start out empty (null).
# NOTE(review): `measure: -1` presumably means lower is better — confirm against the evaluator.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and post-processor, each built from its `_target_` path.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings written in this file; the preset chosen in `defaults` is also
# mounted at `ds_cfg`. Batch size, accumulation steps, lr and weight decay are
# interpolated from the top-level keys so each value is defined only once.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
# NOTE(review): differs from the top-level `adam_betas` string "(0.9, 0.98)" — confirm which one takes effect.
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The precision overrides below are commented out, so this file does not set them itself.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Metric writer. NOTE(review): the two quoted names are presumably the logged metric
# names and the bare values the model-output keys they are read from — confirm.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# All left empty here. NOTE(review): presumably populated by the training script at runtime — confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/iter1/llama3.1-pdpo-4o-iter1-1.0-A100.yaml
================================================
# Hydra defaults list. `deepspeed@ds_cfg` places the chosen DeepSpeed preset
# under the `ds_cfg` key of this config.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
# `_self_` comes last, so values written in this file are composed after the entries above.
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Extra location Hydra searches for config groups.
hydra:
searchpath:
- file://conf/
# Path prefixes. `data_path_prefix` is not referenced by any interpolation in this file.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Checkpoint directory that supplies the initial weights (`model_name_or_path`) and
# the preference data files referenced by the dataset sections.
# NOTE(review): prefixes ending in "/" are followed by another "/" here and in
# `train_file`, so the resolved paths contain "//".
sft_model_dir: ${output_path_prefix}/experiments/llama3.1.8b.mathscale4o.process-dpo.iter0.A100.dp8.v2.2.s42/checkpoint-1200/
train_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/train.500k.de_con.v1.0.boxed.json
# No dev/test files are configured (both null).
dev_file:
test_file:
# EOS token text and id are maintained by hand and must refer to the same token;
# both are reused for padding further down.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# dtype object produced by `return_torch_dtype`; reused below as ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# The tokenizer is loaded from the model location, pads on the left and uses the EOS token as pad token.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
# Device map built by `return_single_device_map`; reused below as ${device_map}.
device_map:
_target_: models.utils.return_single_device_map
# DPO model loader: the constructor also receives a `ref_model` config (see below).
# `beta`, `sft_loss` and `sft_loss_weight` are passed through to it.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
device_map: ${device_map}
# Second model instance, loaded from the same ${model_name_or_path} as the trained model.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
device_map: ${device_map}
pad_token_id: ${eos_token_id}
# Training dataset: a `ReplayDataset` that combines the two dataset configs defined below.
# `_recursive_: False` stops Hydra from instantiating the nested configs itself, so
# the dataset class receives them as configs.
# NOTE(review): the exact meaning of `replay_ratio: 1.0` should be confirmed in `ReplayDataset`.
read_tensor:
_target_: data.combine_dataset.ReplayDataset
_recursive_: False
new_dataset_cfg: ${read_tensor_pdpo}
old_dataset_cfg: ${read_tensor_dpo}
replay_ratio: 1.0
# "Old" dataset: preference pairs read from an explicit `file_path`.
read_tensor_dpo:
_target_: data.combine_dataset.MultiMappingDataset
file_path: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.prefer_pair.json
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos
neg_field: neg
# Both responses are terminated with the EOS token (`chat_suffix`).
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}{chat_suffix}"
reject: "{chat_prefix}{neg}{chat_suffix}"
instruction: "\\boxed{}"
index_field: id
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
# "New" dataset: no `file_path` is given here.
# NOTE(review): presumably falls back to the top-level `train_file` — confirm.
read_tensor_pdpo:
_target_: data.combine_dataset.MultiMappingDataset
# Three aligners wrapped by `concat_aligner`: extract `value` and `prefix` from
# `extra_file` (matching `id` against `idx`), turn them into `pos_prefix` /
# `neg_prefix` using a threshold mapping, then work on those two fields.
# NOTE(review): presumably applied in the listed order — confirm in `concat_aligner`.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: id
extract_index_field: idx
extract_fields: [ value, prefix ]
extra_file: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample10.filter_same.prefix_completion.n3.tem1.0.p0.9.process_rm_gd.local.json
- _target_: data.input_aligner.value2pair_mapping_aligner
field: prefix
pos_field: pos_prefix
neg_field: neg_prefix
value_field: value
value_mapping_func:
_target_: data.input_aligner.return_threshold_mapping
value_threshold: 1
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos_prefix
neg_field: neg_prefix
sort_accord_to_len: True
top_k: 5
# NOTE(review): separate from the dataloader `num_workers` further down; confirm which component consumes this value.
num_workers: 32
# Unlike the template above, `chosen` and `reject` omit `chat_suffix`, so these
# sequences are not terminated with the EOS token even though the unit is defined.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}"
reject: "{chat_prefix}{neg}"
instruction: "\\boxed{}"
index_field: id
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# Receives the tokenizer defined under `tokenizer_init`.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# Initial weights come from the checkpoint directory named by `sft_model_dir`.
model_name_or_path: ${sft_model_dir}
pretrain:
# NOTE(review): presumably resumes from the newest checkpoint in `output_dir` when one exists — confirm.
resume: latest
# NOTE(review): presumably data-/tensor-/pipeline-parallel sizes; dp_size is left null here.
dp_size:
tp_size: 1
pp_size: 1
exp_name: llama3.1.8b.mathscale4o.process-dpo.iter1.A100.dp8.v1.0.s${seed}
exp_notes:
# ${output_path_prefix} is concatenated without a separator, so it must end with "/".
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Exponent-form floats carry an explicit decimal point ("1.0e-6", not "1e-6") so that
# YAML 1.1 loaders such as PyYAML also resolve them as floats instead of strings.
# learning_rate: 1.0e-4
learning_rate: 1.0e-6
# learning_rate: 2.0e-5
gradient_accumulation_steps: 64
weight_decay: 0.1
adam_epsilon: 1.0e-6
# Kept as a quoted string, not a YAML list.
adam_betas: "(0.9, 0.98)"
# adam_betas: "(0.9, 0.999)"
# max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
# NOTE(review): `max_steps: 0` presumably means no step cap, so the epoch count governs — confirm.
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Half-precision flags; `fp16_bfloat16: True` goes with the bfloat16 `torch_dtype` of this file.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
# The tracked metric is "loss"; `best_checkpoint` and `best_result` start out empty (null).
# NOTE(review): `measure: -1` presumably means lower is better — confirm against the evaluator.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and post-processor, each built from its `_target_` path.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings written in this file; the preset chosen in `defaults` is also
# mounted at `ds_cfg`. Batch size, accumulation steps, lr and weight decay are
# interpolated from the top-level keys so each value is defined only once.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
# NOTE(review): differs from the top-level `adam_betas` string "(0.9, 0.98)" — confirm which one takes effect.
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The precision overrides below are commented out, so this file does not set them itself.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Metric writer. NOTE(review): the two quoted names are presumably the logged metric
# names and the bare values the model-output keys they are read from — confirm.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# All left empty here. NOTE(review): presumably populated by the training script at runtime — confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/iter1/llama3.1-pdpo-4o-iter1-1.1-v100.yaml
================================================
# Hydra defaults list. `deepspeed@ds_cfg` places the chosen DeepSpeed preset
# under the `ds_cfg` key of this config.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
# `_self_` comes last, so values written in this file are composed after the entries above.
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Extra location Hydra searches for config groups.
hydra:
searchpath:
- file://conf/
# Path prefixes. `data_path_prefix` is not referenced by any interpolation in this file.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Checkpoint directory that supplies the initial weights (`model_name_or_path`) and
# the `extra_file` used by the dataset section.
# NOTE(review): prefixes ending in "/" are followed by another "/" here and in
# `train_file`, so the resolved paths contain "//".
sft_model_dir: ${output_path_prefix}/experiments/llama3.1.8b.mathscale4o.process-dpo.iter0.A100.dp8.v2.2.s42/checkpoint-1200/
train_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/train.500k.de_con.v1.0.boxed.json
# No dev/test files are configured (both null).
dev_file:
test_file:
# EOS token text and id are maintained by hand and must refer to the same token;
# both are reused for padding further down.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# dtype object produced by `return_torch_dtype`; reused below as ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# The tokenizer is loaded from the model location, pads on the left and uses the EOS token as pad token.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
# Device map definition; its only uses in this file (under `model`) are commented out.
device_map:
_target_: models.utils.return_single_device_map
# DPO model loader from `models.llama_tp`: the constructor also receives a `ref_model`
# config (see below). Attention uses "sdpa"; the FlashAttention-2 option is commented out.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
# device_map: ${device_map}
# Second model instance, loaded from the same ${model_name_or_path} as the trained model.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "sdpa"
# device_map: ${device_map}
pad_token_id: ${eos_token_id}
# Disabled alternative: a `ReplayDataset` mixing an old DPO dataset with the dataset
# defined below. Kept for reference; the active `read_tensor` follows this commented block.
#read_tensor:
# _target_: data.combine_dataset.ReplayDataset
# _recursive_: False
# new_dataset_cfg: ${read_tensor_pdpo}
# old_dataset_cfg: ${read_tensor_dpo}
# replay_ratio: 1.0
#read_tensor_dpo:
# _target_: data.combine_dataset.MultiMappingDataset
# file_path: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.prefer_pair.json
# aligner:
# _target_: data.input_aligner.concat_aligner
# aligners:
# - _target_: data.input_aligner.dpo_bi_random_choice_aligner
# pos_field: pos
# neg_field: neg
# template:
# _target_: data.input_utils.recompose_template
# units:
# chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
# pos: "{pos}"
# neg: "{neg}"
# chat_suffix: ${eos_token}
# compositions:
# prompt: "{chat_prefix}"
# chosen: "{chat_prefix}{pos}{chat_suffix}"
# reject: "{chat_prefix}{neg}{chat_suffix}"
# instruction: "\\boxed{}"
# index_field: id
# kv_mapping:
# chosen: chosen
# reject: reject
# id: index
# prompt: prompt
#read_tensor_pdpo:
# Active training dataset; no `file_path` is given here.
# NOTE(review): presumably falls back to the top-level `train_file` — confirm.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# Three aligners wrapped by `concat_aligner`: extract `value` and `prefix` from
# `extra_file` (matching `id` against `idx`), turn them into `pos_prefix` /
# `neg_prefix` using a threshold mapping, then work on those two fields.
# NOTE(review): presumably applied in the listed order — confirm in `concat_aligner`.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: id
extract_index_field: idx
extract_fields: [ value, prefix ]
extra_file: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample10.filter_same.prefix_completion.n3.tem1.0.p0.9.process_rm_gd.local.json
- _target_: data.input_aligner.value2pair_mapping_aligner
field: prefix
pos_field: pos_prefix
neg_field: neg_prefix
value_field: value
value_mapping_func:
_target_: data.input_aligner.return_threshold_mapping
value_threshold: 1
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos_prefix
neg_field: neg_prefix
sort_accord_to_len: True
top_k: 5
# NOTE(review): separate from the dataloader `num_workers` further down; confirm which component consumes this value.
num_workers: 32
# `chosen` and `reject` omit `chat_suffix`, so these sequences are not terminated
# with the EOS token even though the unit is defined.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}"
reject: "{chat_prefix}{neg}"
instruction: "\\boxed{}"
index_field: id
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# Receives the tokenizer defined under `tokenizer_init`.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# Initial weights come from the checkpoint directory named by `sft_model_dir`.
model_name_or_path: ${sft_model_dir}
pretrain:
# NOTE(review): presumably resumes from the newest checkpoint in `output_dir` when one exists — confirm.
resume: latest
# NOTE(review): presumably data-/tensor-/pipeline-parallel sizes; dp_size is left null here.
dp_size:
tp_size: 4
pp_size: 1
exp_name: llama3.1.8b.mathscale4o.process-dpo.iter1.V100.tp4dp64.v1.1.s${seed}
exp_notes:
# ${output_path_prefix} is concatenated without a separator, so it must end with "/".
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 2
per_gpu_eval_batch_size: 2
# Exponent-form floats carry an explicit decimal point ("1.0e-6", not "1e-6") so that
# YAML 1.1 loaders such as PyYAML also resolve them as floats instead of strings.
# learning_rate: 1.0e-4
learning_rate: 1.0e-6
# learning_rate: 2.0e-5
gradient_accumulation_steps: 4
weight_decay: 0.1
adam_epsilon: 1.0e-6
# Kept as a quoted string, not a YAML list.
adam_betas: "(0.9, 0.98)"
# adam_betas: "(0.9, 0.999)"
# max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
# NOTE(review): `max_steps: 0` presumably means no step cap, so the epoch count governs — confirm.
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Half-precision flags; `fp16_bfloat16: False` goes with the float16 `torch_dtype` of this file.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
# The tracked metric is "loss"; `best_checkpoint` and `best_result` start out empty (null).
# NOTE(review): `measure: -1` presumably means lower is better — confirm against the evaluator.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
# Evaluation forward function and post-processor, each built from its `_target_` path.
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# DeepSpeed settings written in this file; the preset chosen in `defaults` is also
# mounted at `ds_cfg`. Batch size, accumulation steps, lr and weight decay are
# interpolated from the top-level keys so each value is defined only once.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
# NOTE(review): differs from the top-level `adam_betas` string "(0.9, 0.98)" — confirm which one takes effect.
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# bf16 is switched off and fp16 mixed precision is switched on.
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
# A loss_scale of 0 selects DeepSpeed's dynamic loss scaling; the keys below tune it
# (the initial scale is 2^initial_scale_power).
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Metric writer. NOTE(review): the two quoted names are presumably the logged metric
# names and the bare values the model-output keys they are read from — confirm.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# All left empty here. NOTE(review): presumably populated by the training script at runtime — confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/llama3.1-dpo-4o-iter0-v1.0-v100.yaml
================================================
# Hydra defaults list. `deepspeed@ds_cfg` places the chosen DeepSpeed preset
# under the `ds_cfg` key of this config.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
# `_self_` comes last, so values written in this file are composed after the entries above.
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Extra location Hydra searches for config groups.
hydra:
searchpath:
- file://conf/
# Path prefixes; `model_path_prefix` feeds `sft_model_dir` and `output_path_prefix` feeds `output_dir`.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Model directory that supplies both the initial weights (`model_name_or_path`)
# and the training pairs (`train_file`).
# NOTE(review): the prefix ends with "/" and the value ends with "/", so the resolved
# paths here and in `train_file` contain "//".
sft_model_dir: ${model_path_prefix}/llama3.1_8b_mathscale4o/model_lr1e-5_batch512_epochs3_gpus8_linearSchedule/
train_file: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.prefer_pair.json
# No dev/test files are configured (both null).
dev_file:
test_file:
# EOS token text and id are maintained by hand and must refer to the same token;
# both are reused for padding further down.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# dtype object produced by `return_torch_dtype`; reused below as ${torch_dtype}.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# The tokenizer is loaded from the model location, pads on the left and uses the EOS token as pad token.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
# Device map definition; its only uses in this file (under `model`) are commented out.
device_map:
_target_: models.utils.return_single_device_map
# DPO model loader from `models.llama_tp`: the constructor also receives a `ref_model`
# config (see below). Attention is "eager"; the FlashAttention-2 option is commented out.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
# device_map: ${device_map}
# Second model instance, loaded from the same ${model_name_or_path} as the trained model.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: ${eos_token_id}
# Training dataset definition for preference pairs.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# A single aligner wrapped by `concat_aligner`; it works on the `pos` and `neg` fields.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos
neg_field: neg
# `units` are named fragments; `compositions` assemble them into `prompt`,
# `chosen` and `reject`. Both responses are terminated with the EOS token.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}{chat_suffix}"
reject: "{chat_prefix}{neg}{chat_suffix}"
# Text substituted for the `{instruction}` placeholder used in `chat_prefix`.
instruction: "\\boxed{}"
index_field: id
# NOTE(review): key renaming applied to each example; confirm the direction (source -> target) in the dataset class.
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
# Receives the tokenizer defined under `tokenizer_init`.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest
dp_size:
tp_size: 4
pp_size: 1
exp_name: llama3.1.8b.mathscale4o.dpo.iter0.V100.tp4dp8.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 64
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 200
save_best: False
eval_steps: 200
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
bf16:
enabled: False
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/llama3.1-pdpo-4o-iter0-v1.0-v100.yaml
================================================
# Hydra experiment config: process-DPO ("pdpo") on solution prefixes for the
# Llama-3.1-8B MathScale-4o SFT checkpoint (iter0, v1.0), fp16 with tp_size 4.
# `deepspeed@ds_cfg` places the selected deepspeed group config under `ds_cfg`;
# `_self_` is listed last so the keys in this file override the composed defaults.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_cosine
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Extra Hydra config search path pointing at the local conf/ directory.
hydra:
searchpath:
- file://conf/
# Storage roots. data_path_prefix is not referenced anywhere else in this file.
# NOTE(review): model_path_prefix and sft_model_dir both end with "/" and the
# interpolations below add another "/", so resolved paths contain "//".
# Presumably harmless on POSIX filesystems; confirm before normalizing.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
sft_model_dir: ${model_path_prefix}/llama3.1_8b_mathscale4o/model_lr1e-5_batch512_epochs3_gpus8_linearSchedule/
# NOTE(review): the base training file lives under the mathstral-7B-v0.1
# directory although this config trains the Llama SFT model; presumably it is a
# model-independent question file - confirm.
train_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/train.500k.de_con.v1.0.boxed.json
# No dev/test split is configured (both null).
dev_file:
test_file:
# EOS token string and id; both are reused as the padding token below.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# Model dtype, resolved through return_torch_dtype.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is loaded from the same path as the model, pads on the left, and
# uses the EOS token as pad token.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
# Defined for reuse, but both `device_map: ${device_map}` references in the
# model section are commented out in this config.
device_map:
_target_: models.utils.return_single_device_map
# Policy model (tensor-parallel Llama DPO wrapper) created together with a
# reference model. beta / sft_loss / sft_loss_weight are passed to the model
# constructor; presumably beta is the DPO temperature and sft_loss_weight scales
# an auxiliary SFT term - verify in models.llama_tp.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
# Eager attention is selected instead of flash_attention_2; presumably because
# this config targets V100 GPUs (see exp_name) - TODO confirm.
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
# device_map: ${device_map}
# Reference model spec: loaded from the same checkpoint path and with the same
# dtype / attention / pad settings as the policy model.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: ${eos_token_id}
# Training dataset: items are passed through the three-stage aligner chain,
# rendered with the template, and exposed under the names given in kv_mapping.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Stage 1: pulls the `value` and `prefix` fields from extra_file; presumably
# joins on item `id` == extra-file `idx` - verify in data.input_aligner.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: id
extract_index_field: idx
extract_fields: [ value, prefix ]
extra_file: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample10.filter_same.prefix_completion.n3.tem1.0.p0.9.process_rm_gd.binary.local.json
# Stage 2: presumably splits `prefix` entries into pos_prefix / neg_prefix
# according to their `value` label - verify.
- _target_: data.input_aligner.value2pair_aligner
field: prefix
pos_field: pos_prefix
neg_field: neg_prefix
value_field: value
# Stage 3: works on pos_prefix / neg_prefix.
# NOTE(review): the template below formats {pos} / {neg}, not {pos_prefix} /
# {neg_prefix}; presumably this aligner writes the sampled pair to `pos` / `neg`
# - confirm.
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos_prefix
neg_field: neg_prefix
# Prompt template: `units` are the building blocks, `compositions` assemble them.
# chat_suffix (EOS) is defined but not used by any composition here, so chosen /
# reject carry no EOS; presumably because prefixes are partial solutions.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}"
reject: "{chat_prefix}{neg}"
# Double-quoted YAML: "\\" is an escaped backslash, so the value is \boxed{}.
instruction: "\\boxed{}"
index_field: id
# Field renaming between dataset items and collator inputs; presumably
# source -> target (e.g. id -> index) - verify direction in MultiMappingDataset.
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
dist_load_data_barrier: False
# No vocabulary extension (null).
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# Training starts from the SFT checkpoint.
model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest
# Parallel layout: dp_size is left null, tensor-parallel 4, no pipeline parallel.
dp_size:
tp_size: 4
pp_size: 1
# Run name; also determines output_dir. The seed is interpolated into the name.
exp_name: llama3.1.8b.mathscale4o.process-dpo.iter0.V100.tp4dp32.v1.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Train only; evaluation during and after training is switched off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# Samples per optimizer step = per-GPU batch x accumulation x data-parallel size;
# if dp=32 as exp_name suggests, that is 1 x 16 x 32 = 512 - TODO confirm dp.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): differs from ds_cfg.optimizer.params.betas ([0.9, 0.95]) below;
# confirm which of the two settings the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# max_steps / warmup_steps are 0 while warmup_proportion is 0.03; presumably 0
# means "derive from epochs / proportion" - verify in the trainer.
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
# All three left null here; the optimizer itself is configured in ds_cfg below.
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpointing cadence, in steps.
logging_steps: 1
save_ds_state: True
save_steps: 200
save_best: False
eval_steps: 200
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed precision: fp16 (not bfloat16), matching torch_dtype and ds_cfg.fp16.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides applied on top of the deepspeed default selected in `defaults`.
# Batch size, accumulation, lr and weight decay are interpolated from the
# top-level keys so the two places cannot drift apart.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
bf16:
enabled: False
# fp16 engine settings. Per the DeepSpeed config docs, loss_scale 0 selects
# dynamic loss scaling and initial_scale_power 16 starts it at 2^16.
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Metric logging through the W&B writer; presumably maps logged metric name ->
# model output key - verify in general_util.tensorboard_helper.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left null in the config; presumably filled in by the training script at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/llama3.1-pdpo-4o-iter0-v1.1-H100.yaml
================================================
# Hydra experiment config: process-DPO ("pdpo") on solution prefixes for the
# Llama-3.1-8B MathScale-4o SFT checkpoint (iter0, v1.1), bfloat16 with
# flash-attention and no tensor parallelism (tp_size 1, models.llama).
# `deepspeed@ds_cfg` places the selected deepspeed group config under `ds_cfg`;
# `_self_` is listed last so the keys in this file override the composed defaults.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Extra Hydra config search path pointing at the local conf/ directory.
hydra:
searchpath:
- file://conf/
# Storage roots. data_path_prefix is not referenced anywhere else in this file.
# NOTE(review): model_path_prefix and sft_model_dir both end with "/" and the
# interpolations below add another "/", so resolved paths contain "//".
# Presumably harmless on POSIX filesystems; confirm before normalizing.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
sft_model_dir: ${model_path_prefix}/llama3.1_8b_mathscale4o/model_lr1e-5_batch512_epochs3_gpus8_linearSchedule/
# NOTE(review): the base training file lives under the mathstral-7B-v0.1
# directory although this config trains the Llama SFT model; presumably it is a
# model-independent question file - confirm.
train_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/train.500k.de_con.v1.0.boxed.json
# No dev/test split is configured (both null).
dev_file:
test_file:
# EOS token string and id; both are reused as the padding token below.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# Model dtype, resolved through return_torch_dtype.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer is loaded from the same path as the model, pads on the left, and
# uses the EOS token as pad token.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
# Single-device map; passed to both the policy and the reference model below.
device_map:
_target_: models.utils.return_single_device_map
# Policy model (non-tensor-parallel Llama DPO wrapper) created together with a
# reference model. Presumably beta is the DPO temperature - verify in
# models.llama. The auxiliary SFT loss options are commented out in this variant.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
# sft_loss: True
# sft_loss_weight: 0.2
gradient_checkpointing: True
# flash_attention_2 is active here; the eager alternative is kept commented out.
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
device_map: ${device_map}
# Reference model spec: loaded from the same checkpoint path and with the same
# dtype / attention / device / pad settings as the policy model.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
device_map: ${device_map}
pad_token_id: ${eos_token_id}
# Training dataset: items are passed through the three-stage aligner chain,
# rendered with the template, and exposed under the names given in kv_mapping.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Stage 1: pulls the `value` and `prefix` fields from extra_file; presumably
# joins on item `id` == extra-file `idx` - verify in data.input_aligner.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: id
extract_index_field: idx
extract_fields: [ value, prefix ]
extra_file: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample10.filter_same.prefix_completion.n3.tem1.0.p0.9.process_rm_gd.binary.local.json
# Stage 2: presumably splits `prefix` entries into pos_prefix / neg_prefix
# according to their `value` label - verify.
- _target_: data.input_aligner.value2pair_aligner
field: prefix
pos_field: pos_prefix
neg_field: neg_prefix
value_field: value
# Stage 3: anchored on pos_prefix and paired with neg_prefix; the template
# units below read these two fields directly.
- _target_: data.input_aligner.dpo_random_choice_aligner
anchor_field: pos_prefix
paired_field: neg_prefix
# Prompt template: `units` are the building blocks, `compositions` assemble them.
# chat_suffix (EOS) is defined but not used by any composition here, so chosen /
# reject carry no EOS; presumably because prefixes are partial solutions.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos_prefix}"
neg: "{neg_prefix}"
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}"
reject: "{chat_prefix}{neg}"
# Double-quoted YAML: "\\" is an escaped backslash, so the value is \boxed{}.
instruction: "\\boxed{}"
index_field: id
# Field renaming between dataset items and collator inputs; presumably
# source -> target (e.g. id -> index) - verify direction in MultiMappingDataset.
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
dist_load_data_barrier: False
# No vocabulary extension (null).
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# Training starts from the SFT checkpoint.
model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest
# Parallel layout: dp_size is left null, no tensor or pipeline parallelism.
dp_size:
tp_size: 1
pp_size: 1
# Run name; also determines output_dir. The seed is interpolated into the name.
exp_name: llama3.1.8b.mathscale4o.process-dpo.iter0.H100.dp8.v1.1.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Train only; evaluation during and after training is switched off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# Samples per optimizer step = per-GPU batch x accumulation x data-parallel size;
# if dp=8 as exp_name suggests, that is 1 x 64 x 8 = 512 - TODO confirm dp.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 64
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): differs from ds_cfg.optimizer.params.betas ([0.9, 0.95]) below;
# confirm which of the two settings the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
# max_steps / warmup_steps are 0 while warmup_proportion is 0.03; presumably 0
# means "derive from epochs / proportion" - verify in the trainer.
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
# All three left null here; the optimizer itself is configured in ds_cfg below.
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpointing cadence, in steps.
logging_steps: 1
save_ds_state: True
save_steps: 200
save_best: False
eval_steps: 200
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed precision: fp16 stays True with fp16_bfloat16 True, matching the
# bfloat16 torch_dtype; presumably the pair selects bf16 autocast - verify.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides applied on top of the deepspeed default selected in `defaults`.
# Batch size, accumulation, lr and weight decay are interpolated from the
# top-level keys so the two places cannot drift apart.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The bf16 / fp16 engine overrides are commented out, so the precision
# settings of the composed deepspeed default apply unchanged.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Metric logging through the W&B writer; presumably maps logged metric name ->
# model output key - verify in general_util.tensorboard_helper.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left null in the config; presumably filled in by the training script at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/llama3.1-pdpo-4o-iter0-v1.2-V100.yaml
================================================
# Hydra experiment config: process-DPO ("pdpo") with replay of full-response
# DPO pairs for the Llama-3.1-8B MathScale-4o SFT checkpoint (iter0, v1.2),
# fp16 with tp_size 4.
# `deepspeed@ds_cfg` places the selected deepspeed group config under `ds_cfg`;
# `_self_` is listed last so the keys in this file override the composed defaults.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_cosine
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Extra Hydra config search path pointing at the local conf/ directory.
hydra:
searchpath:
- file://conf/
# Storage roots. data_path_prefix is not referenced anywhere else in this file.
# NOTE(review): model_path_prefix and sft_model_dir both end with "/" and the
# interpolations below add another "/", so resolved paths contain "//".
# Presumably harmless on POSIX filesystems; confirm before normalizing.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
sft_model_dir: ${model_path_prefix}/llama3.1_8b_mathscale4o/model_lr1e-5_batch512_epochs3_gpus8_linearSchedule/
# NOTE(review): the base training file lives under the mathstral-7B-v0.1
# directory although this config trains the Llama SFT model; presumably it is a
# model-independent question file - confirm.
train_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/train.500k.de_con.v1.0.boxed.json
# No dev/test split is configured (both null).
dev_file:
test_file:
# EOS token string and id; both are reused as the padding token below.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# Model dtype, resolved through return_torch_dtype.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is loaded from the same path as the model, pads on the left, and
# uses the EOS token as pad token.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
# Defined for reuse, but both `device_map: ${device_map}` references in the
# model section are commented out in this config.
device_map:
_target_: models.utils.return_single_device_map
# Policy model (tensor-parallel Llama DPO wrapper) created together with a
# reference model. beta / sft_loss / sft_loss_weight are passed to the model
# constructor; presumably beta is the DPO temperature and sft_loss_weight scales
# an auxiliary SFT term - verify in models.llama_tp.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
# Eager attention is selected instead of flash_attention_2; presumably because
# this config targets V100 GPUs (see exp_name) - TODO confirm.
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
# device_map: ${device_map}
# Reference model spec: loaded from the same checkpoint path and with the same
# dtype / attention / pad settings as the policy model.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: ${eos_token_id}
# Training dataset: ReplayDataset combines the prefix-level set ("new") with
# the full-response DPO set ("old"). `_recursive_: False` stops Hydra from
# instantiating the two nested dataset configs itself, so ReplayDataset receives
# them as configs. Presumably replay_ratio controls how much old data is mixed
# in - verify in data.combine_dataset.
read_tensor:
_target_: data.combine_dataset.ReplayDataset
_recursive_: False
new_dataset_cfg: ${read_tensor_pdpo}
old_dataset_cfg: ${read_tensor_dpo}
replay_ratio: 1.0
# "Old" dataset: full-response preference pairs read from its own file_path.
read_tensor_dpo:
_target_: data.combine_dataset.MultiMappingDataset
file_path: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.prefer_pair.json
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Works on the `pos` / `neg` fields of each item; presumably samples one
# positive/negative pair per item - verify in data.input_aligner.
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos
neg_field: neg
# Full responses: chosen / reject are prompt + response + EOS.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}{chat_suffix}"
reject: "{chat_prefix}{neg}{chat_suffix}"
# Double-quoted YAML: "\\" is an escaped backslash, so the value is \boxed{}.
instruction: "\\boxed{}"
index_field: id
# Field renaming between dataset items and collator inputs; presumably
# source -> target (e.g. id -> index) - verify direction in MultiMappingDataset.
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
# "New" dataset: prefix-level pairs. No file_path is given here; presumably the
# top-level train_file is supplied by the caller - verify.
read_tensor_pdpo:
_target_: data.combine_dataset.MultiMappingDataset
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Stage 1: pulls the `value` and `prefix` fields from extra_file; presumably
# joins on item `id` == extra-file `idx` - verify in data.input_aligner.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: id
extract_index_field: idx
extract_fields: [ value, prefix ]
extra_file: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample10.filter_same.prefix_completion.n3.tem1.0.p0.9.process_rm_gd.binary.local.json
# Stage 2: presumably splits `prefix` entries into pos_prefix / neg_prefix
# according to their `value` label - verify.
- _target_: data.input_aligner.value2pair_aligner
field: prefix
pos_field: pos_prefix
neg_field: neg_prefix
value_field: value
# Stage 3: anchored on pos_prefix and paired with neg_prefix; the template
# units below read these two fields directly.
- _target_: data.input_aligner.dpo_random_choice_aligner
anchor_field: pos_prefix
paired_field: neg_prefix
# chat_suffix (EOS) is defined but not used by any composition here, so chosen /
# reject carry no EOS; presumably because prefixes are partial solutions.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos_prefix}"
neg: "{neg_prefix}"
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}"
reject: "{chat_prefix}{neg}"
# Double-quoted YAML: "\\" is an escaped backslash, so the value is \boxed{}.
instruction: "\\boxed{}"
index_field: id
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
dist_load_data_barrier: False
# No vocabulary extension (null).
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# Training starts from the SFT checkpoint.
model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest
# Parallel layout: dp_size is left null, tensor-parallel 4, no pipeline parallel.
dp_size:
tp_size: 4
pp_size: 1
# Run name; also determines output_dir. The seed is interpolated into the name.
exp_name: llama3.1.8b.mathscale4o.process-dpo.iter0.V100.tp4dp64.v1.2.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Train only; evaluation during and after training is switched off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# Samples per optimizer step = per-GPU batch x accumulation x data-parallel size;
# if dp=64 as exp_name suggests, that is 1 x 8 x 64 = 512 - TODO confirm dp.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 8
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): differs from ds_cfg.optimizer.params.betas ([0.9, 0.95]) below;
# confirm which of the two settings the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
# max_steps / warmup_steps are 0 while warmup_proportion is 0.03; presumably 0
# means "derive from epochs / proportion" - verify in the trainer.
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
# All three left null here; the optimizer itself is configured in ds_cfg below.
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpointing cadence, in steps.
logging_steps: 1
save_ds_state: True
save_steps: 200
save_best: False
eval_steps: 200
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed precision: fp16 (not bfloat16), matching torch_dtype and ds_cfg.fp16.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides applied on top of the deepspeed default selected in `defaults`.
# Batch size, accumulation, lr and weight decay are interpolated from the
# top-level keys so the two places cannot drift apart.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
bf16:
enabled: False
# fp16 engine settings. Per the DeepSpeed config docs, loss_scale 0 selects
# dynamic loss scaling and initial_scale_power 16 starts it at 2^16.
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Metric logging through the W&B writer; presumably maps logged metric name ->
# model output key - verify in general_util.tensorboard_helper.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left null in the config; presumably filled in by the training script at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/llama3.1-pdpo-4o-iter0-v1.2-a100-40.yaml
================================================
# Hydra experiment config: process-DPO ("pdpo") with replay of full-response
# DPO pairs for the Llama-3.1-8B MathScale-4o SFT checkpoint (iter0, v1.2),
# bfloat16 with flash-attention and tp_size 4.
# `deepspeed@ds_cfg` places the selected deepspeed group config under `ds_cfg`;
# `_self_` is listed last so the keys in this file override the composed defaults.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_cosine
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Extra Hydra config search path pointing at the local conf/ directory.
hydra:
searchpath:
- file://conf/
# Storage roots. data_path_prefix is not referenced anywhere else in this file.
# NOTE(review): model_path_prefix and sft_model_dir both end with "/" and the
# interpolations below add another "/", so resolved paths contain "//".
# Presumably harmless on POSIX filesystems; confirm before normalizing.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
sft_model_dir: ${model_path_prefix}/llama3.1_8b_mathscale4o/model_lr1e-5_batch512_epochs3_gpus8_linearSchedule/
# NOTE(review): the base training file lives under the mathstral-7B-v0.1
# directory although this config trains the Llama SFT model; presumably it is a
# model-independent question file - confirm.
train_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/train.500k.de_con.v1.0.boxed.json
# No dev/test split is configured (both null).
dev_file:
test_file:
# EOS token string and id; both are reused as the padding token below.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# Model dtype, resolved through return_torch_dtype.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
# Tokenizer is loaded from the same path as the model, pads on the left, and
# uses the EOS token as pad token.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
# Defined for reuse, but both `device_map: ${device_map}` references in the
# model section are commented out in this config.
device_map:
_target_: models.utils.return_single_device_map
# Policy model (tensor-parallel Llama DPO wrapper) created together with a
# reference model. beta / sft_loss / sft_loss_weight are passed to the model
# constructor; presumably beta is the DPO temperature and sft_loss_weight scales
# an auxiliary SFT term - verify in models.llama_tp.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
# flash_attention_2 is active here; the eager alternative is kept commented out.
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
# device_map: ${device_map}
# Reference model spec: loaded from the same checkpoint path and with the same
# dtype / attention / pad settings as the policy model.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: ${eos_token_id}
# Training dataset: ReplayDataset combines the prefix-level set ("new") with
# the full-response DPO set ("old"). `_recursive_: False` stops Hydra from
# instantiating the two nested dataset configs itself, so ReplayDataset receives
# them as configs. Presumably replay_ratio controls how much old data is mixed
# in - verify in data.combine_dataset.
read_tensor:
_target_: data.combine_dataset.ReplayDataset
_recursive_: False
new_dataset_cfg: ${read_tensor_pdpo}
old_dataset_cfg: ${read_tensor_dpo}
replay_ratio: 1.0
# "Old" dataset: full-response preference pairs read from its own file_path.
read_tensor_dpo:
_target_: data.combine_dataset.MultiMappingDataset
file_path: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.prefer_pair.json
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Works on the `pos` / `neg` fields of each item; presumably samples one
# positive/negative pair per item - verify in data.input_aligner.
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos
neg_field: neg
# Full responses: chosen / reject are prompt + response + EOS.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}{chat_suffix}"
reject: "{chat_prefix}{neg}{chat_suffix}"
# Double-quoted YAML: "\\" is an escaped backslash, so the value is \boxed{}.
instruction: "\\boxed{}"
index_field: id
# Field renaming between dataset items and collator inputs; presumably
# source -> target (e.g. id -> index) - verify direction in MultiMappingDataset.
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
# "New" dataset: prefix-level pairs. No file_path is given here; presumably the
# top-level train_file is supplied by the caller - verify.
read_tensor_pdpo:
_target_: data.combine_dataset.MultiMappingDataset
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Stage 1: pulls the `value` and `prefix` fields from extra_file; presumably
# joins on item `id` == extra-file `idx` - verify in data.input_aligner.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: id
extract_index_field: idx
extract_fields: [ value, prefix ]
extra_file: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample10.filter_same.prefix_completion.n3.tem1.0.p0.9.process_rm_gd.binary.local.json
# Stage 2: presumably splits `prefix` entries into pos_prefix / neg_prefix
# according to their `value` label - verify.
- _target_: data.input_aligner.value2pair_aligner
field: prefix
pos_field: pos_prefix
neg_field: neg_prefix
value_field: value
# Stage 3: anchored on pos_prefix and paired with neg_prefix; the template
# units below read these two fields directly.
- _target_: data.input_aligner.dpo_random_choice_aligner
anchor_field: pos_prefix
paired_field: neg_prefix
# chat_suffix (EOS) is defined but not used by any composition here, so chosen /
# reject carry no EOS; presumably because prefixes are partial solutions.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos_prefix}"
neg: "{neg_prefix}"
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}"
reject: "{chat_prefix}{neg}"
# Double-quoted YAML: "\\" is an escaped backslash, so the value is \boxed{}.
instruction: "\\boxed{}"
index_field: id
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
dist_load_data_barrier: False
# No vocabulary extension (null).
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# Training starts from the SFT checkpoint.
model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest
# Parallel layout: dp_size is left null, tensor-parallel 4, no pipeline parallel.
dp_size:
tp_size: 4
pp_size: 1
# Run name; also determines output_dir. The seed is interpolated into the name.
exp_name: llama3.1.8b.mathscale4o.process-dpo.iter0.A100-40.tp4dp4.v1.2.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Train only; evaluation during and after training is switched off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# Samples per optimizer step = per-GPU batch x accumulation x data-parallel size;
# if dp=4 as exp_name suggests, that is 2 x 64 x 4 = 512 - TODO confirm dp.
per_gpu_train_batch_size: 2
per_gpu_eval_batch_size: 2
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 64
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): differs from ds_cfg.optimizer.params.betas ([0.9, 0.95]) below;
# confirm which of the two settings the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 1
# max_steps / warmup_steps are 0 while warmup_proportion is 0.03; presumably 0
# means "derive from epochs / proportion" - verify in the trainer.
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
# All three left null here; the optimizer itself is configured in ds_cfg below.
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpointing cadence, in steps.
logging_steps: 1
save_ds_state: True
save_steps: 200
save_best: False
eval_steps: 200
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed precision: fp16 stays True with fp16_bfloat16 True, matching the
# bfloat16 torch_dtype; presumably the pair selects bf16 autocast - verify.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides applied on top of the deepspeed default selected in `defaults`.
# Batch size, accumulation, lr and weight decay are interpolated from the
# top-level keys so the two places cannot drift apart.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The bf16 / fp16 engine overrides are commented out, so the precision
# settings of the composed deepspeed default apply unchanged.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Metric logging through the W&B writer; presumably maps logged metric name ->
# model output key - verify in general_util.tensorboard_helper.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left null in the config; presumably filled in by the training script at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/llama3.1-pdpo-4o-iter0-v2.0-v100.yaml
================================================
# Hydra experiment config: process-DPO ("pdpo") on solution prefixes with
# threshold-mapped prefix values, for the Llama-3.1-8B MathScale-4o SFT
# checkpoint (iter0, v2.0), fp16 with tp_size 4.
# `deepspeed@ds_cfg` places the selected deepspeed group config under `ds_cfg`;
# `_self_` is listed last so the keys in this file override the composed defaults.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_cosine
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
# Extra Hydra config search path pointing at the local conf/ directory.
hydra:
searchpath:
- file://conf/
# Storage roots. data_path_prefix is not referenced anywhere else in this file.
# NOTE(review): model_path_prefix and sft_model_dir both end with "/" and the
# interpolations below add another "/", so resolved paths contain "//".
# Presumably harmless on POSIX filesystems; confirm before normalizing.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
sft_model_dir: ${model_path_prefix}/llama3.1_8b_mathscale4o/model_lr1e-5_batch512_epochs3_gpus8_linearSchedule/
# NOTE(review): the base training file lives under the mathstral-7B-v0.1
# directory although this config trains the Llama SFT model; presumably it is a
# model-independent question file - confirm.
train_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/train.500k.de_con.v1.0.boxed.json
# No dev/test split is configured (both null).
dev_file:
test_file:
# EOS token string and id; both are reused as the padding token below.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# Model dtype, resolved through return_torch_dtype.
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
# Tokenizer is loaded from the same path as the model, pads on the left, and
# uses the EOS token as pad token.
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
# Defined for reuse, but both `device_map: ${device_map}` references in the
# model section are commented out in this config.
device_map:
_target_: models.utils.return_single_device_map
# Policy model (tensor-parallel Llama DPO wrapper) created together with a
# reference model. beta / sft_loss / sft_loss_weight are passed to the model
# constructor; presumably beta is the DPO temperature and sft_loss_weight scales
# an auxiliary SFT term - verify in models.llama_tp.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
# Eager attention is selected instead of flash_attention_2; presumably because
# this config targets V100 GPUs (see exp_name) - TODO confirm.
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
# device_map: ${device_map}
# Reference model spec: loaded from the same checkpoint path and with the same
# dtype / attention / pad settings as the policy model.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: ${eos_token_id}
# Training dataset: items are passed through the three-stage aligner chain,
# rendered with the template, and exposed under the names given in kv_mapping.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Stage 1: pulls the `value` and `prefix` fields from extra_file; presumably
# joins on item `id` == extra-file `idx` - verify in data.input_aligner.
# Unlike the v1.x configs, the extra file name has no ".binary" component.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: id
extract_index_field: idx
extract_fields: [ value, prefix ]
extra_file: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample10.filter_same.prefix_completion.n3.tem1.0.p0.9.process_rm_gd.local.json
# Stage 2: like value2pair_aligner but with a value mapping function; presumably
# return_threshold_mapping turns each `value` into a positive / negative label
# by comparing it with value_threshold (2) - verify the comparison direction.
- _target_: data.input_aligner.value2pair_mapping_aligner
field: prefix
pos_field: pos_prefix
neg_field: neg_prefix
value_field: value
value_mapping_func:
_target_: data.input_aligner.return_threshold_mapping
value_threshold: 2
# Stage 3: works on pos_prefix / neg_prefix.
# NOTE(review): the template below formats {pos} / {neg}, not {pos_prefix} /
# {neg_prefix}; presumably this aligner writes the sampled pair to `pos` / `neg`
# - confirm.
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos_prefix
neg_field: neg_prefix
# Prompt template: `units` are the building blocks, `compositions` assemble them.
# chat_suffix (EOS) is defined but not used by any composition here, so chosen /
# reject carry no EOS; presumably because prefixes are partial solutions.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}"
reject: "{chat_prefix}{neg}"
# Double-quoted YAML: "\\" is an escaped backslash, so the value is \boxed{}.
instruction: "\\boxed{}"
index_field: id
# Field renaming between dataset items and collator inputs; presumably
# source -> target (e.g. id -> index) - verify direction in MultiMappingDataset.
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
dist_load_data_barrier: False
# No vocabulary extension (null).
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# Training starts from the SFT checkpoint.
model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest
# Parallel layout: dp_size is left null, tensor-parallel 4, no pipeline parallel.
dp_size:
tp_size: 4
pp_size: 1
# Run name; also determines output_dir. The seed is interpolated into the name.
exp_name: llama3.1.8b.mathscale4o.process-dpo.iter0.V100.tp4dp32.v2.0.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
# Train only; evaluation during and after training is switched off.
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# Samples per optimizer step = per-GPU batch x accumulation x data-parallel size;
# if dp=32 as exp_name suggests, that is 1 x 16 x 32 = 512 - TODO confirm dp.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): differs from ds_cfg.optimizer.params.betas ([0.9, 0.95]) below;
# confirm which of the two settings the trainer actually applies.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# max_steps / warmup_steps are 0 while warmup_proportion is 0.03; presumably 0
# means "derive from epochs / proportion" - verify in the trainer.
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
# All three left null here; the optimizer itself is configured in ds_cfg below.
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpointing cadence, in steps.
logging_steps: 1
save_ds_state: True
save_steps: 200
save_best: False
eval_steps: 200
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed precision: fp16 (not bfloat16), matching torch_dtype and ds_cfg.fp16.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides applied on top of the deepspeed default selected in `defaults`.
# Batch size, accumulation, lr and weight decay are interpolated from the
# top-level keys so the two places cannot drift apart.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
bf16:
enabled: False
# fp16 engine settings. Per the DeepSpeed config docs, loss_scale 0 selects
# dynamic loss scaling and initial_scale_power 16 starts it at 2^16.
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Metric logging through the W&B writer; presumably maps logged metric name ->
# model output key - verify in general_util.tensorboard_helper.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left null in the config; presumably filled in by the training script at runtime.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/llama3.1-pdpo-4o-iter0-v2.1-a100-40.yaml
================================================
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_cosine
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
sft_model_dir: ${model_path_prefix}/llama3.1_8b_mathscale4o/model_lr1e-5_batch512_epochs3_gpus8_linearSchedule/
train_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/train.500k.de_con.v1.0.boxed.json
dev_file:
test_file:
eos_token: "<|end_of_text|>"
eos_token_id: 128001
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
device_map:
_target_: models.utils.return_single_device_map
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.1
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
# device_map: ${device_map}
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: ${eos_token_id}
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.field_extract_aligner
input_index_field: id
extract_index_field: idx
extract_fields: [ value, prefix ]
extra_file: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample10.filter_same.prefix_completion.n3.tem1.0.p0.9.process_rm_gd.local.json
- _target_: data.input_aligner.value2pair_mapping_aligner
field: prefix
pos_field: pos_prefix
neg_field: neg_prefix
value_field: value
value_mapping_func:
_target_: data.input_aligner.return_threshold_mapping
value_threshold: 2
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos_prefix
neg_field: neg_prefix
sort_accord_to_len: True
top_k: 5
num_workers: 32
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}"
reject: "{chat_prefix}{neg}"
instruction: "\\boxed{}"
index_field: id
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest
dp_size:
tp_size: 4
pp_size: 1
exp_name: llama3.1.8b.mathscale4o.process-dpo.iter0.A100-40.tp4dp8.v2.1.fix.s${seed}
exp_notes:
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
per_gpu_train_batch_size: 2
per_gpu_eval_batch_size: 2
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1e-6
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
optimizer:
use_nvlamb:
bit_training:
logging_steps: 1
save_ds_state: True
save_steps: 200
save_best: False
eval_steps: 200
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/llama3.1-pdpo-4o-iter0-v2.1-v100.yaml
================================================
# Process-DPO iter0 experiment config (V100 variant, per exp_name further down in this file).
# Hydra defaults list: the chosen deepspeed preset is mounted under the `ds_cfg` key,
# and `_self_` is listed last (see the linked Hydra page for what the order implies).
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_cosine
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
# Storage roots. NOTE(review): data_path_prefix is not referenced anywhere else in this file.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# NOTE(review): model_path_prefix already ends with "/", so the two values below resolve with "//".
# Presumably harmless on the target filesystem - confirm.
sft_model_dir: ${model_path_prefix}/llama3.1_8b_mathscale4o/model_lr1e-5_batch512_epochs3_gpus8_linearSchedule/
train_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/train.500k.de_con.v1.0.boxed.json
# Left empty (null): no dev/test file is configured here.
dev_file:
test_file:
# eos_token is reused as the tokenizer pad token, and eos_token_id as the model pad_token_id, elsewhere in this file.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# `_target_` entries name the callable that builds the value (Hydra instantiate convention).
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: float16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
# Defined here; the only references to ${device_map} in this file are commented out.
device_map:
_target_: models.utils.return_single_device_map
# Policy model spec, built through from_pretrained_with_ref_model on the models.llama_tp DPO class.
model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained_with_ref_model
# Presumably the DPO temperature and an auxiliary SFT loss term weighted by 0.2 - confirm in models.llama_tp.
beta: 0.1
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
# flash_attention_2 is switched off in favour of eager attention in this variant.
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
# device_map: ${device_map}
# Reference model: same class, loaded from the same path, dtype, attention implementation and pad id as the policy.
ref_model:
_target_: models.llama_tp.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
# attn_implementation: "flash_attention_2"
attn_implementation: "eager"
# device_map: ${device_map}
pad_token_id: ${eos_token_id}
# Training dataset spec (MultiMappingDataset). No file_path is set in this stanza;
# presumably the loader supplies train_file - confirm against the caller.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# concat_aligner receives three aligners, listed in this order:
# field extraction -> value-to-pair mapping -> pair selection (presumably applied in list order).
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Pulls `value` and `prefix` from extra_file, matching input `id` against the extra file's `idx`.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: id
extract_index_field: idx
extract_fields: [ value, prefix ]
extra_file: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample10.filter_same.prefix_completion.n3.tem1.0.p0.9.process_rm_gd.local.json
# Splits prefixes into pos_prefix / neg_prefix using `value` and a threshold mapping (threshold 2);
# exact comparison semantics live in return_threshold_mapping - TODO confirm.
- _target_: data.input_aligner.value2pair_mapping_aligner
field: prefix
pos_field: pos_prefix
neg_field: neg_prefix
value_field: value
value_mapping_func:
_target_: data.input_aligner.return_threshold_mapping
value_threshold: 2
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos_prefix
neg_field: neg_prefix
sort_accord_to_len: True
top_k: 5
num_workers: 32
# Prompt/response templates. The units below use {pos}/{neg} rather than {pos_prefix}/{neg_prefix};
# presumably the last aligner emits those keys in this mode - confirm.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos}"
neg: "{neg}"
# NOTE(review): chat_suffix is defined but not used by any composition below, so chosen/reject
# carry no EOS here - presumably intentional because they are partial prefixes; confirm.
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}"
reject: "{chat_prefix}{neg}"
# Substituted for {instruction} in chat_prefix (assumed from the matching placeholder name).
instruction: "\\boxed{}"
index_field: id
# Field renaming table; mapping direction (source -> output) is assumed - confirm.
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator: DPOCollator built with the tokenizer from tokenizer_init and a 2048 max sequence length.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# The model path is the SFT checkpoint directory; tokenizer_path in this file interpolates the same key.
model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest
# Parallel layout: dp_size is left empty (null), tp_size and pp_size are fixed.
# Presumably data/tensor/pipeline parallel degrees - confirm with the trainer.
dp_size:
tp_size: 8
pp_size: 1
# Run identity; ${seed} resolves from the `seed` key defined later in this file.
exp_name: llama3.1.8b.mathscale4o.process-dpo.iter0.V100.tp8dp32.v2.1.fix.s${seed}
exp_notes:
# output_path_prefix ends with "/", so no separator is written before "experiments".
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# If the run really uses 32 data-parallel ranks as exp_name suggests, the effective batch
# is 1 * 16 * 32 = 512 - TODO confirm.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# NOTE(review): exponent literals without a dot (1e-6) are read as float by some YAML loaders
# and as string by others - confirm the consumer receives a float.
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 16
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ], which differs from
# this value; which one takes effect depends on the trainer - confirm.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# max_steps and warmup_steps are 0 while warmup_proportion is set; presumably 0 means
# "derive from epochs / proportion" - confirm.
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
# All three keys are left empty (null).
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint cadence (in steps).
logging_steps: 1
save_ds_state: True
save_steps: 200
save_best: False
eval_steps: 200
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Mixed precision flags: fp16 on, bfloat16 off, in line with dtype float16 and ds_cfg.fp16 in this file.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: False
# Prediction config
# Tracks "loss"; measure -1 presumably means lower is better - confirm.
# best_checkpoint / best_result start empty.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides on top of the deepspeed preset mounted at ds_cfg by the defaults list.
# Batch size, accumulation, lr and weight decay are tied to the top-level keys via interpolation.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# bf16 explicitly off and fp16 on for this variant.
bf16:
enabled: False
# Per the DeepSpeed fp16 options, loss_scale 0 selects dynamic loss scaling and
# initial_scale_power 16 starts the scale at 2^16 - verify against the installed DeepSpeed version.
fp16:
enabled: True
auto_cast: False
loss_scale: 0
initial_scale_power: 16
loss_scale_window: 1000
hysteresis: 2
consecutive_hysteresis: False
min_loss_scale: 1
# Metric logging via WandbWriter: the two reward outputs are published under train/*.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left empty (null) here; presumably filled in at runtime - confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/llama3.1-pdpo-4o-iter0-v2.2-A100.yaml
================================================
# Process-DPO iter0 v2.2 experiment config (A100, per exp_name further down in this file).
# Hydra defaults list: the deepspeed preset is mounted under `ds_cfg`; `_self_` is listed last.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
# Storage roots. NOTE(review): data_path_prefix is not referenced anywhere else in this file.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# NOTE(review): model_path_prefix already ends with "/", so these two values resolve with "//".
sft_model_dir: ${model_path_prefix}/llama3.1_8b_mathscale4o/model_lr1e-5_batch512_epochs3_gpus8_linearSchedule/
train_file: ${model_path_prefix}/mathstral-7B-v0.1/mathscale4o/train.500k.de_con.v1.0.boxed.json
# Left empty (null): no dev/test file is configured here.
dev_file:
test_file:
# eos_token doubles as the tokenizer pad token; eos_token_id is used as pad_token_id by the model stanza.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# `_target_` entries name the callable that builds the value (Hydra instantiate convention).
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
# Referenced by both the policy and reference model entries in this file.
device_map:
_target_: models.utils.return_single_device_map
# Policy model spec, built through from_pretrained_with_ref_model on the models.llama DPO class.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
# Presumably the DPO temperature and an auxiliary SFT loss term weighted by 0.2 - confirm in models.llama.
beta: 0.5
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
device_map: ${device_map}
# Reference model: same class, loaded from the same path, dtype, attention implementation, device map and pad id.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "eager"
device_map: ${device_map}
pad_token_id: ${eos_token_id}
# Training dataset: a ReplayDataset combining the process-DPO data (new) with the outcome-DPO pairs (old).
# `_recursive_: False` is the Hydra instantiate switch that stops nested configs from being
# instantiated up front, so the two *_cfg entries presumably reach ReplayDataset as configs - confirm.
read_tensor:
_target_: data.combine_dataset.ReplayDataset
_recursive_: False
new_dataset_cfg: ${read_tensor_pdpo}
old_dataset_cfg: ${read_tensor_dpo}
# Presumably the old:new mixing ratio - confirm in ReplayDataset.
replay_ratio: 1.0
# "Old" dataset for the replay mix: preference pairs read from an explicit file_path.
read_tensor_dpo:
_target_: data.combine_dataset.MultiMappingDataset
file_path: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.prefer_pair.json
# Single aligner: picks pairs from the `pos` / `neg` fields.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos
neg_field: neg
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos}"
neg: "{neg}"
chat_suffix: ${eos_token}
# Unlike read_tensor_pdpo in this file, chosen/reject here end with chat_suffix (the EOS token).
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}{chat_suffix}"
reject: "{chat_prefix}{neg}{chat_suffix}"
# Substituted for {instruction} in chat_prefix (assumed from the matching placeholder name).
instruction: "\\boxed{}"
index_field: id
# Field renaming table; mapping direction (source -> output) is assumed - confirm.
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
# "New" dataset for the replay mix: prefix-level preference pairs. No file_path is set in this stanza;
# presumably ReplayDataset / the loader supplies it - confirm.
read_tensor_pdpo:
_target_: data.combine_dataset.MultiMappingDataset
# Three aligners listed in order: field extraction -> value-to-pair -> pair selection.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Pulls `value` and `prefix` from extra_file, matching input `id` against the extra file's `idx`.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: id
extract_index_field: idx
extract_fields: [ value, prefix ]
extra_file: ${sft_model_dir}/mathscale4o/train.500k.de_con.boxed.v1.0.n10.tem1.0.p0.9.upper0.7.r0.3.sample10.filter_same.prefix_completion.n3.tem1.0.p0.9.process_rm_gd.binary.local.json
# value2pair_aligner takes no value_mapping_func here; the extra file name contains "binary",
# so values are presumably already 0/1 labels - confirm.
- _target_: data.input_aligner.value2pair_aligner
field: prefix
pos_field: pos_prefix
neg_field: neg_prefix
value_field: value
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos_prefix
neg_field: neg_prefix
sort_accord_to_len: True
top_k: 5
num_workers: 32
# The units use {pos}/{neg} rather than {pos_prefix}/{neg_prefix}; presumably the last aligner
# emits those keys in this mode - confirm.
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
pos: "{pos}"
neg: "{neg}"
# NOTE(review): chat_suffix is defined but unused by the compositions below (no EOS on prefixes).
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}"
reject: "{chat_prefix}{neg}"
instruction: "\\boxed{}"
index_field: id
# Field renaming table; mapping direction (source -> output) is assumed - confirm.
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator: DPOCollator built with the tokenizer from tokenizer_init and a 2048 max sequence length.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# The model path is the SFT checkpoint directory; tokenizer_path in this file interpolates the same key.
model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest
# Parallel layout: dp_size is left empty (null); tp_size and pp_size are both 1.
dp_size:
tp_size: 1
pp_size: 1
# Run identity; ${seed} resolves from the `seed` key defined later in this file.
exp_name: llama3.1.8b.mathscale4o.process-dpo.iter0.A100.dp8.v2.2.s${seed}
exp_notes:
# output_path_prefix ends with "/", so no separator is written before "experiments".
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# If the run uses 8 data-parallel ranks as exp_name suggests, the effective batch
# is 1 * 64 * 8 = 512 - TODO confirm.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# NOTE(review): exponent literals without a dot (1e-6) are read as float by some YAML loaders
# and as string by others - confirm the consumer receives a float.
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 64
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ], which differs from
# this value; which one takes effect depends on the trainer - confirm.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# max_steps and warmup_steps are 0 while warmup_proportion is set; presumably 0 means
# "derive from epochs / proportion" - confirm.
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
# All three keys are left empty (null).
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint cadence (in steps).
logging_steps: 1
save_ds_state: True
save_steps: 200
save_best: False
eval_steps: 200
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Both fp16 and fp16_bfloat16 are on; together with dtype bfloat16 in this file this presumably
# selects bf16 mixed precision - confirm how the trainer combines the two flags.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
# Tracks "loss"; measure -1 presumably means lower is better - confirm.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides on top of the deepspeed preset mounted at ds_cfg by the defaults list.
# Batch size, accumulation, lr and weight decay are tied to the top-level keys via interpolation.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The bf16 / fp16 overrides below are commented out, so precision settings come from
# whatever the composed deepspeed preset provides - TODO confirm.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Metric logging via WandbWriter: the two reward outputs are published under train/*.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left empty (null) here; presumably filled in at runtime - confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter1-1.0-split01-p0.0-h100.yaml
================================================
# Numina process-DPO iter1 config, split01, sc-p0.0 data (per exp_name and extra_file further down).
# Hydra defaults list: the deepspeed preset is mounted under `ds_cfg`; `_self_` is listed last.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
# Storage roots. NOTE(review): model_path_prefix is not referenced anywhere else in this file.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Starting checkpoint: checkpoint-1200 of the iter0 v2.2 run.
# NOTE(review): output_path_prefix already ends with "/", so this resolves with "//".
sft_model_dir: ${output_path_prefix}/experiments/llama3.1.8b.mathscale4o.process-dpo.iter0.A100.dp8.v2.2.s42/checkpoint-1200/
train_file: ${data_path_prefix}/dataset/NuminaMath/numina-math-cot.de_con.label.json
# Left empty (null): no dev/test file is configured here.
dev_file:
test_file:
# eos_token doubles as the tokenizer pad token; eos_token_id is used as pad_token_id by the model stanza.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# `_target_` entries name the callable that builds the value (Hydra instantiate convention).
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
# Referenced by both the policy and reference model entries in this file.
device_map:
_target_: models.utils.return_single_device_map
# Policy model spec, built through from_pretrained_with_ref_model on the models.llama DPO class.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
# Presumably the DPO temperature and an auxiliary SFT loss term weighted by 0.2 - confirm in models.llama.
beta: 0.5
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
device_map: ${device_map}
# Reference model: same class, loaded from the same path, dtype, attention implementation, device map and pad id.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: ${eos_token_id}
# Training dataset spec (MultiMappingDataset). No file_path is set in this stanza;
# presumably the loader supplies train_file - confirm against the caller.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# Three aligners listed in order: field extraction -> value-to-pair mapping -> pair selection.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Pulls `value` and `prefix` from extra_file, matching input `id` against the extra file's `idx`.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: id
extract_index_field: idx
extract_fields: [ value, prefix ]
extra_file: ${sft_model_dir}/numina/cot.de_con.n16.tem1.0.p1.0.split01.upper0.8.r0.3.sample8.filter_same.process_rm.sc-p0.0.azure.json
# Splits prefixes into pos_prefix / neg_prefix using `value` and a threshold mapping (threshold 1);
# exact comparison semantics live in return_threshold_mapping - TODO confirm.
- _target_: data.input_aligner.value2pair_mapping_aligner
field: prefix
pos_field: pos_prefix
neg_field: neg_prefix
value_field: value
value_mapping_func:
_target_: data.input_aligner.return_threshold_mapping
value_threshold: 1
# Length sorting / top-k options are disabled for this run (left commented out).
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos_prefix
neg_field: neg_prefix
# sort_accord_to_len: True
# top_k: 5
# num_workers: 32
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
# With the sorting options off, the units read {pos_prefix}/{neg_prefix} directly instead of {pos}/{neg}.
# pos: "{pos}"
# neg: "{neg}"
pos: "{pos_prefix}"
neg: "{neg_prefix}"
# NOTE(review): chat_suffix is defined but unused by the compositions below (no EOS on prefixes).
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}"
reject: "{chat_prefix}{neg}"
# Substituted for {instruction} in chat_prefix (assumed from the matching placeholder name).
instruction: "\\boxed{}"
index_field: id
# Field renaming table; mapping direction (source -> output) is assumed - confirm.
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator: DPOCollator built with the tokenizer from tokenizer_init and a 2048 max sequence length.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# The model path is the iter0 checkpoint directory; tokenizer_path in this file interpolates the same key.
model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest
# Parallel layout: dp_size is left empty (null); tp_size and pp_size are both 1.
dp_size:
tp_size: 1
pp_size: 1
# Run identity; ${seed} resolves from the `seed` key defined later in this file.
exp_name: llama3.1.8b.numina.process-dpo-sc-p0.0.iter1.split01.H100.dp8.v1.0.s${seed}
exp_notes:
# output_path_prefix ends with "/", so no separator is written before "experiments".
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# If the run uses 8 data-parallel ranks as exp_name suggests, the effective batch
# is 1 * 64 * 8 = 512 - TODO confirm.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# NOTE(review): exponent literals without a dot (1e-6) are read as float by some YAML loaders
# and as string by others - confirm the consumer receives a float.
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 64
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ], which differs from
# this value; which one takes effect depends on the trainer - confirm.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# max_steps and warmup_steps are 0 while warmup_proportion is set; presumably 0 means
# "derive from epochs / proportion" - confirm.
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
# All three keys are left empty (null).
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint cadence (in steps).
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Both fp16 and fp16_bfloat16 are on; together with dtype bfloat16 in this file this presumably
# selects bf16 mixed precision - confirm how the trainer combines the two flags.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
# Tracks "loss"; measure -1 presumably means lower is better - confirm.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides on top of the deepspeed preset mounted at ds_cfg by the defaults list.
# Batch size, accumulation, lr and weight decay are tied to the top-level keys via interpolation.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The bf16 / fp16 overrides below are commented out, so precision settings come from
# whatever the composed deepspeed preset provides - TODO confirm.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Metric logging via WandbWriter: the two reward outputs are published under train/*.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left empty (null) here; presumably filled in at runtime - confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter1-1.0-split01-p0.5-h100.yaml
================================================
# Numina process-DPO iter1 config, split01, sc-p0.5 data (per exp_name and extra_file further down).
# Hydra defaults list: the deepspeed preset is mounted under `ds_cfg`; `_self_` is listed last.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
# Storage roots. NOTE(review): model_path_prefix is not referenced anywhere else in this file.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Starting checkpoint: checkpoint-1200 of the iter0 v2.2 run.
# NOTE(review): output_path_prefix already ends with "/", so this resolves with "//".
sft_model_dir: ${output_path_prefix}/experiments/llama3.1.8b.mathscale4o.process-dpo.iter0.A100.dp8.v2.2.s42/checkpoint-1200/
train_file: ${data_path_prefix}/dataset/NuminaMath/numina-math-cot.de_con.label.json
# Left empty (null): no dev/test file is configured here.
dev_file:
test_file:
# eos_token doubles as the tokenizer pad token; eos_token_id is used as pad_token_id by the model stanza.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# `_target_` entries name the callable that builds the value (Hydra instantiate convention).
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
# Referenced by both the policy and reference model entries in this file.
device_map:
_target_: models.utils.return_single_device_map
# Policy model spec, built through from_pretrained_with_ref_model on the models.llama DPO class.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
# Presumably the DPO temperature and an auxiliary SFT loss term weighted by 0.2 - confirm in models.llama.
beta: 0.5
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
device_map: ${device_map}
# Reference model: same class, loaded from the same path, dtype, attention implementation, device map and pad id.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: ${eos_token_id}
# Training dataset spec (MultiMappingDataset). No file_path is set in this stanza;
# presumably the loader supplies train_file - confirm against the caller.
read_tensor:
_target_: data.combine_dataset.MultiMappingDataset
# Three aligners listed in order: field extraction -> value-to-pair mapping -> pair selection.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Pulls `value` and `prefix` from extra_file, matching input `id` against the extra file's `idx`.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: id
extract_index_field: idx
extract_fields: [ value, prefix ]
extra_file: ${sft_model_dir}/numina/cot.de_con.n16.tem1.0.p1.0.split01.upper0.8.r0.3.sample8.filter_same.process_rm.sc-p0.5.azure.json
# Splits prefixes into pos_prefix / neg_prefix using `value` and a threshold mapping (threshold 1);
# exact comparison semantics live in return_threshold_mapping - TODO confirm.
- _target_: data.input_aligner.value2pair_mapping_aligner
field: prefix
pos_field: pos_prefix
neg_field: neg_prefix
value_field: value
value_mapping_func:
_target_: data.input_aligner.return_threshold_mapping
value_threshold: 1
# Length sorting / top-k options are disabled for this run (left commented out).
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos_prefix
neg_field: neg_prefix
# sort_accord_to_len: True
# top_k: 5
# num_workers: 32
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
# With the sorting options off, the units read {pos_prefix}/{neg_prefix} directly instead of {pos}/{neg}.
# pos: "{pos}"
# neg: "{neg}"
pos: "{pos_prefix}"
neg: "{neg_prefix}"
# NOTE(review): chat_suffix is defined but unused by the compositions below (no EOS on prefixes).
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}"
reject: "{chat_prefix}{neg}"
# Substituted for {instruction} in chat_prefix (assumed from the matching placeholder name).
instruction: "\\boxed{}"
index_field: id
# Field renaming table; mapping direction (source -> output) is assumed - confirm.
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
dist_load_data_barrier: False
extended_vocab:
# Data collator: DPOCollator built with the tokenizer from tokenizer_init and a 2048 max sequence length.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# The model path is the iter0 checkpoint directory; tokenizer_path in this file interpolates the same key.
model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest
# Parallel layout: dp_size is left empty (null); tp_size and pp_size are both 1.
dp_size:
tp_size: 1
pp_size: 1
# Run identity; ${seed} resolves from the `seed` key defined later in this file.
exp_name: llama3.1.8b.numina.process-dpo-sc-p0.5.iter1.split01.H100.dp8.v1.0.s${seed}
exp_notes:
# output_path_prefix ends with "/", so no separator is written before "experiments".
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# If the run uses 8 data-parallel ranks as exp_name suggests, the effective batch
# is 1 * 64 * 8 = 512 - TODO confirm.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# NOTE(review): exponent literals without a dot (1e-6) are read as float by some YAML loaders
# and as string by others - confirm the consumer receives a float.
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 64
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ], which differs from
# this value; which one takes effect depends on the trainer - confirm.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# max_steps and warmup_steps are 0 while warmup_proportion is set; presumably 0 means
# "derive from epochs / proportion" - confirm.
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
# All three keys are left empty (null).
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint cadence (in steps).
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Both fp16 and fp16_bfloat16 are on; together with dtype bfloat16 in this file this presumably
# selects bf16 mixed precision - confirm how the trainer combines the two flags.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
# Tracks "loss"; measure -1 presumably means lower is better - confirm.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides on top of the deepspeed preset mounted at ds_cfg by the defaults list.
# Batch size, accumulation, lr and weight decay are tied to the top-level keys via interpolation.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The bf16 / fp16 overrides below are commented out, so precision settings come from
# whatever the composed deepspeed preset provides - TODO confirm.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Metric logging via WandbWriter: the two reward outputs are published under train/*.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left empty (null) here; presumably filled in at runtime - confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter1-split0123-cross2-p0.5-v1.0-h100.yaml
================================================
# Numina process-DPO iter1 config combining split01 and split23 ("split0123-cross2", per exp_name further down).
# Hydra defaults list: the deepspeed preset is mounted under `ds_cfg`; `_self_` is listed last.
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
# Storage roots. NOTE(review): model_path_prefix is not referenced anywhere else in this file.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
# Starting checkpoint: checkpoint-1200 of the iter0 v2.2 run.
# NOTE(review): output_path_prefix already ends with "/", so this resolves with "//".
sft_model_dir: ${output_path_prefix}/experiments/llama3.1.8b.mathscale4o.process-dpo.iter0.A100.dp8.v2.2.s42/checkpoint-1200/
train_file: ${data_path_prefix}/dataset/NuminaMath/numina-math-cot.de_con.label.json
# Left empty (null): no dev/test file is configured here.
dev_file:
test_file:
# eos_token doubles as the tokenizer pad token; eos_token_id is used as pad_token_id by the model stanza.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# `_target_` entries name the callable that builds the value (Hydra instantiate convention).
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
# Referenced by both the policy and reference model entries in this file.
device_map:
_target_: models.utils.return_single_device_map
# Policy model spec, built through from_pretrained_with_ref_model on the models.llama DPO class.
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
# Presumably the DPO temperature and an auxiliary SFT loss term weighted by 0.2 - confirm in models.llama.
beta: 0.5
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
device_map: ${device_map}
# Reference model: same class, loaded from the same path, dtype, attention implementation, device map and pad id.
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: ${eos_token_id}
# Names of the two iter1 runs trained on split01 and split23; they are used as sub-directories
# in the extra_file paths of the two dataset stanzas in this file.
split01_model_name: llama3.1.8b.numina.process-dpo-sc-p0.5.iter1.split01.H100.dp8.v1.0.s42
split23_model_name: llama3.1.8b.numina.process-dpo-sc-p0.5.iter1.split23.A100.dp16.v1.0.s42
# split01 data. Its extra_file lives under ${split23_model_name}: the split01 file is read from
# the directory named after the run trained on the other split (the "cross" in exp_name).
# No file_path is set here, unlike read_tensor_split23; presumably ReplayDataset / the loader supplies it - confirm.
read_tensor_split01:
_target_: data.combine_dataset.MultiMappingDataset
# Three aligners listed in order: field extraction -> value-to-pair mapping -> pair selection.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Pulls `value` and `prefix` from extra_file, matching input `id` against the extra file's `idx`.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: id
extract_index_field: idx
extract_fields: [ value, prefix ]
extra_file: ${sft_model_dir}/numina/${split23_model_name}/cot.de_con.n16.tem1.0.p1.0.split01.upper0.8.r0.3.sample8.filter_same.process_rm.sc-p0.5.azure.json
# Splits prefixes into pos_prefix / neg_prefix using `value` and a threshold mapping (threshold 1);
# exact comparison semantics live in return_threshold_mapping - TODO confirm.
- _target_: data.input_aligner.value2pair_mapping_aligner
field: prefix
pos_field: pos_prefix
neg_field: neg_prefix
value_field: value
value_mapping_func:
_target_: data.input_aligner.return_threshold_mapping
value_threshold: 1
# Length sorting / top-k options are disabled for this run (left commented out).
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos_prefix
neg_field: neg_prefix
# sort_accord_to_len: True
# top_k: 5
# num_workers: 32
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
# With the sorting options off, the units read {pos_prefix}/{neg_prefix} directly instead of {pos}/{neg}.
# pos: "{pos}"
# neg: "{neg}"
pos: "{pos_prefix}"
neg: "{neg_prefix}"
# NOTE(review): chat_suffix is defined but unused by the compositions below (no EOS on prefixes).
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}"
reject: "{chat_prefix}{neg}"
# Substituted for {instruction} in chat_prefix (assumed from the matching placeholder name).
instruction: "\\boxed{}"
index_field: id
# Field renaming table; mapping direction (source -> output) is assumed - confirm.
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
# split23 data, read from train_file via an explicit file_path. Its extra_file lives under
# ${split01_model_name}: the split23 file is read from the directory named after the run trained on split01.
read_tensor_split23:
_target_: data.combine_dataset.MultiMappingDataset
file_path: ${train_file}
# Three aligners listed in order: field extraction -> value-to-pair mapping -> pair selection.
aligner:
_target_: data.input_aligner.concat_aligner
aligners:
# Pulls `value` and `prefix` from extra_file, matching input `id` against the extra file's `idx`.
- _target_: data.input_aligner.field_extract_aligner
input_index_field: id
extract_index_field: idx
extract_fields: [ value, prefix ]
extra_file: ${sft_model_dir}/numina/${split01_model_name}/cot.de_con.n16.tem1.0.p1.0.split23.upper0.8.r0.3.sample8.filter_same.process_rm.sc-p0.5.azure.json
# Splits prefixes into pos_prefix / neg_prefix using `value` and a threshold mapping (threshold 1);
# exact comparison semantics live in return_threshold_mapping - TODO confirm.
- _target_: data.input_aligner.value2pair_mapping_aligner
field: prefix
pos_field: pos_prefix
neg_field: neg_prefix
value_field: value
value_mapping_func:
_target_: data.input_aligner.return_threshold_mapping
value_threshold: 1
# Length sorting / top-k options are disabled for this run (left commented out).
- _target_: data.input_aligner.dpo_bi_random_choice_aligner
pos_field: pos_prefix
neg_field: neg_prefix
# sort_accord_to_len: True
# top_k: 5
# num_workers: 32
template:
_target_: data.input_utils.recompose_template
units:
chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
# With the sorting options off, the units read {pos_prefix}/{neg_prefix} directly instead of {pos}/{neg}.
# pos: "{pos}"
# neg: "{neg}"
pos: "{pos_prefix}"
neg: "{neg_prefix}"
# NOTE(review): chat_suffix is defined but unused by the compositions below (no EOS on prefixes).
chat_suffix: ${eos_token}
compositions:
prompt: "{chat_prefix}"
chosen: "{chat_prefix}{pos}"
reject: "{chat_prefix}{neg}"
# Substituted for {instruction} in chat_prefix (assumed from the matching placeholder name).
instruction: "\\boxed{}"
index_field: id
# Field renaming table; mapping direction (source -> output) is assumed - confirm.
kv_mapping:
chosen: chosen
reject: reject
id: index
prompt: prompt
# Training dataset: a ReplayDataset combining the split01 stanza (new) with the split23 stanza (old).
# `_recursive_: False` is the Hydra instantiate switch that stops nested configs from being
# instantiated up front, so the two *_cfg entries presumably reach ReplayDataset as configs - confirm.
read_tensor:
_target_: data.combine_dataset.ReplayDataset
_recursive_: False
new_dataset_cfg: ${read_tensor_split01}
old_dataset_cfg: ${read_tensor_split23}
# Presumably the old:new mixing ratio - confirm in ReplayDataset.
replay_ratio: 1.0
dist_load_data_barrier: False
extended_vocab:
# Data collator: DPOCollator built with the tokenizer from tokenizer_init and a 2048 max sequence length.
collator:
_target_: data.general_collator.DPOCollator
tokenizer: ${tokenizer_init}
max_seq_length: 2048
# Dataloader
num_workers: 8
prefetch_factor: 2
# The model path is the iter0 checkpoint directory; tokenizer_path in this file interpolates the same key.
model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest
# Parallel layout: dp_size is left empty (null); tp_size and pp_size are both 1.
dp_size:
tp_size: 1
pp_size: 1
# Run identity; ${seed} resolves from the `seed` key defined later in this file.
exp_name: llama3.1.8b.numina.process-dpo-sc-p0.5.iter1.split0123-cross2.H100.dp16.v1.0.s${seed}
exp_notes:
# output_path_prefix ends with "/", so no separator is written before "experiments".
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix
do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*
# Training hyper-parameters
# If the run uses 16 data-parallel ranks as exp_name suggests, the effective batch
# is 1 * 32 * 16 = 512 - TODO confirm.
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# NOTE(review): exponent literals without a dot (1e-6) are read as float by some YAML loaders
# and as string by others - confirm the consumer receives a float.
#learning_rate: 1e-4
learning_rate: 1e-6
#learning_rate: 2e-5
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): ds_cfg.optimizer.params.betas in this file is [ 0.9, 0.95 ], which differs from
# this value; which one takes effect depends on the trainer - confirm.
adam_betas: "(0.9, 0.98)"
#adam_betas: "(0.9, 0.999)"
#max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
# max_steps and warmup_steps are 0 while warmup_proportion is set; presumably 0 means
# "derive from epochs / proportion" - confirm.
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0
# Optimizer
# All three keys are left empty (null).
optimizer:
use_nvlamb:
bit_training:
# Logging / checkpoint cadence (in steps).
logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
# Both fp16 and fp16_bfloat16 are on; together with dtype bfloat16 in this file this presumably
# selects bf16 mixed precision - confirm how the trainer combines the two flags.
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
# Tracks "loss"; measure -1 presumably means lower is better - confirm.
prediction_cfg:
metric: "loss"
measure: -1
best_checkpoint:
best_result:
eval_forward_fn:
_target_: general_util.evaluator.DefaultForwardFn
post_process:
_target_: post_processors.dpo.DPOEvalPostProcessor
# Overrides on top of the deepspeed preset mounted at ds_cfg by the defaults list.
# Batch size, accumulation, lr and weight decay are tied to the top-level keys via interpolation.
ds_cfg:
train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
gradient_accumulation_steps: ${gradient_accumulation_steps}
optimizer:
type: AdamW
params:
lr: ${learning_rate}
betas: [ 0.9, 0.95 ]
weight_decay: ${weight_decay}
steps_per_print: 1
# The bf16 / fp16 overrides below are commented out, so precision settings come from
# whatever the composed deepspeed preset provides - TODO confirm.
# bf16:
# enabled: False
# fp16:
# enabled: True
# auto_cast: False
# loss_scale: 0
# initial_scale_power: 16
# loss_scale_window: 1000
# hysteresis: 2
# consecutive_hysteresis: False
# min_loss_scale: 1
# Metric logging via WandbWriter: the two reward outputs are published under train/*.
summary_helper:
_target_: general_util.tensorboard_helper.WandbWriter
batch_index_or_keys:
outputs_index_or_keys:
"train/chosen_reward": chosen_reward
"train/rejected_reward": rejected_reward
# Temporary variables
# Left empty (null) here; presumably filled in at runtime - confirm.
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter1-split0123-cross2-p0.5-v1.1-h100.yaml
================================================
defaults:
- hydra: default
- deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
- _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config
hydra:
searchpath:
- file://conf/
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/
sft_model_dir: ${output_path_prefix}/experiments/llama3.1.8b.mathscale4o.process-dpo.iter0.A100.dp8.v2.2.s42/checkpoint-1200/
train_file: ${data_path_prefix}/dataset/NuminaMath/numina-math-cot.de_con.label.json
dev_file:
test_file:
eos_token: "<|end_of_text|>"
eos_token_id: 128001
torch_dtype:
_target_: general_util.training_utils.return_torch_dtype
dtype: bfloat16
tokenizer_init:
_target_: general_util.tokenization_utils.init_tokenizer
tokenizer_path: ${model_name_or_path}
padding_side: left
pad_token: ${eos_token}
device_map:
_target_: models.utils.return_single_device_map
model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
beta: 0.5
sft_loss: True
sft_loss_weight: 0.2
gradient_checkpointing: True
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
torch_dtype: ${torch_dtype}
pad_token_id: ${eos_token_id}
device_map: ${device_map}
ref_model:
_target_: models.llama.LlamaForCausalLMDPO.from_pretrained
pretrained_model_name_or_path: ${model_name_or_path}
torch_dtype: ${torch_dtype}
attn_implementation: "flash_attention_2"
# attn_implementation: "sdpa"
device_map: ${device_map}
pad_token_id: ${eos_token_id}
# Run names of the two per-split models. Each split's data below reads the
# file stored under the *other* split's run name.
split01_model_name: llama3.1.8b.numina.process-dpo-sc-p0.5.iter1.split01.H100.dp8.v1.0.s42
split23_model_name: llama3.1.8b.numina.process-dpo-sc-p0.5.iter1.split23.A100.dp16.v1.0.s42

# Dataset for split01, using the file stored under `split23_model_name`.
# NOTE(review): unlike `read_tensor_split23`, no `file_path` is set here —
# confirm the dataset receives it from the caller.
read_tensor_split01:
  _target_: data.combine_dataset.MultiMappingDataset
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: id
        extract_index_field: idx
        extract_fields: [ value, prefix ]
        extra_file: ${sft_model_dir}/numina/${split23_model_name}/cot.de_con.n16.tem1.0.p1.0.split01.upper0.8.r0.3.sample8.filter_same.process_rm.sc-p0.5.azure.json
      - _target_: data.input_aligner.value2pair_mapping_aligner
        field: prefix
        pos_field: pos_prefix
        neg_field: neg_prefix
        value_field: value
        value_mapping_func:
          _target_: data.input_aligner.return_threshold_mapping
          value_threshold: 2
      # NOTE(review): the last three keys are assumed to belong to this
      # aligner — confirm against data.input_aligner.
      - _target_: data.input_aligner.dpo_bi_random_choice_aligner
        pos_field: pos_prefix
        neg_field: neg_prefix
        sort_accord_to_len: True
        top_k: 5
        num_workers: 32
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
      pos: "{pos}"
      neg: "{neg}"
      # pos: "{pos_prefix}"
      # neg: "{neg_prefix}"
      # Defined but not referenced by any composition below.
      chat_suffix: ${eos_token}
    compositions:
      prompt: "{chat_prefix}"
      chosen: "{chat_prefix}{pos}"
      reject: "{chat_prefix}{neg}"
  instruction: "\\boxed{}"
  index_field: id
  kv_mapping:
    chosen: chosen
    reject: reject
    id: index
    prompt: prompt
# Dataset for split23, using the file stored under `split01_model_name`.
read_tensor_split23:
  _target_: data.combine_dataset.MultiMappingDataset
  file_path: ${train_file}
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: id
        extract_index_field: idx
        extract_fields: [ value, prefix ]
        extra_file: ${sft_model_dir}/numina/${split01_model_name}/cot.de_con.n16.tem1.0.p1.0.split23.upper0.8.r0.3.sample8.filter_same.process_rm.sc-p0.5.azure.json
      - _target_: data.input_aligner.value2pair_mapping_aligner
        field: prefix
        pos_field: pos_prefix
        neg_field: neg_prefix
        value_field: value
        value_mapping_func:
          _target_: data.input_aligner.return_threshold_mapping
          value_threshold: 2
      # NOTE(review): the last three keys are assumed to belong to this
      # aligner — confirm against data.input_aligner.
      - _target_: data.input_aligner.dpo_bi_random_choice_aligner
        pos_field: pos_prefix
        neg_field: neg_prefix
        sort_accord_to_len: True
        top_k: 5
        num_workers: 32
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
      pos: "{pos}"
      neg: "{neg}"
      # pos: "{pos_prefix}"
      # neg: "{neg_prefix}"
      # Defined but not referenced by any composition below.
      chat_suffix: ${eos_token}
    compositions:
      prompt: "{chat_prefix}"
      chosen: "{chat_prefix}{pos}"
      reject: "{chat_prefix}{neg}"
  instruction: "\\boxed{}"
  index_field: id
  kv_mapping:
    chosen: chosen
    reject: reject
    id: index
    prompt: prompt
# Training dataset combining the two per-split dataset configs.
# `_recursive_: False` stops Hydra from instantiating the nested configs, so
# ReplayDataset receives them as configs.
read_tensor:
  _target_: data.combine_dataset.ReplayDataset
  _recursive_: False
  new_dataset_cfg: ${read_tensor_split01}
  old_dataset_cfg: ${read_tensor_split23}
  replay_ratio: 1.0
dist_load_data_barrier: False

extended_vocab:

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 2048

# Dataloader
num_workers: 8
prefetch_factor: 2

model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest

dp_size:
tp_size: 1
pp_size: 1

exp_name: llama3.1.8b.numina.process-dpo-sc-p0.5.iter1.split0123-cross2.H100.dp8.v1.1.s${seed}
exp_notes:

output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Exponent floats carry an explicit mantissa decimal so YAML 1.1 loaders
# (e.g. plain PyYAML) also read them as floats rather than strings.
# learning_rate: 1.0e-4
learning_rate: 1.0e-6
# learning_rate: 2.0e-5
gradient_accumulation_steps: 64
weight_decay: 0.1
adam_epsilon: 1.0e-6
adam_betas: "(0.9, 0.98)"
# adam_betas: "(0.9, 0.999)"
# max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0

# Optimizer
optimizer:
use_nvlamb:
bit_training:

logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
  metric: "loss"
  measure: -1
  best_checkpoint:
  best_result:

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# DeepSpeed settings; batch size, accumulation, learning rate and weight decay
# are interpolated from the top-level keys of this file.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level adam_betas "(0.9, 0.98)" —
      # confirm which value the trainer actually applies.
      betas: [ 0.9, 0.95 ]
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1

summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys:
  outputs_index_or_keys:
    "train/chosen_reward": chosen_reward
    "train/rejected_reward": rejected_reward

# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter1-split0123-p0.5-v1.0-h100.yaml
================================================
# Hydra composition order: the base `hydra` group, the DeepSpeed preset mounted
# at `ds_cfg`, then this file (`_self_`), so keys defined here are merged last.
defaults:
  - hydra: default
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

hydra:
  searchpath:
    - file://conf/

# Storage roots used by the interpolations below.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# Checkpoint directory reused below through `${sft_model_dir}`.
sft_model_dir: ${output_path_prefix}/experiments/llama3.1.8b.mathscale4o.process-dpo.iter0.A100.dp8.v2.2.s42/checkpoint-1200/
train_file: ${data_path_prefix}/dataset/NuminaMath/numina-math-cot.de_con.label.json
dev_file:
test_file:

eos_token: "<|end_of_text|>"
eos_token_id: 128001

torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16

tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left
  # The EOS token is reused as the padding token.
  pad_token: ${eos_token}

device_map:
  _target_: models.utils.return_single_device_map
# Policy model built together with a reference model. The reference model is
# loaded from `${model_name_or_path}`; padding uses the EOS token id.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.5
  sft_loss: True
  sft_loss_weight: 0.2
  gradient_checkpointing: True
  attn_implementation: "flash_attention_2"
  # attn_implementation: "sdpa"
  torch_dtype: ${torch_dtype}
  pad_token_id: ${eos_token_id}
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: "flash_attention_2"
    # attn_implementation: "sdpa"
    device_map: ${device_map}
    pad_token_id: ${eos_token_id}
# Training dataset: a chain of aligners followed by the prompt template.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: id
        extract_index_field: idx
        extract_fields: [ value, prefix ]
        extra_file: ${sft_model_dir}/numina/cot.de_con.n16.tem1.0.p1.0.split0123.upper0.8.r0.3.sample8.filter_same.process_rm.sc-p0.5.azure.json
      - _target_: data.input_aligner.value2pair_mapping_aligner
        field: prefix
        pos_field: pos_prefix
        neg_field: neg_prefix
        value_field: value
        value_mapping_func:
          _target_: data.input_aligner.return_threshold_mapping
          value_threshold: 1
      - _target_: data.input_aligner.dpo_bi_random_choice_aligner
        pos_field: pos_prefix
        neg_field: neg_prefix
        # sort_accord_to_len: True
        # top_k: 5
        # num_workers: 32
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
      # pos: "{pos}"
      # neg: "{neg}"
      pos: "{pos_prefix}"
      neg: "{neg_prefix}"
      # Defined but not referenced by any composition below.
      chat_suffix: ${eos_token}
    compositions:
      prompt: "{chat_prefix}"
      chosen: "{chat_prefix}{pos}"
      reject: "{chat_prefix}{neg}"
  instruction: "\\boxed{}"
  index_field: id
  kv_mapping:
    chosen: chosen
    reject: reject
    id: index
    prompt: prompt
dist_load_data_barrier: False

extended_vocab:

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 2048

# Dataloader
num_workers: 8
prefetch_factor: 2

model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest

dp_size:
tp_size: 1
pp_size: 1

exp_name: llama3.1.8b.numina.process-dpo-sc-p0.5.iter1.split0123.H100.dp16.v1.0.s${seed}
exp_notes:

output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Exponent floats carry an explicit mantissa decimal so YAML 1.1 loaders
# (e.g. plain PyYAML) also read them as floats rather than strings.
# learning_rate: 1.0e-4
learning_rate: 1.0e-6
# learning_rate: 2.0e-5
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1.0e-6
adam_betas: "(0.9, 0.98)"
# adam_betas: "(0.9, 0.999)"
# max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0

# Optimizer
optimizer:
use_nvlamb:
bit_training:

logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
  metric: "loss"
  measure: -1
  best_checkpoint:
  best_result:

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# Overrides merged on top of the DeepSpeed preset chosen in `defaults`; batch
# size, accumulation, learning rate and weight decay are interpolated from the
# top-level keys of this file.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level adam_betas "(0.9, 0.98)" —
      # confirm which value the trainer actually applies.
      betas: [ 0.9, 0.95 ]
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1

summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys:
  outputs_index_or_keys:
    "train/chosen_reward": chosen_reward
    "train/rejected_reward": rejected_reward

# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter1-split0123-p0.5-v1.1-h100.yaml
================================================
# Hydra composition order: the base `hydra` group, the DeepSpeed preset mounted
# at `ds_cfg`, then this file (`_self_`), so keys defined here are merged last.
defaults:
  - hydra: default
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

hydra:
  searchpath:
    - file://conf/

# Storage roots used by the interpolations below.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# Checkpoint directory reused below through `${sft_model_dir}`.
sft_model_dir: ${output_path_prefix}/experiments/llama3.1.8b.mathscale4o.process-dpo.iter0.A100.dp8.v2.2.s42/checkpoint-1200/
train_file: ${data_path_prefix}/dataset/NuminaMath/numina-math-cot.de_con.label.json
dev_file:
test_file:

eos_token: "<|end_of_text|>"
eos_token_id: 128001

torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16

tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left
  # The EOS token is reused as the padding token.
  pad_token: ${eos_token}

device_map:
  _target_: models.utils.return_single_device_map
# Policy model built together with a reference model. The reference model is
# loaded from `${model_name_or_path}`; padding uses the EOS token id.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.5
  sft_loss: True
  sft_loss_weight: 0.2
  gradient_checkpointing: True
  attn_implementation: "flash_attention_2"
  # attn_implementation: "sdpa"
  torch_dtype: ${torch_dtype}
  pad_token_id: ${eos_token_id}
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: "flash_attention_2"
    # attn_implementation: "sdpa"
    device_map: ${device_map}
    pad_token_id: ${eos_token_id}
# Training dataset: a chain of aligners followed by the prompt template.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: id
        extract_index_field: idx
        extract_fields: [ value, prefix ]
        extra_file: ${sft_model_dir}/numina/cot.de_con.n16.tem1.0.p1.0.split0123.upper0.8.r0.3.sample8.filter_same.process_rm.sc-p0.5.azure.json
      - _target_: data.input_aligner.value2pair_mapping_aligner
        field: prefix
        pos_field: pos_prefix
        neg_field: neg_prefix
        value_field: value
        value_mapping_func:
          _target_: data.input_aligner.return_threshold_mapping
          value_threshold: 2
      - _target_: data.input_aligner.dpo_bi_random_choice_aligner
        pos_field: pos_prefix
        neg_field: neg_prefix
        # sort_accord_to_len: True
        # top_k: 5
        # num_workers: 32
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
      # pos: "{pos}"
      # neg: "{neg}"
      pos: "{pos_prefix}"
      neg: "{neg_prefix}"
      # Defined but not referenced by any composition below.
      chat_suffix: ${eos_token}
    compositions:
      prompt: "{chat_prefix}"
      chosen: "{chat_prefix}{pos}"
      reject: "{chat_prefix}{neg}"
  instruction: "\\boxed{}"
  index_field: id
  kv_mapping:
    chosen: chosen
    reject: reject
    id: index
    prompt: prompt
dist_load_data_barrier: False

extended_vocab:

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 2048

# Dataloader
num_workers: 8
prefetch_factor: 2

model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest

dp_size:
tp_size: 1
pp_size: 1

exp_name: llama3.1.8b.numina.process-dpo-sc-p0.5.iter1.split0123.H100.dp16.v1.1.s${seed}
exp_notes:

output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Exponent floats carry an explicit mantissa decimal so YAML 1.1 loaders
# (e.g. plain PyYAML) also read them as floats rather than strings.
# learning_rate: 1.0e-4
# learning_rate: 1.0e-6
learning_rate: 7.0e-7
# learning_rate: 2.0e-5
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1.0e-6
adam_betas: "(0.9, 0.98)"
# adam_betas: "(0.9, 0.999)"
# max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0

# Optimizer
optimizer:
use_nvlamb:
bit_training:

logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
  metric: "loss"
  measure: -1
  best_checkpoint:
  best_result:

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# Overrides merged on top of the DeepSpeed preset chosen in `defaults`; batch
# size, accumulation, learning rate and weight decay are interpolated from the
# top-level keys of this file.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level adam_betas "(0.9, 0.98)" —
      # confirm which value the trainer actually applies.
      betas: [ 0.9, 0.95 ]
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1

summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys:
  outputs_index_or_keys:
    "train/chosen_reward": chosen_reward
    "train/rejected_reward": rejected_reward

# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter1-split23-p0.0-v1.0-h100.yaml
================================================
# Hydra composition order: the base `hydra` group, the DeepSpeed preset mounted
# at `ds_cfg`, then this file (`_self_`), so keys defined here are merged last.
defaults:
  - hydra: default
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

hydra:
  searchpath:
    - file://conf/

# Storage roots used by the interpolations below.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# Checkpoint directory reused below through `${sft_model_dir}`.
sft_model_dir: ${output_path_prefix}/experiments/llama3.1.8b.mathscale4o.process-dpo.iter0.A100.dp8.v2.2.s42/checkpoint-1200/
train_file: ${data_path_prefix}/dataset/NuminaMath/numina-math-cot.de_con.label.json
dev_file:
test_file:

eos_token: "<|end_of_text|>"
eos_token_id: 128001

torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16

tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left
  # The EOS token is reused as the padding token.
  pad_token: ${eos_token}

device_map:
  _target_: models.utils.return_single_device_map
# Policy model built together with a reference model. The reference model is
# loaded from `${model_name_or_path}`; padding uses the EOS token id.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.5
  sft_loss: True
  sft_loss_weight: 0.2
  gradient_checkpointing: True
  attn_implementation: "flash_attention_2"
  # attn_implementation: "sdpa"
  torch_dtype: ${torch_dtype}
  pad_token_id: ${eos_token_id}
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: "flash_attention_2"
    # attn_implementation: "sdpa"
    device_map: ${device_map}
    pad_token_id: ${eos_token_id}
# Training dataset: a chain of aligners followed by the prompt template.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: id
        extract_index_field: idx
        extract_fields: [ value, prefix ]
        extra_file: ${sft_model_dir}/numina/cot.de_con.n16.tem1.0.p1.0.split23.upper0.8.r0.3.sample8.filter_same.process_rm.sc-p0.0.azure.json
      - _target_: data.input_aligner.value2pair_mapping_aligner
        field: prefix
        pos_field: pos_prefix
        neg_field: neg_prefix
        value_field: value
        value_mapping_func:
          _target_: data.input_aligner.return_threshold_mapping
          value_threshold: 1
      - _target_: data.input_aligner.dpo_bi_random_choice_aligner
        pos_field: pos_prefix
        neg_field: neg_prefix
        # sort_accord_to_len: True
        # top_k: 5
        # num_workers: 32
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
      # pos: "{pos}"
      # neg: "{neg}"
      pos: "{pos_prefix}"
      neg: "{neg_prefix}"
      # Defined but not referenced by any composition below.
      chat_suffix: ${eos_token}
    compositions:
      prompt: "{chat_prefix}"
      chosen: "{chat_prefix}{pos}"
      reject: "{chat_prefix}{neg}"
  instruction: "\\boxed{}"
  index_field: id
  kv_mapping:
    chosen: chosen
    reject: reject
    id: index
    prompt: prompt
dist_load_data_barrier: False

extended_vocab:

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 2048

# Dataloader
num_workers: 8
prefetch_factor: 2

model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest

dp_size:
tp_size: 1
pp_size: 1

exp_name: llama3.1.8b.numina.process-dpo-sc-p0.0.iter1.split23.H100.dp8.v1.0.s${seed}
exp_notes:

output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Exponent floats carry an explicit mantissa decimal so YAML 1.1 loaders
# (e.g. plain PyYAML) also read them as floats rather than strings.
# learning_rate: 1.0e-4
learning_rate: 1.0e-6
# learning_rate: 2.0e-5
gradient_accumulation_steps: 64
weight_decay: 0.1
adam_epsilon: 1.0e-6
adam_betas: "(0.9, 0.98)"
# adam_betas: "(0.9, 0.999)"
# max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0

# Optimizer
optimizer:
use_nvlamb:
bit_training:

logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
  metric: "loss"
  measure: -1
  best_checkpoint:
  best_result:

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# Overrides merged on top of the DeepSpeed preset chosen in `defaults`; batch
# size, accumulation, learning rate and weight decay are interpolated from the
# top-level keys of this file.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level adam_betas "(0.9, 0.98)" —
      # confirm which value the trainer actually applies.
      betas: [ 0.9, 0.95 ]
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1

summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys:
  outputs_index_or_keys:
    "train/chosen_reward": chosen_reward
    "train/rejected_reward": rejected_reward

# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter1-split23-p0.0-v1.1-h100.yaml
================================================
# Hydra composition order: the base `hydra` group, the DeepSpeed preset mounted
# at `ds_cfg`, then this file (`_self_`), so keys defined here are merged last.
defaults:
  - hydra: default
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

hydra:
  searchpath:
    - file://conf/

# Storage roots used by the interpolations below.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# Checkpoint directory reused below through `${sft_model_dir}`.
sft_model_dir: ${output_path_prefix}/experiments/llama3.1.8b.mathscale4o.process-dpo.iter0.A100.dp8.v2.2.s42/checkpoint-1200/
train_file: ${data_path_prefix}/dataset/NuminaMath/numina-math-cot.de_con.label.json
dev_file:
test_file:

eos_token: "<|end_of_text|>"
eos_token_id: 128001

torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16

tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left
  # The EOS token is reused as the padding token.
  pad_token: ${eos_token}

device_map:
  _target_: models.utils.return_single_device_map
# Policy model built together with a reference model. The reference model is
# loaded from `${model_name_or_path}`; padding uses the EOS token id.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.5
  sft_loss: True
  sft_loss_weight: 0.2
  gradient_checkpointing: True
  attn_implementation: "flash_attention_2"
  # attn_implementation: "sdpa"
  torch_dtype: ${torch_dtype}
  pad_token_id: ${eos_token_id}
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: "flash_attention_2"
    # attn_implementation: "sdpa"
    device_map: ${device_map}
    pad_token_id: ${eos_token_id}
# Training dataset: a chain of aligners followed by the prompt template.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: id
        extract_index_field: idx
        extract_fields: [ value, prefix ]
        extra_file: ${sft_model_dir}/numina/cot.de_con.n16.tem1.0.p1.0.split23.upper0.8.r0.3.sample8.filter_same.process_rm.sc-p0.0.azure.json
      - _target_: data.input_aligner.value2pair_mapping_aligner
        field: prefix
        pos_field: pos_prefix
        neg_field: neg_prefix
        value_field: value
        value_mapping_func:
          _target_: data.input_aligner.return_threshold_mapping
          value_threshold: 1
      - _target_: data.input_aligner.dpo_random_choice_aligner
        anchor_field: pos_prefix
        paired_field: neg_prefix
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
      # pos: "{pos}"
      # neg: "{neg}"
      pos: "{pos_prefix}"
      neg: "{neg_prefix}"
      # Defined but not referenced by any composition below.
      chat_suffix: ${eos_token}
    compositions:
      prompt: "{chat_prefix}"
      chosen: "{chat_prefix}{pos}"
      reject: "{chat_prefix}{neg}"
  instruction: "\\boxed{}"
  index_field: id
  kv_mapping:
    chosen: chosen
    reject: reject
    id: index
    prompt: prompt
dist_load_data_barrier: False

extended_vocab:

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 2048

# Dataloader
num_workers: 8
prefetch_factor: 2

model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest

dp_size:
tp_size: 1
pp_size: 1

# NOTE(review): the run name says A100.dp16 while this config file is the
# `-h100` variant — confirm the label is intended.
exp_name: llama3.1.8b.numina.process-dpo-sc-p0.0.iter1.split23.A100.dp16.v1.1.s${seed}
exp_notes:

output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Exponent floats carry an explicit mantissa decimal so YAML 1.1 loaders
# (e.g. plain PyYAML) also read them as floats rather than strings.
# learning_rate: 1.0e-4
learning_rate: 1.0e-6
# learning_rate: 2.0e-5
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1.0e-6
adam_betas: "(0.9, 0.98)"
# adam_betas: "(0.9, 0.999)"
# max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0

# Optimizer
optimizer:
use_nvlamb:
bit_training:

logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
  metric: "loss"
  measure: -1
  best_checkpoint:
  best_result:

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# Overrides merged on top of the DeepSpeed preset chosen in `defaults`; batch
# size, accumulation, learning rate and weight decay are interpolated from the
# top-level keys of this file.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level adam_betas "(0.9, 0.98)" —
      # confirm which value the trainer actually applies.
      betas: [ 0.9, 0.95 ]
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1

summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys:
  outputs_index_or_keys:
    "train/chosen_reward": chosen_reward
    "train/rejected_reward": rejected_reward

# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter1-split23-p0.5-v1.0-h100.yaml
================================================
# Hydra composition order: the base `hydra` group, the DeepSpeed preset mounted
# at `ds_cfg`, then this file (`_self_`), so keys defined here are merged last.
defaults:
  - hydra: default
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

hydra:
  searchpath:
    - file://conf/

# Storage roots used by the interpolations below.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# Checkpoint directory reused below through `${sft_model_dir}`.
sft_model_dir: ${output_path_prefix}/experiments/llama3.1.8b.mathscale4o.process-dpo.iter0.A100.dp8.v2.2.s42/checkpoint-1200/
train_file: ${data_path_prefix}/dataset/NuminaMath/numina-math-cot.de_con.label.json
dev_file:
test_file:

eos_token: "<|end_of_text|>"
eos_token_id: 128001

torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16

tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left
  # The EOS token is reused as the padding token.
  pad_token: ${eos_token}

device_map:
  _target_: models.utils.return_single_device_map
# Policy model built together with a reference model. The reference model is
# loaded from `${model_name_or_path}`; padding uses the EOS token id.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.5
  sft_loss: True
  sft_loss_weight: 0.2
  gradient_checkpointing: True
  attn_implementation: "flash_attention_2"
  # attn_implementation: "sdpa"
  torch_dtype: ${torch_dtype}
  pad_token_id: ${eos_token_id}
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: "flash_attention_2"
    # attn_implementation: "sdpa"
    device_map: ${device_map}
    pad_token_id: ${eos_token_id}
# Training dataset: a chain of aligners followed by the prompt template.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: id
        extract_index_field: idx
        extract_fields: [ value, prefix ]
        extra_file: ${sft_model_dir}/numina/cot.de_con.n16.tem1.0.p1.0.split23.upper0.8.r0.3.sample8.filter_same.process_rm.sc-p0.5.azure.json
      - _target_: data.input_aligner.value2pair_mapping_aligner
        field: prefix
        pos_field: pos_prefix
        neg_field: neg_prefix
        value_field: value
        value_mapping_func:
          _target_: data.input_aligner.return_threshold_mapping
          value_threshold: 1
      - _target_: data.input_aligner.dpo_bi_random_choice_aligner
        pos_field: pos_prefix
        neg_field: neg_prefix
        # sort_accord_to_len: True
        # top_k: 5
        # num_workers: 32
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
      # pos: "{pos}"
      # neg: "{neg}"
      pos: "{pos_prefix}"
      neg: "{neg_prefix}"
      # Defined but not referenced by any composition below.
      chat_suffix: ${eos_token}
    compositions:
      prompt: "{chat_prefix}"
      chosen: "{chat_prefix}{pos}"
      reject: "{chat_prefix}{neg}"
  instruction: "\\boxed{}"
  index_field: id
  kv_mapping:
    chosen: chosen
    reject: reject
    id: index
    prompt: prompt
dist_load_data_barrier: False

extended_vocab:

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 2048

# Dataloader
num_workers: 8
prefetch_factor: 2

model_name_or_path: ${sft_model_dir}
pretrain:
resume: latest

dp_size:
tp_size: 1
pp_size: 1

# NOTE(review): the run name says A100.dp16 while this config file is the
# `-h100` variant. The same name (with seed 42) is used verbatim as another
# config's `split23_model_name`, so confirm before renaming.
exp_name: llama3.1.8b.numina.process-dpo-sc-p0.5.iter1.split23.A100.dp16.v1.0.s${seed}
exp_notes:

output_dir: ${output_path_prefix}experiments/${exp_name}  # Fix

do_train: True
evaluate_during_training: False
do_eval: False
eval_sub_path: checkpoint-*

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# Exponent floats carry an explicit mantissa decimal so YAML 1.1 loaders
# (e.g. plain PyYAML) also read them as floats rather than strings.
# learning_rate: 1.0e-4
learning_rate: 1.0e-6
# learning_rate: 2.0e-5
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1.0e-6
adam_betas: "(0.9, 0.98)"
# adam_betas: "(0.9, 0.999)"
# max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0

# Optimizer
optimizer:
use_nvlamb:
bit_training:

logging_steps: 1
save_ds_state: True
save_steps: 100
save_best: False
eval_steps: 100
ddp_eval: True
no_cuda: False
seed: 42
local_rank: -1
fp16: True
fp16_opt_level: O1
fp16_bfloat16: True
# Prediction config
prediction_cfg:
  metric: "loss"
  measure: -1
  best_checkpoint:
  best_result:

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# Overrides merged on top of the DeepSpeed preset chosen in `defaults`; batch
# size, accumulation, learning rate and weight decay are interpolated from the
# top-level keys of this file.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      # NOTE(review): differs from the top-level adam_betas "(0.9, 0.98)" —
      # confirm which value the trainer actually applies.
      betas: [ 0.9, 0.95 ]
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1

summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys:
  outputs_index_or_keys:
    "train/chosen_reward": chosen_reward
    "train/rejected_reward": rejected_reward

# Temporary variables
n_gpu:
device:
train_batch_size:
eval_batch_size:
world_size:
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter1-split23-p0.5-v1.1-h100.yaml
================================================
# Hydra composition order: the base `hydra` group, the DeepSpeed preset mounted
# at `ds_cfg`, then this file (`_self_`), so keys defined here are merged last.
defaults:
  - hydra: default
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
  - _self_  # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

hydra:
  searchpath:
    - file://conf/

# Storage roots used by the interpolations below.
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# Checkpoint directory reused below through `${sft_model_dir}`.
sft_model_dir: ${output_path_prefix}/experiments/llama3.1.8b.mathscale4o.process-dpo.iter0.A100.dp8.v2.2.s42/checkpoint-1200/
train_file: ${data_path_prefix}/dataset/NuminaMath/numina-math-cot.de_con.label.json
dev_file:
test_file:

eos_token: "<|end_of_text|>"
eos_token_id: 128001

torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16

tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left
  # The EOS token is reused as the padding token.
  pad_token: ${eos_token}

device_map:
  _target_: models.utils.return_single_device_map
# Policy model built together with a reference model. The reference model is
# loaded from `${model_name_or_path}`; padding uses the EOS token id.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  beta: 0.5
  sft_loss: True
  sft_loss_weight: 0.2
  gradient_checkpointing: True
  attn_implementation: "flash_attention_2"
  # attn_implementation: "sdpa"
  torch_dtype: ${torch_dtype}
  pad_token_id: ${eos_token_id}
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: "flash_attention_2"
    # attn_implementation: "sdpa"
    device_map: ${device_map}
    pad_token_id: ${eos_token_id}
# Training dataset: a chain of aligners followed by the prompt template.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: id
        extract_index_field: idx
        extract_fields: [ value, prefix ]
        extra_file: ${sft_model_dir}/numina/cot.de_con.n16.tem1.0.p1.0.split23.upper0.8.r0.3.sample8.filter_same.process_rm.sc-p0.5.azure.json
      - _target_: data.input_aligner.value2pair_mapping_aligner
        field: prefix
        pos_field: pos_prefix
        neg_field: neg_prefix
        value_field: value
        value_mapping_func:
          _target_: data.input_aligner.return_threshold_mapping
          value_threshold: 1
      - _target_: data.input_aligner.dpo_random_choice_aligner
        anchor_field: pos_prefix
        paired_field: neg_prefix
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
      pos: "{pos_prefix}"
      neg: "{neg_prefix}"
      # Defined but not referenced by any composition below.
      chat_suffix: ${eos_token}
    compositions:
      prompt: "{chat_prefix}"
      chosen: "{chat_prefix}{pos}"
      reject: "{chat_prefix}{neg}"
  instruction: "\\boxed{}"
  index_field: id
  kv_mapping:
    chosen: chosen
    reject: reject
    id: index
    prompt: prompt
dist_load_data_barrier: false

extended_vocab: null

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 2048

# Dataloader
num_workers: 8
prefetch_factor: 2

# Weights are loaded from the starting checkpoint declared in `sft_model_dir`.
model_name_or_path: ${sft_model_dir}
pretrain: null
resume: latest

dp_size: null
tp_size: 1
pp_size: 1

# Run identity; `output_dir` is derived from `exp_name`.
exp_name: llama3.1.8b.numina.process-dpo-sc-p0.5.iter1.split23.A100.dp16.v1.1.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix

do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: "checkpoint-*"

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# learning_rate: 1e-4
learning_rate: 1e-6
# learning_rate: 2e-5
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): differs from `ds_cfg.optimizer.params.betas` ([0.9, 0.95]);
# confirm which of the two the training loop actually uses.
adam_betas: '(0.9, 0.98)'
# adam_betas: "(0.9, 0.999)"
# max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0

# Optimizer
optimizer: null
use_nvlamb: null
bit_training: null

logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
# Prediction config
prediction_cfg:
  metric: loss
  measure: -1
  best_checkpoint: null
  best_result: null

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# Engine settings; batch size, accumulation, learning rate and weight decay are
# interpolated from the top-level training hyper-parameters.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # Disabled precision settings, kept for reference:
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1

# Metric writer: maps logged metric names to keys of the model outputs.
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    "train/chosen_reward": chosen_reward
    "train/rejected_reward": rejected_reward

# Temporary variables
# Left null in the config. NOTE(review): presumably filled in at runtime - confirm.
n_gpu: null
device: null
train_batch_size: null
eval_batch_size: null
world_size: null
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter2-split01-23-p0.5-v1.0-h100.yaml
================================================
# Hydra defaults list. The DeepSpeed preset is placed under the `ds_cfg` key;
# `_self_` comes last, so values set in this file are composed after the presets.
defaults:
  - hydra: default
  - deepspeed@ds_cfg: train_hybrid_engine_zero2_cosine
  - _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

# Adds the local conf/ directory to Hydra's config search path.
hydra:
  searchpath:
    - file://conf/
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
# Root of all experiment outputs. It already ends with "/", so values built
# from it must not add another leading "/".
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# Checkpoint this run starts from; reused below as `model_name_or_path` and as
# the base directory of the aligner's `extra_file`.
# Joined without an extra "/" (same form as `output_dir`) so the resolved path
# contains no "//" segment.
sft_model_dir: ${output_path_prefix}experiments/llama3.1.8b.numina.process-dpo-sc-p0.5.iter1.split01.H100.dp8.v1.0.s42/checkpoint-200

train_file: ${data_path_prefix}/dataset/NuminaMath/numina-math-cot.de_con.label.json
dev_file: null
test_file: null

# End-of-sequence token; also reused as the padding token by `tokenizer_init`
# and (through its id) by `model`.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# Shared dtype node; referenced as ${torch_dtype} by `model` and `model.ref_model`.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16

# Tokenizer factory. Loads from the same path as the model, pads on the left,
# and uses the EOS token as the padding token.
tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left
  pad_token: ${eos_token}

device_map:
  _target_: models.utils.return_single_device_map

# Policy model with a nested reference model built from the same checkpoint.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  # NOTE(review): presumably the DPO temperature - confirm in LlamaForCausalLMDPO.
  beta: 0.5
  sft_loss: true
  sft_loss_weight: 0.2
  gradient_checkpointing: true
  attn_implementation: flash_attention_2
  # attn_implementation: sdpa
  torch_dtype: ${torch_dtype}
  pad_token_id: ${eos_token_id}
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: flash_attention_2
    # attn_implementation: sdpa
    device_map: ${device_map}
    pad_token_id: ${eos_token_id}
# Training dataset definition: a chain of aligners that attach prefix pairs to
# each example, followed by a template that renders prompt/chosen/reject text.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Step 1: reads `value` and `prefix` from `extra_file`.
      # NOTE(review): presumably matched by `id` (dataset) == `idx` (extra file) - confirm.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: id
        extract_index_field: idx
        extract_fields: [ value, prefix ]
        extra_file: ${sft_model_dir}/numina/cot.de_con.n16.tem1.0.p1.0.split23.upper0.8.r0.3.sample8.filter_same.process_rm.sc-p0.5.azure.json
      # Step 2: writes `pos_prefix` / `neg_prefix` from `prefix` using `value`.
      # NOTE(review): presumably `value_threshold` separates positives from negatives - confirm.
      - _target_: data.input_aligner.value2pair_mapping_aligner
        field: prefix
        pos_field: pos_prefix
        neg_field: neg_prefix
        value_field: value
        value_mapping_func:
          _target_: data.input_aligner.return_threshold_mapping
          value_threshold: 1
      # Step 3: builds pairs from `pos_prefix` and `neg_prefix`; the options
      # commented out below are currently disabled.
      - _target_: data.input_aligner.dpo_bi_random_choice_aligner
        pos_field: pos_prefix
        neg_field: neg_prefix
        # sort_accord_to_len: True
        # top_k: 5
        # num_workers: 32
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
      # pos: "{pos}"
      # neg: "{neg}"
      pos: "{pos_prefix}"
      neg: "{neg_prefix}"
      # Declared here but not referenced by any entry under `compositions`.
      chat_suffix: ${eos_token}
    compositions:
      prompt: "{chat_prefix}"
      chosen: "{chat_prefix}{pos}"
      reject: "{chat_prefix}{neg}"
  # Text substituted for the {instruction} placeholder in `chat_prefix`.
  instruction: "\\boxed{}"
  index_field: id
  kv_mapping:
    chosen: chosen
    reject: reject
    id: index
    prompt: prompt
dist_load_data_barrier: false

extended_vocab: null

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 2048

# Dataloader
num_workers: 8
prefetch_factor: 2

# Weights are loaded from the starting checkpoint declared in `sft_model_dir`.
model_name_or_path: ${sft_model_dir}
pretrain: null
resume: latest

dp_size: null
tp_size: 1
pp_size: 1

# Run identity; `output_dir` is derived from `exp_name`.
# NOTE(review): this config's file name ends in "-h100" while the name below
# carries "A100.dp16" - confirm the tag before relying on it.
exp_name: llama3.1.8b.numina.process-dpo-sc-p0.5.iter2.split01-23.A100.dp16.v1.0.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix

do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: "checkpoint-*"

# Training hyper-parameters
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
# learning_rate: 1e-4
learning_rate: 1e-6
# learning_rate: 2e-5
gradient_accumulation_steps: 32
weight_decay: 0.1
adam_epsilon: 1e-6
# NOTE(review): differs from `ds_cfg.optimizer.params.betas` ([0.9, 0.95]);
# confirm which of the two the training loop actually uses.
adam_betas: '(0.9, 0.98)'
# adam_betas: "(0.9, 0.999)"
# max_grad_norm: 0.0
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: 0
warmup_proportion: 0.03
warmup_steps: 0

# Optimizer
optimizer: null
use_nvlamb: null
bit_training: null

logging_steps: 1
save_ds_state: true
save_steps: 100
save_best: false
eval_steps: 100
ddp_eval: true
no_cuda: false
seed: 42
local_rank: -1
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
# Prediction config
prediction_cfg:
  metric: loss
  measure: -1
  best_checkpoint: null
  best_result: null

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.DPOEvalPostProcessor

# Overrides on top of the DeepSpeed preset selected in `defaults`; batch size,
# accumulation, learning rate and weight decay are interpolated from the
# top-level training hyper-parameters.
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.95
      weight_decay: ${weight_decay}
  steps_per_print: 1
  # Disabled precision settings, kept for reference:
  # bf16:
  #   enabled: False
  # fp16:
  #   enabled: True
  #   auto_cast: False
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   consecutive_hysteresis: False
  #   min_loss_scale: 1

# Metric writer: maps logged metric names to keys of the model outputs.
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys:
    "train/chosen_reward": chosen_reward
    "train/rejected_reward": rejected_reward

# Temporary variables
# Left null in the config. NOTE(review): presumably filled in at runtime - confirm.
n_gpu: null
device: null
train_batch_size: null
eval_batch_size: null
world_size: null
================================================
FILE: PFPO/conf/exp/mathscale/llama/dpo/numina-co/llama3.1-pdpo-iter2-split01-23-p0.5-v1.1-h100.yaml
================================================
# Hydra defaults list. The DeepSpeed preset is placed under the `ds_cfg` key;
# `_self_` comes last, so values set in this file are composed after the presets.
defaults:
  - hydra: default
  - deepspeed@ds_cfg: train_hybrid_engine_zero1_optim_offload_cosine
  - _self_ # see here for more details: https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/#composition-order-of-primary-config

# Adds the local conf/ directory to Hydra's config search path.
hydra:
  searchpath:
    - file://conf/
data_path_prefix: /mnt/fangkai_blob/share
model_path_prefix: /mnt/fangkai_blob/share/models/
# Root of all experiment outputs. It already ends with "/", so values built
# from it must not add another leading "/".
output_path_prefix: /mnt/fangkai_blob/reward_modeling/

# Checkpoint this run starts from; reused below as `model_name_or_path` and as
# the base directory of the aligner's `extra_file`.
# Joined without an extra "/" (same form as `output_dir`) so the resolved path
# contains no "//" segment.
sft_model_dir: ${output_path_prefix}experiments/llama3.1.8b.numina.process-dpo-sc-p0.5.iter1.split01.H100.dp8.v1.0.s42/checkpoint-200

train_file: ${data_path_prefix}/dataset/NuminaMath/numina-math-cot.de_con.label.json
dev_file: null
test_file: null

# End-of-sequence token; also reused as the padding token by `tokenizer_init`
# and (through its id) by `model`.
eos_token: "<|end_of_text|>"
eos_token_id: 128001
# Shared dtype node; referenced as ${torch_dtype} by `model` and `model.ref_model`.
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16

# Tokenizer factory. Loads from the same path as the model, pads on the left,
# and uses the EOS token as the padding token.
tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: left
  pad_token: ${eos_token}

device_map:
  _target_: models.utils.return_single_device_map

# Policy model with a nested reference model built from the same checkpoint.
model:
  _target_: models.llama.LlamaForCausalLMDPO.from_pretrained_with_ref_model
  # NOTE(review): presumably the DPO temperature - confirm in LlamaForCausalLMDPO.
  beta: 0.5
  sft_loss: true
  sft_loss_weight: 1.0
  gradient_checkpointing: true
  attn_implementation: flash_attention_2
  # attn_implementation: sdpa
  torch_dtype: ${torch_dtype}
  pad_token_id: ${eos_token_id}
  device_map: ${device_map}
  ref_model:
    _target_: models.llama.LlamaForCausalLMDPO.from_pretrained
    pretrained_model_name_or_path: ${model_name_or_path}
    torch_dtype: ${torch_dtype}
    attn_implementation: flash_attention_2
    # attn_implementation: sdpa
    device_map: ${device_map}
    pad_token_id: ${eos_token_id}
# Training dataset definition: a chain of aligners that attach prefix pairs to
# each example, followed by a template that renders prompt/chosen/reject text.
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  aligner:
    _target_: data.input_aligner.concat_aligner
    aligners:
      # Step 1: reads `value` and `prefix` from `extra_file`.
      # NOTE(review): presumably matched by `id` (dataset) == `idx` (extra file) - confirm.
      - _target_: data.input_aligner.field_extract_aligner
        input_index_field: id
        extract_index_field: idx
        extract_fields: [ value, prefix ]
        extra_file: ${sft_model_dir}/numina/cot.de_con.n16.tem1.0.p1.0.split23.upper0.8.r0.3.sample8.filter_same.process_rm.sc-p0.5.azure.json
      # Step 2: writes `pos_prefix` / `neg_prefix` from `prefix` using `value`.
      # NOTE(review): presumably `value_threshold` separates positives from negatives - confirm.
      - _target_: data.input_aligner.value2pair_mapping_aligner
        field: prefix
        pos_field: pos_prefix
        neg_field: neg_prefix
        value_field: value
        value_mapping_func:
          _target_: data.input_aligner.return_threshold_mapping
          value_threshold: 1
      # Step 3: builds pairs from `pos_prefix` and `neg_prefix`; the options
      # commented out below are currently disabled.
      - _target_: data.input_aligner.dpo_bi_random_choice_aligner
        pos_field: pos_prefix
        neg_field: neg_prefix
        # sort_accord_to_len: True
        # top_k: 5
        # num_workers: 32
  template:
    _target_: data.input_utils.recompose_template
    units:
      chat_prefix: "{question}\n\nPlease put your final answer within {instruction}."
      # pos: "{pos}"
      # neg: "{neg}"
      pos: "{pos_prefix}"
      neg: "{neg_prefix}"
      # Declared here but not referenced by any entry under `compositions`.
      chat_suffix: ${eos_token}
    compositions:
      prompt: "{chat_prefix}"
      chosen: "{chat_prefix}{pos}"
      reject: "{chat_prefix}{neg}"
  # Text substituted for the {instruction} placeholder in `chat_prefix`.
  instruction: "\\boxed{}"
  index_field: id
  kv_mapping:
    chosen: chosen
    reject: reject
    id: index
    prompt: prompt
dist_load_data_barrier: false

extended_vocab: null

# Data collator
collator:
  _target_: data.general_collator.DPOCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 2048

# Dataloader
num_workers: 8
prefetch_factor: 2

# Weights are loaded from the starting checkpoint declared in `sft_model_dir`.
model_name_or_path: ${sft_model_dir}
pretrain: null
resume: latest

dp_size: null
tp_size: 1
pp_size: 1

# Run identity; `output_dir` is derived from `exp_name`.
# NOTE(review): this config's file name ends in "-h100" while the name below
# carries "A100.dp8" - confirm the tag before relying on it.
exp_name: llama3.1.8b.numina.process-dpo-sc-p0.5.iter2.split01-23.A100.dp8.v1.1.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name} # Fix