Copy disabled (too large)
Download .txt
Showing preview only (11,282K chars total). Download the full file to get everything.
Repository: open-compass/opencompass
Branch: main
Commit: 3cdd4c2343b3
Files: 3168
Total size: 10.1 MB
Directory structure:
gitextract_3v91281m/
├── .codespellrc
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── 1_bug-report.yml
│ │ ├── 2_feature-request.yml
│ │ ├── 3_bug-report_zh.yml
│ │ ├── 4_feature-request_zh.yml
│ │ └── config.yml
│ ├── pull_request_template.md
│ └── workflows/
│ ├── daily-ete-test.yml
│ ├── link-check.yml
│ ├── lint.yml
│ ├── pr-run-test.yml
│ ├── pr-stage-check.yml
│ ├── publish-to-pypi.yml
│ └── unit-test.yml
├── .gitignore
├── .owners.yml
├── .pre-commit-config-zh-cn.yaml
├── .pre-commit-config.yaml
├── LICENSE
├── MANIFEST.in
├── README.md
├── README_zh-CN.md
├── autotest/
│ ├── __init__.py
│ ├── cluster/
│ │ ├── __init__.py
│ │ └── chat_models.py
│ ├── eval/
│ │ ├── __init__.py
│ │ ├── eval_base_fullbench.py
│ │ ├── eval_base_longtext_fullbench.py
│ │ ├── eval_chat_longtext_fullbench.py
│ │ ├── eval_chat_obj_fullbench_other.py
│ │ ├── eval_chat_obj_fullbench_v5.py
│ │ ├── eval_chat_obj_fullbench_v6.py
│ │ ├── eval_chat_obj_fullbench_v7.py
│ │ ├── eval_chat_obj_fullbench_v8.py
│ │ ├── eval_chat_obj_v8.py
│ │ ├── eval_chat_sub_fullbench.py
│ │ └── models.py
│ ├── model/
│ │ ├── __init__.py
│ │ ├── base_datasets.py
│ │ ├── chat_datasets.py
│ │ ├── constant.py
│ │ ├── infer_api.py
│ │ ├── infer_api_rollout.py
│ │ ├── infer_lmdeploy_base.py
│ │ ├── infer_lmdeploy_chat.py
│ │ ├── infer_transformers_base.py
│ │ ├── infer_transformers_chat.py
│ │ ├── infer_vllm_base.py
│ │ └── infer_vllm_chat.py
│ ├── oc_score_baseline.yaml
│ └── utils/
│ ├── compare_results.py
│ ├── health_check.py
│ └── oc_score_assert.py
├── dataset-index.yml
├── docs/
│ ├── en/
│ │ ├── .readthedocs.yaml
│ │ ├── Makefile
│ │ ├── _static/
│ │ │ ├── css/
│ │ │ │ └── readthedocs.css
│ │ │ └── js/
│ │ │ └── custom.js
│ │ ├── _templates/
│ │ │ ├── 404.html
│ │ │ ├── autosummary/
│ │ │ │ └── class.rst
│ │ │ └── callable.rst
│ │ ├── advanced_guides/
│ │ │ ├── accelerator_intro.md
│ │ │ ├── circular_eval.md
│ │ │ ├── code_eval.md
│ │ │ ├── code_eval_service.md
│ │ │ ├── contamination_eval.md
│ │ │ ├── custom_dataset.md
│ │ │ ├── evaluation_lightllm.md
│ │ │ ├── evaluation_lmdeploy.md
│ │ │ ├── llm_judge.md
│ │ │ ├── longeval.md
│ │ │ ├── math_verify.md
│ │ │ ├── needleinahaystack_eval.md
│ │ │ ├── new_dataset.md
│ │ │ ├── new_model.md
│ │ │ ├── objective_judgelm_evaluation.md
│ │ │ ├── persistence.md
│ │ │ ├── prompt_attack.md
│ │ │ └── subjective_evaluation.md
│ │ ├── conf.py
│ │ ├── docutils.conf
│ │ ├── get_started/
│ │ │ ├── faq.md
│ │ │ ├── installation.md
│ │ │ └── quick_start.md
│ │ ├── index.rst
│ │ ├── notes/
│ │ │ ├── academic.md
│ │ │ ├── contribution_guide.md
│ │ │ └── news.md
│ │ ├── prompt/
│ │ │ ├── chain_of_thought.md
│ │ │ ├── meta_template.md
│ │ │ ├── overview.md
│ │ │ └── prompt_template.md
│ │ ├── statis.py
│ │ ├── tools.md
│ │ └── user_guides/
│ │ ├── config.md
│ │ ├── corebench.md
│ │ ├── datasets.md
│ │ ├── deepseek_r1.md
│ │ ├── evaluation.md
│ │ ├── experimentation.md
│ │ ├── framework_overview.md
│ │ ├── interns1.md
│ │ ├── metrics.md
│ │ ├── models.md
│ │ └── summarizer.md
│ └── zh_cn/
│ ├── .readthedocs.yaml
│ ├── Makefile
│ ├── _static/
│ │ ├── css/
│ │ │ └── readthedocs.css
│ │ └── js/
│ │ └── custom.js
│ ├── _templates/
│ │ ├── 404.html
│ │ ├── autosummary/
│ │ │ └── class.rst
│ │ └── callable.rst
│ ├── advanced_guides/
│ │ ├── accelerator_intro.md
│ │ ├── circular_eval.md
│ │ ├── code_eval.md
│ │ ├── code_eval_service.md
│ │ ├── compassbench_intro.md
│ │ ├── compassbench_v2_0.md
│ │ ├── contamination_eval.md
│ │ ├── custom_dataset.md
│ │ ├── evaluation_lightllm.md
│ │ ├── evaluation_lmdeploy.md
│ │ ├── llm_judge.md
│ │ ├── longeval.md
│ │ ├── math_verify.md
│ │ ├── needleinahaystack_eval.md
│ │ ├── new_dataset.md
│ │ ├── new_model.md
│ │ ├── objective_judgelm_evaluation.md
│ │ ├── persistence.md
│ │ ├── prompt_attack.md
│ │ └── subjective_evaluation.md
│ ├── conf.py
│ ├── cp_origin_docs.sh
│ ├── docutils.conf
│ ├── get_started/
│ │ ├── faq.md
│ │ ├── installation.md
│ │ └── quick_start.md
│ ├── index.rst
│ ├── notes/
│ │ ├── academic.md
│ │ ├── contribution_guide.md
│ │ └── news.md
│ ├── prompt/
│ │ ├── chain_of_thought.md
│ │ ├── meta_template.md
│ │ ├── overview.md
│ │ └── prompt_template.md
│ ├── statis.py
│ ├── tools.md
│ └── user_guides/
│ ├── config.md
│ ├── corebench.md
│ ├── datasets.md
│ ├── deepseek_r1.md
│ ├── evaluation.md
│ ├── experimentation.md
│ ├── framework_overview.md
│ ├── interns1.md
│ ├── metrics.md
│ ├── models.md
│ └── summarizer.md
├── examples/
│ ├── eval_OlympiadBench.py
│ ├── eval_PMMEval.py
│ ├── eval_ProcessBench.py
│ ├── eval_TheoremQA.py
│ ├── eval_academic_leaderboard_202407.py
│ ├── eval_academic_leaderboard_202412.py
│ ├── eval_academic_leaderboard_202502.py
│ ├── eval_academic_leaderboard_REALTIME.py
│ ├── eval_academic_telechat_thinking.py
│ ├── eval_alaya.py
│ ├── eval_api_demo.py
│ ├── eval_attack.py
│ ├── eval_babilong.py
│ ├── eval_base_demo.py
│ ├── eval_bench_intern_s1.py
│ ├── eval_bluelm_32k_lveval.py
│ ├── eval_cascade_evaluator.py
│ ├── eval_charm_mem.py
│ ├── eval_charm_rea.py
│ ├── eval_chat_agent.py
│ ├── eval_chat_agent_baseline.py
│ ├── eval_chat_demo.py
│ ├── eval_chat_last.py
│ ├── eval_chatml_datasets.py
│ ├── eval_chembench.py
│ ├── eval_chinese_simpleqa.py
│ ├── eval_cibench.py
│ ├── eval_cibench_api.py
│ ├── eval_circular.py
│ ├── eval_claude.py
│ ├── eval_code_passk.py
│ ├── eval_code_passk_repeat_dataset.py
│ ├── eval_codeagent.py
│ ├── eval_codebench_full.py
│ ├── eval_codegeex2.py
│ ├── eval_compassarena_subjectivebench.py
│ ├── eval_compassarena_subjectivebench_bradleyterry.py
│ ├── eval_contamination.py
│ ├── eval_corebench_2409_base_objective.py
│ ├── eval_corebench_2409_chat_objective.py
│ ├── eval_corebench_2409_longcontext.py
│ ├── eval_corebench_2409_subjective.py
│ ├── eval_deepseek_r1.py
│ ├── eval_dingo.py
│ ├── eval_ds1000_interpreter.py
│ ├── eval_edgellm_demo.py
│ ├── eval_eese_api_judge.py
│ ├── eval_gpt3.5.py
│ ├── eval_gpt4.py
│ ├── eval_hellobench.py
│ ├── eval_hf_llama2.py
│ ├── eval_hf_llama_7b.py
│ ├── eval_inference_ppl.py
│ ├── eval_internLM.py
│ ├── eval_intern_s1_pro.py
│ ├── eval_internlm2_chat_keyset.py
│ ├── eval_internlm2_keyset.py
│ ├── eval_internlm3_math500_thinking.py
│ ├── eval_internlm_7b.py
│ ├── eval_internlm_chat_lmdeploy_apiserver.py
│ ├── eval_internlm_chat_turbomind.py
│ ├── eval_internlm_flames_chat.py
│ ├── eval_internlm_lmdeploy_apiserver.py
│ ├── eval_internlm_math_chat.py
│ ├── eval_internlm_turbomind.py
│ ├── eval_judge_dataset_all.py
│ ├── eval_judgebench.py
│ ├── eval_judgerbench.py
│ ├── eval_judgerbenchv2.py
│ ├── eval_korbench.py
│ ├── eval_lightllm.py
│ ├── eval_livestembench.py
│ ├── eval_llama2_7b.py
│ ├── eval_llama2_7b_lveval.py
│ ├── eval_llama3_instruct.py
│ ├── eval_llm_compression.py
│ ├── eval_llm_judge.py
│ ├── eval_lmdeploy_demo.py
│ ├── eval_longbenchv2.py
│ ├── eval_math_llm_judge.py
│ ├── eval_math_llm_judge_internal.py
│ ├── eval_math_verify.py
│ ├── eval_mathbench.py
│ ├── eval_mmlu_cf.py
│ ├── eval_mmlu_pro.py
│ ├── eval_mmlu_with_zero_retriever_overwritten.py
│ ├── eval_model_rollout.py
│ ├── eval_modelscope_datasets.py
│ ├── eval_multi_prompt_demo.py
│ ├── eval_musr.py
│ ├── eval_needlebench_v2.py
│ ├── eval_qwen3.py
│ ├── eval_qwen_7b.py
│ ├── eval_qwen_7b_chat.py
│ ├── eval_qwen_7b_chat_lawbench.py
│ ├── eval_rewardbench.py
│ ├── eval_rmb.py
│ ├── eval_ruler.py
│ ├── eval_ruler_fix_tokenizer.py
│ ├── eval_rwkv5_3b.py
│ ├── eval_scireasoner.py
│ ├── eval_simpleqa.py
│ ├── eval_subjective.py
│ ├── eval_subjective_alpacaeval_official.py
│ ├── eval_subjective_bradleyterry.py
│ ├── eval_teval.py
│ └── eval_with_model_dataset_combinations.py
├── opencompass/
│ ├── __init__.py
│ ├── cli/
│ │ ├── __init__.py
│ │ └── main.py
│ ├── configs/
│ │ ├── chatml_datasets/
│ │ │ ├── AMO_Bench/
│ │ │ │ └── AMO_Bench_gen.py
│ │ │ ├── CPsyExam/
│ │ │ │ └── CPsyExam_gen.py
│ │ │ ├── CS_Bench/
│ │ │ │ └── CS_Bench_gen.py
│ │ │ ├── C_MHChem/
│ │ │ │ └── C_MHChem_gen.py
│ │ │ ├── HMMT2025/
│ │ │ │ └── HMMT2025_gen.py
│ │ │ ├── IMO_Bench_AnswerBench/
│ │ │ │ └── IMO_Bench_AnswerBench_gen.py
│ │ │ ├── MaScQA/
│ │ │ │ └── MaScQA_gen.py
│ │ │ ├── UGD_hard/
│ │ │ │ └── UGD_hard_gen.py
│ │ │ └── UGPhysics/
│ │ │ └── UGPhysics_gen.py
│ │ ├── dataset_collections/
│ │ │ └── chat_OC15.py
│ │ ├── datasets/
│ │ │ ├── ARC_Prize_Public_Evaluation/
│ │ │ │ ├── README.md
│ │ │ │ ├── arc_agi_2_public_evaluation_gen.py
│ │ │ │ ├── arc_prize_public_evaluation_gen.py
│ │ │ │ ├── arc_prize_public_evaluation_gen_872059.py
│ │ │ │ └── arc_prize_public_evaluation_gen_fedd04.py
│ │ │ ├── ARC_c/
│ │ │ │ ├── ARC_c_clean_ppl.py
│ │ │ │ ├── ARC_c_cot_gen_926652.py
│ │ │ │ ├── ARC_c_few_shot_gen_e9b043.py
│ │ │ │ ├── ARC_c_few_shot_ppl.py
│ │ │ │ ├── ARC_c_gen.py
│ │ │ │ ├── ARC_c_gen_1e0de5.py
│ │ │ │ ├── ARC_c_ppl.py
│ │ │ │ ├── ARC_c_ppl_2ef631.py
│ │ │ │ ├── ARC_c_ppl_a450bd.py
│ │ │ │ └── ARC_c_ppl_d52a21.py
│ │ │ ├── ARC_e/
│ │ │ │ ├── ARC_e_gen.py
│ │ │ │ ├── ARC_e_gen_1e0de5.py
│ │ │ │ ├── ARC_e_ppl.py
│ │ │ │ ├── ARC_e_ppl_2ef631.py
│ │ │ │ ├── ARC_e_ppl_a450bd.py
│ │ │ │ └── ARC_e_ppl_d52a21.py
│ │ │ ├── BeyondAIME/
│ │ │ │ ├── beyondaime_cascade_eval_gen_5e9f4f.py
│ │ │ │ └── beyondaime_gen.py
│ │ │ ├── CARDBiomedBench/
│ │ │ │ ├── CARDBiomedBench_llmjudge_gen_99a231.py
│ │ │ │ └── CARDBiomedBench_llmjudge_rawprompt_gen_b4d90c.py
│ │ │ ├── CHARM/
│ │ │ │ ├── README.md
│ │ │ │ ├── README_ZH.md
│ │ │ │ ├── charm_memory_gen_bbbd53.py
│ │ │ │ ├── charm_memory_settings.py
│ │ │ │ ├── charm_reason_cot_only_gen_f7b7d3.py
│ │ │ │ ├── charm_reason_gen.py
│ │ │ │ ├── charm_reason_gen_f8fca2.py
│ │ │ │ ├── charm_reason_ppl_3da4de.py
│ │ │ │ └── charm_reason_settings.py
│ │ │ ├── CIBench/
│ │ │ │ ├── CIBench_generation_gen_8ab0dc.py
│ │ │ │ ├── CIBench_generation_oracle_gen_c4a7c1.py
│ │ │ │ ├── CIBench_template_gen_e6b12a.py
│ │ │ │ └── CIBench_template_oracle_gen_fecda1.py
│ │ │ ├── CLUE_C3/
│ │ │ │ ├── CLUE_C3_gen.py
│ │ │ │ ├── CLUE_C3_gen_8c358f.py
│ │ │ │ ├── CLUE_C3_ppl.py
│ │ │ │ ├── CLUE_C3_ppl_56b537.py
│ │ │ │ └── CLUE_C3_ppl_e24a31.py
│ │ │ ├── CLUE_CMRC/
│ │ │ │ ├── CLUE_CMRC_gen.py
│ │ │ │ ├── CLUE_CMRC_gen_1bd3c8.py
│ │ │ │ ├── CLUE_CMRC_gen_3749cd.py
│ │ │ │ ├── CLUE_CMRC_gen_8484b9.py
│ │ │ │ └── CLUE_CMRC_gen_941108.py
│ │ │ ├── CLUE_DRCD/
│ │ │ │ ├── CLUE_DRCD_gen.py
│ │ │ │ ├── CLUE_DRCD_gen_1bd3c8.py
│ │ │ │ ├── CLUE_DRCD_gen_3749cd.py
│ │ │ │ ├── CLUE_DRCD_gen_8484b9.py
│ │ │ │ └── CLUE_DRCD_gen_941108.py
│ │ │ ├── CLUE_afqmc/
│ │ │ │ ├── CLUE_afqmc_gen.py
│ │ │ │ ├── CLUE_afqmc_gen_901306.py
│ │ │ │ ├── CLUE_afqmc_ppl.py
│ │ │ │ ├── CLUE_afqmc_ppl_378c5b.py
│ │ │ │ ├── CLUE_afqmc_ppl_6507d7.py
│ │ │ │ └── CLUE_afqmc_ppl_7b0c1e.py
│ │ │ ├── CLUE_cmnli/
│ │ │ │ ├── CLUE_cmnli_gen.py
│ │ │ │ ├── CLUE_cmnli_gen_1abf97.py
│ │ │ │ ├── CLUE_cmnli_gen_51e956.py
│ │ │ │ ├── CLUE_cmnli_ppl.py
│ │ │ │ ├── CLUE_cmnli_ppl_98dd6e.py
│ │ │ │ ├── CLUE_cmnli_ppl_ef69e7.py
│ │ │ │ └── CLUE_cmnli_ppl_fdc6de.py
│ │ │ ├── CLUE_ocnli/
│ │ │ │ ├── CLUE_ocnli_gen.py
│ │ │ │ ├── CLUE_ocnli_gen_51e956.py
│ │ │ │ ├── CLUE_ocnli_gen_c4cb6c.py
│ │ │ │ ├── CLUE_ocnli_ppl.py
│ │ │ │ ├── CLUE_ocnli_ppl_98dd6e.py
│ │ │ │ ├── CLUE_ocnli_ppl_ef69e7.py
│ │ │ │ └── CLUE_ocnli_ppl_fdc6de.py
│ │ │ ├── CMPhysBench/
│ │ │ │ ├── cmphysbench_gen.py
│ │ │ │ └── cmphysbench_rawprompt_gen.py
│ │ │ ├── ChemBench/
│ │ │ │ ├── ChemBench_gen.py
│ │ │ │ ├── ChemBench_gen_a9f753.py
│ │ │ │ ├── ChemBench_llmjudge_gen.py
│ │ │ │ ├── ChemBench_llmjudge_gen_c584cf.py
│ │ │ │ └── ChemBench_llmjudge_rawprompt_gen_fa3fc4.py
│ │ │ ├── ClimaQA/
│ │ │ │ ├── ClimaQA_Gold_llm_judge_gen.py
│ │ │ │ ├── ClimaQA_Gold_llm_judge_gen_f15343.py
│ │ │ │ ├── ClimaQA_Gold_llm_judge_rawprompt_gen_b3080f.py
│ │ │ │ ├── ClimaQA_Silver_llm_judge_gen.py
│ │ │ │ └── ClimaQA_Silver_llm_judge_gen_f15343.py
│ │ │ ├── ClinicBench/
│ │ │ │ ├── ClinicBench_llmjudge_gen.py
│ │ │ │ └── ClinicBench_llmjudge_gen_d09668.py
│ │ │ ├── Earth_Silver/
│ │ │ │ ├── Earth_Silver_gen.py
│ │ │ │ ├── Earth_Silver_llmjudge_gen.py
│ │ │ │ ├── Earth_Silver_llmjudge_gen_46140c.py
│ │ │ │ └── Earth_Silver_llmjudge_rawprompt_gen_a84bc6.py
│ │ │ ├── FewCLUE_bustm/
│ │ │ │ ├── FewCLUE_bustm_gen.py
│ │ │ │ ├── FewCLUE_bustm_gen_634f41.py
│ │ │ │ ├── FewCLUE_bustm_ppl.py
│ │ │ │ ├── FewCLUE_bustm_ppl_4b16c0.py
│ │ │ │ ├── FewCLUE_bustm_ppl_9ef540.py
│ │ │ │ └── FewCLUE_bustm_ppl_e53034.py
│ │ │ ├── FewCLUE_chid/
│ │ │ │ ├── FewCLUE_chid_gen.py
│ │ │ │ ├── FewCLUE_chid_gen_0a29a2.py
│ │ │ │ ├── FewCLUE_chid_ppl.py
│ │ │ │ ├── FewCLUE_chid_ppl_8f2872.py
│ │ │ │ └── FewCLUE_chid_ppl_acccb5.py
│ │ │ ├── FewCLUE_cluewsc/
│ │ │ │ ├── FewCLUE_cluewsc_gen.py
│ │ │ │ ├── FewCLUE_cluewsc_gen_c68933.py
│ │ │ │ ├── FewCLUE_cluewsc_ppl.py
│ │ │ │ ├── FewCLUE_cluewsc_ppl_12e4e0.py
│ │ │ │ ├── FewCLUE_cluewsc_ppl_4284a0.py
│ │ │ │ └── FewCLUE_cluewsc_ppl_868415.py
│ │ │ ├── FewCLUE_csl/
│ │ │ │ ├── FewCLUE_csl_gen.py
│ │ │ │ ├── FewCLUE_csl_gen_28b223.py
│ │ │ │ ├── FewCLUE_csl_gen_87f4a8.py
│ │ │ │ ├── FewCLUE_csl_ppl.py
│ │ │ │ ├── FewCLUE_csl_ppl_769f8d.py
│ │ │ │ └── FewCLUE_csl_ppl_841b62.py
│ │ │ ├── FewCLUE_eprstmt/
│ │ │ │ ├── FewCLUE_eprstmt_gen.py
│ │ │ │ ├── FewCLUE_eprstmt_gen_740ea0.py
│ │ │ │ ├── FewCLUE_eprstmt_ppl.py
│ │ │ │ ├── FewCLUE_eprstmt_ppl_1ce587.py
│ │ │ │ └── FewCLUE_eprstmt_ppl_f1e631.py
│ │ │ ├── FewCLUE_ocnli_fc/
│ │ │ │ ├── FewCLUE_ocnli_fc_gen.py
│ │ │ │ ├── FewCLUE_ocnli_fc_gen_f97a97.py
│ │ │ │ ├── FewCLUE_ocnli_fc_ppl.py
│ │ │ │ ├── FewCLUE_ocnli_fc_ppl_9e8b3d.py
│ │ │ │ └── FewCLUE_ocnli_fc_ppl_c08300.py
│ │ │ ├── FewCLUE_tnews/
│ │ │ │ ├── FewCLUE_tnews_gen.py
│ │ │ │ ├── FewCLUE_tnews_gen_b90e4a.py
│ │ │ │ ├── FewCLUE_tnews_ppl.py
│ │ │ │ ├── FewCLUE_tnews_ppl_7d1c07.py
│ │ │ │ ├── FewCLUE_tnews_ppl_d10e8a.py
│ │ │ │ └── FewCLUE_tnews_ppl_fff486.py
│ │ │ ├── FinanceIQ/
│ │ │ │ ├── FinanceIQ_gen.py
│ │ │ │ ├── FinanceIQ_gen_e0e6b5.py
│ │ │ │ ├── FinanceIQ_ppl.py
│ │ │ │ └── FinanceIQ_ppl_42b9bd.py
│ │ │ ├── GLUE_CoLA/
│ │ │ │ ├── GLUE_CoLA_ppl.py
│ │ │ │ └── GLUE_CoLA_ppl_77d0df.py
│ │ │ ├── GLUE_MRPC/
│ │ │ │ ├── GLUE_MRPC_ppl.py
│ │ │ │ └── GLUE_MRPC_ppl_96564c.py
│ │ │ ├── GLUE_QQP/
│ │ │ │ ├── GLUE_QQP_ppl.py
│ │ │ │ └── GLUE_QQP_ppl_250d00.py
│ │ │ ├── GaokaoBench/
│ │ │ │ ├── GaokaoBench_gen.py
│ │ │ │ ├── GaokaoBench_gen_5cfe9e.py
│ │ │ │ ├── GaokaoBench_mixed.py
│ │ │ │ ├── GaokaoBench_mixed_9af5ee.py
│ │ │ │ ├── GaokaoBench_no_subjective_gen_4c31db.py
│ │ │ │ ├── GaokaoBench_no_subjective_gen_d16acb.py
│ │ │ │ ├── GaokaoBench_no_subjective_gen_d21e37.py
│ │ │ │ ├── GaokaoBench_prompts.py
│ │ │ │ └── README.md
│ │ │ ├── HLE/
│ │ │ │ ├── hle_biomed_llm_verify_gen_6ff468.py
│ │ │ │ ├── hle_gen.py
│ │ │ │ ├── hle_llmverify_academic.py
│ │ │ │ ├── hle_llmverify_gen_6ff468.py
│ │ │ │ └── hle_llmverify_rawprompt_gen_0970dd.py
│ │ │ ├── HealthBench/
│ │ │ │ └── healthbench_gen_831613.py
│ │ │ ├── IFBench/
│ │ │ │ ├── IFBench_gen.py
│ │ │ │ └── IFBench_rawprompt_gen.py
│ │ │ ├── IFEval/
│ │ │ │ ├── IFEval.md
│ │ │ │ ├── IFEval_gen.py
│ │ │ │ ├── IFEval_gen_3321a3.py
│ │ │ │ ├── IFEval_gen_353ae7.py
│ │ │ │ ├── IFEval_rawprompt_gen_e7f781.py
│ │ │ │ └── README.md
│ │ │ ├── LCBench/
│ │ │ │ ├── README.md
│ │ │ │ ├── lcbench_gen.py
│ │ │ │ ├── lcbench_gen_5ff288.py
│ │ │ │ ├── lcbench_levels_gen_bb665f.py
│ │ │ │ ├── lcbench_repeat10_gen.py
│ │ │ │ └── lcbench_repeat10_gen_5ff288.py
│ │ │ ├── MMLUArabic/
│ │ │ │ ├── MMLUArabic_gen.py
│ │ │ │ ├── MMLUArabic_gen_326684.py
│ │ │ │ ├── MMLUArabic_ppl.py
│ │ │ │ ├── MMLUArabic_ppl_d2333a.py
│ │ │ │ ├── MMLUArabic_zero_shot_gen.py
│ │ │ │ ├── MMLUArabic_zero_shot_gen_3523e0.py
│ │ │ │ └── README.md
│ │ │ ├── MathBench/
│ │ │ │ ├── deprecated_mathbench_2024_gen_de9ff9.py
│ │ │ │ ├── deprecated_mathbench_agent_gen_48ec47.py
│ │ │ │ ├── deprecated_mathbench_agent_gen_fbe13b.py
│ │ │ │ ├── deprecated_mathbench_arith_gen_ccd638.py
│ │ │ │ ├── deprecated_mathbench_cot_gen_66f329.py
│ │ │ │ ├── deprecated_mathbench_gen_7b734b.py
│ │ │ │ ├── mathbench_2024_few_shot_mixed_4a3fd4.py
│ │ │ │ ├── mathbench_2024_gen_19e486.py
│ │ │ │ ├── mathbench_2024_gen_1dc21d.py
│ │ │ │ ├── mathbench_2024_gen_4b8f28.py
│ │ │ │ ├── mathbench_2024_gen_50a320.py
│ │ │ │ ├── mathbench_2024_gen_fc2a24.py
│ │ │ │ ├── mathbench_2024_wocircular_gen_1dc21d.py
│ │ │ │ ├── mathbench_2024_wocircular_mixed_8eb12b.py
│ │ │ │ ├── mathbench_gen.py
│ │ │ │ └── mathbench_prompt.py
│ │ │ ├── MedBench/
│ │ │ │ ├── medbench_gen.py
│ │ │ │ └── medbench_gen_0b4fff.py
│ │ │ ├── MedCalc_Bench/
│ │ │ │ └── MedCalcBench_official_gen_a5155f.py
│ │ │ ├── MedQA/
│ │ │ │ ├── MedQA_gen_3bf756.py
│ │ │ │ └── MedQA_llmjudge_gen_3bf756.py
│ │ │ ├── MedXpertQA/
│ │ │ │ ├── MedXpertQA_gen.py
│ │ │ │ ├── MedXpertQA_llmjudge_gen.py
│ │ │ │ └── MedXpertQA_llmjudge_rawprompt_gen.py
│ │ │ ├── Medbullets/
│ │ │ │ ├── medbullets_gen.py
│ │ │ │ ├── medbullets_gen_60c8f5.py
│ │ │ │ ├── medbullets_llmjudge_gen.py
│ │ │ │ └── medbullets_llmjudge_gen_60c8f5.py
│ │ │ ├── MolInstructions_chem/
│ │ │ │ ├── mol_instructions_chem_gen.py
│ │ │ │ └── mol_instructions_chem_rawprompt_gen.py
│ │ │ ├── NPHardEval/
│ │ │ │ ├── NPHardEval_gen.py
│ │ │ │ ├── NPHardEval_gen_22aac5.py
│ │ │ │ └── README.md
│ │ │ ├── OlymMATH/
│ │ │ │ ├── README.md
│ │ │ │ ├── olymmath_cascade_eval_gen_97b203.py
│ │ │ │ ├── olymmath_llm_judeg_gen.py
│ │ │ │ ├── olymmath_llmverify_gen_97b203.py
│ │ │ │ └── olymmath_llmverify_rawprompt_gen_9d3a8e.py
│ │ │ ├── OlympiadBench/
│ │ │ │ ├── OlympiadBenchMath_0shot_llmverify_gen_9c22f2.py
│ │ │ │ ├── OlympiadBench_0shot_cascade_eval_gen_be8b13.py
│ │ │ │ ├── OlympiadBench_0shot_gen_be8b13.py
│ │ │ │ ├── OlympiadBench_0shot_llmverify_gen_be8b13.py
│ │ │ │ ├── OlympiadBench_0shot_llmverify_rawprompt_gen_be8b13.py
│ │ │ │ └── OlympiadBench_categories.py
│ │ │ ├── OpenFinData/
│ │ │ │ ├── OpenFinData_gen.py
│ │ │ │ ├── OpenFinData_gen_46dedb.py
│ │ │ │ └── README.md
│ │ │ ├── PHYBench/
│ │ │ │ ├── phybench_gen.py
│ │ │ │ └── phybench_rawprompt_gen.py
│ │ │ ├── PHYSICS/
│ │ │ │ ├── PHYSICS_llm_judge_gen.py
│ │ │ │ ├── PHYSICS_llm_judge_gen_a133a2.py
│ │ │ │ └── PHYSICS_llm_judge_rawprompt_gen_56ebc8.py
│ │ │ ├── PI_LLM/
│ │ │ │ ├── README.md
│ │ │ │ └── pi_llm_gen.py
│ │ │ ├── PJExam/
│ │ │ │ ├── PJExam_gen.py
│ │ │ │ └── PJExam_gen_8cd97c.py
│ │ │ ├── PMMEval/
│ │ │ │ ├── flores_gen.py
│ │ │ │ ├── flores_gen_2697d7.py
│ │ │ │ ├── humanevalxl_gen.py
│ │ │ │ ├── humanevalxl_gen_bdec92.py
│ │ │ │ ├── mgsm_gen.py
│ │ │ │ ├── mgsm_gen_679720.py
│ │ │ │ ├── mhellaswag_gen.py
│ │ │ │ ├── mhellaswag_gen_1a6b73.py
│ │ │ │ ├── mifeval_gen.py
│ │ │ │ ├── mifeval_gen_79f8fb.py
│ │ │ │ ├── mlogiqa_gen.py
│ │ │ │ ├── mlogiqa_gen_36c4f9.py
│ │ │ │ ├── mmmlu_gen.py
│ │ │ │ ├── mmmlu_gen_d5017d.py
│ │ │ │ ├── pmmeval_gen.py
│ │ │ │ ├── xnli_gen.py
│ │ │ │ └── xnli_gen_973734.py
│ │ │ ├── ProcessBench/
│ │ │ │ ├── README.md
│ │ │ │ └── processbench_gen.py
│ │ │ ├── ProteinLMBench/
│ │ │ │ ├── ProteinLMBench_gen_a67965.py
│ │ │ │ ├── ProteinLMBench_llmjudge_gen_a67965.py
│ │ │ │ └── ProteinLMBench_llmjudge_rawprompt_gen_9627a6.py
│ │ │ ├── PubMedQA/
│ │ │ │ ├── PubMedQA_llmjudge_gen.py
│ │ │ │ └── PubMedQA_llmjudge_gen_f00302.py
│ │ │ ├── QuALITY/
│ │ │ │ ├── QuALITY.md
│ │ │ │ ├── QuALITY_gen.py
│ │ │ │ └── QuALITY_gen_c407cb.py
│ │ │ ├── R_Bench/
│ │ │ │ ├── R-Bench.md
│ │ │ │ ├── rbench_gen_544610.py
│ │ │ │ ├── rbench_llmjudge_gen_c89350.py
│ │ │ │ └── rbench_llmjudge_rawprompt_gen_c24221.py
│ │ │ ├── SVAMP/
│ │ │ │ ├── svamp_gen.py
│ │ │ │ └── svamp_gen_fb25e4.py
│ │ │ ├── SciEval/
│ │ │ │ ├── SciEval_5shot_gen_4043d4.py
│ │ │ │ ├── SciEval_5shot_llmjudge_gen_b7b684.py
│ │ │ │ └── SciEval_lifescience_sets.py
│ │ │ ├── SciKnowEval/
│ │ │ │ ├── SciKnowEval_gen_ebe47d.py
│ │ │ │ └── SciKnowEval_llmjudge_gen_ebe47d.py
│ │ │ ├── SciReasoner/
│ │ │ │ ├── GUE_gen.py
│ │ │ │ ├── LLM4Mat_gen.py
│ │ │ │ ├── UMG.py
│ │ │ │ ├── UPG.py
│ │ │ │ ├── bio_instruction_gen.py
│ │ │ │ ├── bulk_modulus_material_gen.py
│ │ │ │ ├── composition_material_gen.py
│ │ │ │ ├── mol_biotext_gen.py
│ │ │ │ ├── mol_molecule_gen.py
│ │ │ │ ├── mol_protein_gen.py
│ │ │ │ ├── opi_gen.py
│ │ │ │ ├── peer_gen.py
│ │ │ │ ├── retrosynthesis_USPTO_gen.py
│ │ │ │ ├── scireasoner_gen.py
│ │ │ │ ├── smol_gen.py
│ │ │ │ ├── unconditional_RNA_gen.py
│ │ │ │ └── unconditional_material_gen.py
│ │ │ ├── ScienceQA/
│ │ │ │ ├── ScienceQA_llmjudge_gen.py
│ │ │ │ └── ScienceQA_llmjudge_gen_f00302.py
│ │ │ ├── SeedBench/
│ │ │ │ ├── README.md
│ │ │ │ ├── seedbench_gen.py
│ │ │ │ └── seedbench_gen_5d5ea1.py
│ │ │ ├── SimpleQA/
│ │ │ │ ├── README.md
│ │ │ │ ├── simpleqa_gen.py
│ │ │ │ └── simpleqa_gen_0283c3.py
│ │ │ ├── SmolInstruct/
│ │ │ │ ├── smolinstruct_0shot_instruct_gen.py
│ │ │ │ ├── smolinstruct_0shot_instruct_rawprompt_gen.py
│ │ │ │ ├── smolinstruct_fts_0shot_instruct.py
│ │ │ │ ├── smolinstruct_fts_0shot_rawprompt_instruct.py
│ │ │ │ ├── smolinstruct_fts_gen_5774b5.py
│ │ │ │ ├── smolinstruct_gen.py
│ │ │ │ ├── smolinstruct_meteor_0shot_instruct.py
│ │ │ │ ├── smolinstruct_meteor_0shot_rawprompt_instruct.py
│ │ │ │ ├── smolinstruct_meteor_gen_065150.py
│ │ │ │ ├── smolinstruct_nc_0shot_instruct.py
│ │ │ │ ├── smolinstruct_nc_0shot_rawprompt_instruct.py
│ │ │ │ ├── smolinstruct_nc_gen_c84c18.py
│ │ │ │ ├── smolinstruct_pp_acc_0_shot_instruct.py
│ │ │ │ ├── smolinstruct_pp_acc_0_shot_rawprompt_instruct.py
│ │ │ │ ├── smolinstruct_pp_acc_gen_8607a3.py
│ │ │ │ ├── smolinstruct_rmse_0shot_instruct.py
│ │ │ │ ├── smolinstruct_rmse_0shot_rawprompt_instruct.py
│ │ │ │ └── smolinstruct_rmse_gen_0fcc6b.py
│ │ │ ├── SuperGLUE_AX_b/
│ │ │ │ ├── SuperGLUE_AX_b_gen.py
│ │ │ │ ├── SuperGLUE_AX_b_gen_4dfefa.py
│ │ │ │ ├── SuperGLUE_AX_b_ppl.py
│ │ │ │ ├── SuperGLUE_AX_b_ppl_0748aa.py
│ │ │ │ └── SuperGLUE_AX_b_ppl_6db806.py
│ │ │ ├── SuperGLUE_AX_g/
│ │ │ │ ├── SuperGLUE_AX_g_gen.py
│ │ │ │ ├── SuperGLUE_AX_g_gen_68aac7.py
│ │ │ │ ├── SuperGLUE_AX_g_ppl.py
│ │ │ │ ├── SuperGLUE_AX_g_ppl_50f8f6.py
│ │ │ │ └── SuperGLUE_AX_g_ppl_66caf3.py
│ │ │ ├── SuperGLUE_BoolQ/
│ │ │ │ ├── SuperGLUE_BoolQ_cot_gen_1d56df.py
│ │ │ │ ├── SuperGLUE_BoolQ_few_shot_gen_ba58ea.py
│ │ │ │ ├── SuperGLUE_BoolQ_few_shot_ppl.py
│ │ │ │ ├── SuperGLUE_BoolQ_gen.py
│ │ │ │ ├── SuperGLUE_BoolQ_gen_883d50.py
│ │ │ │ ├── SuperGLUE_BoolQ_ppl.py
│ │ │ │ ├── SuperGLUE_BoolQ_ppl_16b1d9.py
│ │ │ │ ├── SuperGLUE_BoolQ_ppl_314797.py
│ │ │ │ ├── SuperGLUE_BoolQ_ppl_314b96.py
│ │ │ │ ├── SuperGLUE_BoolQ_ppl_4da4db.py
│ │ │ │ └── SuperGLUE_BoolQ_ppl_9619db.py
│ │ │ ├── SuperGLUE_CB/
│ │ │ │ ├── SuperGLUE_CB_gen.py
│ │ │ │ ├── SuperGLUE_CB_gen_854c6c.py
│ │ │ │ ├── SuperGLUE_CB_ppl.py
│ │ │ │ ├── SuperGLUE_CB_ppl_0143fe.py
│ │ │ │ └── SuperGLUE_CB_ppl_11c175.py
│ │ │ ├── SuperGLUE_COPA/
│ │ │ │ ├── SuperGLUE_COPA_gen.py
│ │ │ │ ├── SuperGLUE_COPA_gen_91ca53.py
│ │ │ │ ├── SuperGLUE_COPA_ppl.py
│ │ │ │ ├── SuperGLUE_COPA_ppl_54058d.py
│ │ │ │ ├── SuperGLUE_COPA_ppl_5c24f1.py
│ │ │ │ └── SuperGLUE_COPA_ppl_9f3618.py
│ │ │ ├── SuperGLUE_MultiRC/
│ │ │ │ ├── SuperGLUE_MultiRC_gen.py
│ │ │ │ ├── SuperGLUE_MultiRC_gen_27071f.py
│ │ │ │ ├── SuperGLUE_MultiRC_ppl.py
│ │ │ │ ├── SuperGLUE_MultiRC_ppl_866273.py
│ │ │ │ └── SuperGLUE_MultiRC_ppl_ced824.py
│ │ │ ├── SuperGLUE_RTE/
│ │ │ │ ├── SuperGLUE_RTE_gen.py
│ │ │ │ ├── SuperGLUE_RTE_gen_68aac7.py
│ │ │ │ ├── SuperGLUE_RTE_ppl.py
│ │ │ │ ├── SuperGLUE_RTE_ppl_50f8f6.py
│ │ │ │ └── SuperGLUE_RTE_ppl_66caf3.py
│ │ │ ├── SuperGLUE_ReCoRD/
│ │ │ │ ├── SuperGLUE_ReCoRD_gen.py
│ │ │ │ ├── SuperGLUE_ReCoRD_gen_0f7784.py
│ │ │ │ ├── SuperGLUE_ReCoRD_gen_30dea0.py
│ │ │ │ └── SuperGLUE_ReCoRD_gen_a69961.py
│ │ │ ├── SuperGLUE_WSC/
│ │ │ │ ├── SuperGLUE_WSC_gen.py
│ │ │ │ ├── SuperGLUE_WSC_gen_7902a7.py
│ │ │ │ ├── SuperGLUE_WSC_gen_fe4bf3.py
│ │ │ │ ├── SuperGLUE_WSC_ppl.py
│ │ │ │ ├── SuperGLUE_WSC_ppl_003529.py
│ │ │ │ ├── SuperGLUE_WSC_ppl_1c4a90.py
│ │ │ │ ├── SuperGLUE_WSC_ppl_d0f531.py
│ │ │ │ └── SuperGLUE_WSC_ppl_f37e78.py
│ │ │ ├── SuperGLUE_WiC/
│ │ │ │ ├── SuperGLUE_WiC_gen.py
│ │ │ │ ├── SuperGLUE_WiC_gen_d06864.py
│ │ │ │ ├── SuperGLUE_WiC_ppl.py
│ │ │ │ ├── SuperGLUE_WiC_ppl_312de9.py
│ │ │ │ ├── SuperGLUE_WiC_ppl_3fb6fd.py
│ │ │ │ └── SuperGLUE_WiC_ppl_c926be.py
│ │ │ ├── TabMWP/
│ │ │ │ ├── TabMWP_gen.py
│ │ │ │ └── TabMWP_gen_2aef96.py
│ │ │ ├── TheoremQA/
│ │ │ │ ├── README.md
│ │ │ │ ├── TheoremQA_5shot_gen_6f0af8.py
│ │ │ │ ├── TheoremQA_few_shot_examples.py
│ │ │ │ ├── TheoremQA_few_shot_examples_official.py
│ │ │ │ ├── TheoremQA_gen.py
│ │ │ │ ├── ThroremQA_0shot_cot_gen_8acdf7.py
│ │ │ │ ├── deprecated_TheoremQA_gen_424e0a.py
│ │ │ │ ├── deprecated_TheoremQA_gen_7009de.py
│ │ │ │ ├── deprecated_TheoremQA_gen_ef26ca.py
│ │ │ │ ├── deprecated_TheoremQA_post_v2_gen_2c2583.py
│ │ │ │ └── deprecated_TheoremQA_post_v2_gen_ef26ca.py
│ │ │ ├── XCOPA/
│ │ │ │ ├── XCOPA_ppl.py
│ │ │ │ └── XCOPA_ppl_54058d.py
│ │ │ ├── XLSum/
│ │ │ │ ├── XLSum_gen.py
│ │ │ │ └── XLSum_gen_2bb71c.py
│ │ │ ├── Xsum/
│ │ │ │ ├── Xsum_gen.py
│ │ │ │ ├── Xsum_gen_31397e.py
│ │ │ │ └── Xsum_gen_8ea5f8.py
│ │ │ ├── adv_glue/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adv_glue_mnli/
│ │ │ │ │ ├── adv_glue_mnli_gen.py
│ │ │ │ │ └── adv_glue_mnli_gen_bd8ef0.py
│ │ │ │ ├── adv_glue_mnli_mm/
│ │ │ │ │ ├── adv_glue_mnli_mm_gen.py
│ │ │ │ │ └── adv_glue_mnli_mm_gen_bd8ef0.py
│ │ │ │ ├── adv_glue_qnli/
│ │ │ │ │ ├── adv_glue_qnli_gen.py
│ │ │ │ │ └── adv_glue_qnli_gen_0b7326.py
│ │ │ │ ├── adv_glue_qqp/
│ │ │ │ │ ├── adv_glue_qqp_gen.py
│ │ │ │ │ └── adv_glue_qqp_gen_cdc277.py
│ │ │ │ ├── adv_glue_rte/
│ │ │ │ │ ├── adv_glue_rte_gen.py
│ │ │ │ │ └── adv_glue_rte_gen_8cc547.py
│ │ │ │ └── adv_glue_sst2/
│ │ │ │ ├── adv_glue_sst2_gen.py
│ │ │ │ └── adv_glue_sst2_gen_ee8d3b.py
│ │ │ ├── agieval/
│ │ │ │ ├── agieval_gen.py
│ │ │ │ ├── agieval_gen_397d81.py
│ │ │ │ ├── agieval_gen_617738.py
│ │ │ │ ├── agieval_gen_64afd3.py
│ │ │ │ ├── agieval_gen_a0c741.py
│ │ │ │ ├── agieval_mixed.py
│ │ │ │ └── agieval_mixed_0fa998.py
│ │ │ ├── aime2024/
│ │ │ │ ├── README.md
│ │ │ │ ├── aime2024_0shot_nocot_gen_2b9dc2.py
│ │ │ │ ├── aime2024_0shot_nocot_genericllmeval_academic_gen.py
│ │ │ │ ├── aime2024_0shot_nocot_genericllmeval_gen_2b9dc2.py
│ │ │ │ ├── aime2024_cascade_eval_gen_5e9f4f.py
│ │ │ │ ├── aime2024_cascade_eval_rawprompt_gen_2f2c96.py
│ │ │ │ ├── aime2024_gen.py
│ │ │ │ ├── aime2024_gen_17d799.py
│ │ │ │ ├── aime2024_gen_6e39a4.py
│ │ │ │ ├── aime2024_llmjudge_gen.py
│ │ │ │ ├── aime2024_llmjudge_gen_5e9f4f.py
│ │ │ │ ├── aime2024_llmverify_repeat16_gen_bf7475.py
│ │ │ │ └── aime2024_llmverify_repeat8_gen_e8fcee.py
│ │ │ ├── aime2025/
│ │ │ │ ├── aime2025_cascade_eval_gen_5e9f4f.py
│ │ │ │ ├── aime2025_cascade_eval_rawprompt_gen_2f2c96.py
│ │ │ │ ├── aime2025_llmjudge_academic.py
│ │ │ │ └── aime2025_llmjudge_gen_5e9f4f.py
│ │ │ ├── aime2026/
│ │ │ │ ├── aime2026_cascade_eval_gen_6ff468.py
│ │ │ │ └── aime2026_cascade_eval_rawprompt_gen_0970dd.py
│ │ │ ├── anli/
│ │ │ │ ├── anli_gen.py
│ │ │ │ ├── anli_gen_fc7328.py
│ │ │ │ ├── anli_ppl.py
│ │ │ │ └── anli_ppl_1d290e.py
│ │ │ ├── anthropics_evals/
│ │ │ │ ├── airisk_gen.py
│ │ │ │ ├── airisk_gen_ba66fc.py
│ │ │ │ ├── persona_gen.py
│ │ │ │ ├── persona_gen_cc72e2.py
│ │ │ │ ├── sycophancy_gen.py
│ │ │ │ └── sycophancy_gen_4bba45.py
│ │ │ ├── apps/
│ │ │ │ ├── README.md
│ │ │ │ ├── apps_gen.py
│ │ │ │ ├── apps_gen_c7893a.py
│ │ │ │ ├── apps_mini_gen.py
│ │ │ │ ├── apps_mini_gen_c7893a.py
│ │ │ │ ├── deprecated_apps_gen_5b4254.py
│ │ │ │ ├── deprecated_apps_gen_7fbb95.py
│ │ │ │ └── deprecated_apps_gen_b4dee3.py
│ │ │ ├── atlas/
│ │ │ │ ├── README.md
│ │ │ │ ├── atlas_gen.py
│ │ │ │ ├── atlas_val_gen_b2d1b6.py
│ │ │ │ └── atlas_val_rawprompt_gen_277bee.py
│ │ │ ├── babilong/
│ │ │ │ ├── README.md
│ │ │ │ ├── babilong_0k_gen.py
│ │ │ │ ├── babilong_128k_gen.py
│ │ │ │ ├── babilong_16k_gen.py
│ │ │ │ ├── babilong_1m_gen.py
│ │ │ │ ├── babilong_256k_gen.py
│ │ │ │ ├── babilong_2k_gen.py
│ │ │ │ ├── babilong_32k_gen.py
│ │ │ │ └── babilong_4k_gen.py
│ │ │ ├── bbeh/
│ │ │ │ ├── README.md
│ │ │ │ ├── bbeh_gen.py
│ │ │ │ ├── bbeh_llmjudge_gen_86c3a0.py
│ │ │ │ └── bbeh_llmjudge_rawprompt_gen_36b5f4.py
│ │ │ ├── bbh/
│ │ │ │ ├── README.md
│ │ │ │ ├── bbh_0shot_nocot_academic_gen.py
│ │ │ │ ├── bbh_0shot_nocot_gen_925fc4.py
│ │ │ │ ├── bbh_0shot_nocot_gen_9c32f6.py
│ │ │ │ ├── bbh_0shot_nocot_gen_ea7952.py
│ │ │ │ ├── bbh_gen.py
│ │ │ │ ├── bbh_gen_2879b0.py
│ │ │ │ ├── bbh_gen_4a31fa.py
│ │ │ │ ├── bbh_gen_5b92b0.py
│ │ │ │ ├── bbh_gen_5bf00b.py
│ │ │ │ ├── bbh_gen_98fba6.py
│ │ │ │ ├── bbh_gen_ee62e9.py
│ │ │ │ ├── bbh_llm_judge_gen.py
│ │ │ │ ├── bbh_llmjudge_gen_b5bdf1.py
│ │ │ │ └── bbh_subset_settings.py
│ │ │ ├── bigcodebench/
│ │ │ │ ├── bigcodebench_full_complete_gen.py
│ │ │ │ ├── bigcodebench_full_complete_gen_faf748.py
│ │ │ │ ├── bigcodebench_full_instruct_gen.py
│ │ │ │ ├── bigcodebench_full_instruct_gen_8815eb.py
│ │ │ │ ├── bigcodebench_full_instruct_repeat_gen_c3d5ad.py
│ │ │ │ ├── bigcodebench_gen.py
│ │ │ │ ├── bigcodebench_hard_complete_gen.py
│ │ │ │ ├── bigcodebench_hard_complete_gen_2888d3.py
│ │ │ │ ├── bigcodebench_hard_complete_gen_faf748.py
│ │ │ │ ├── bigcodebench_hard_complete_rawprompt_gen_95140b.py
│ │ │ │ ├── bigcodebench_hard_instruct_gen.py
│ │ │ │ ├── bigcodebench_hard_instruct_gen_8815eb.py
│ │ │ │ ├── bigcodebench_hard_instruct_gen_c3d5ad.py
│ │ │ │ ├── bigcodebench_hard_instruct_rawprompt_gen_5cbb9f.py
│ │ │ │ └── bigcodebench_hard_instruct_repeat_gen_c3d5ad.py
│ │ │ ├── biodata/
│ │ │ │ ├── biodata_task_gen.py
│ │ │ │ └── biodata_task_rawprompt_gen.py
│ │ │ ├── calm/
│ │ │ │ ├── README.md
│ │ │ │ └── calm.py
│ │ │ ├── ceval/
│ │ │ │ ├── README.md
│ │ │ │ ├── ceval_clean_ppl.py
│ │ │ │ ├── ceval_gen.py
│ │ │ │ ├── ceval_gen_2daf24.py
│ │ │ │ ├── ceval_gen_5f30c7.py
│ │ │ │ ├── ceval_internal_ppl_1cd8bf.py
│ │ │ │ ├── ceval_internal_ppl_93e5ce.py
│ │ │ │ ├── ceval_llm_judge_gen_a162f0.py
│ │ │ │ ├── ceval_ppl.py
│ │ │ │ ├── ceval_ppl_1cd8bf.py
│ │ │ │ ├── ceval_ppl_578f8d.py
│ │ │ │ ├── ceval_ppl_93e5ce.py
│ │ │ │ └── ceval_zero_shot_gen_bd40ef.py
│ │ │ ├── chatobj_custom/
│ │ │ │ └── chatobj_custom_gen.py
│ │ │ ├── chem_exam/
│ │ │ │ ├── competition_gen.py
│ │ │ │ ├── competition_rawprompt_gen.py
│ │ │ │ ├── gaokao_gen.py
│ │ │ │ └── gaokao_rawprompt_gen.py
│ │ │ ├── chinese_simpleqa/
│ │ │ │ ├── README.md
│ │ │ │ └── chinese_simpleqa_gen.py
│ │ │ ├── civilcomments/
│ │ │ │ ├── civilcomments_clp.py
│ │ │ │ ├── civilcomments_clp_6a2561.py
│ │ │ │ └── civilcomments_clp_a3c5fd.py
│ │ │ ├── clozeTest_maxmin/
│ │ │ │ ├── clozeTest_maxmin_gen.py
│ │ │ │ └── clozeTest_maxmin_gen_c205fb.py
│ │ │ ├── cmb/
│ │ │ │ ├── cmb_gen.py
│ │ │ │ └── cmb_gen_dfb5c4.py
│ │ │ ├── cmmlu/
│ │ │ │ ├── cmmlu_0shot_cot_gen_305931.py
│ │ │ │ ├── cmmlu_0shot_nocot_llmjudge_gen_e1cd9a.py
│ │ │ │ ├── cmmlu_gen.py
│ │ │ │ ├── cmmlu_gen_c13365.py
│ │ │ │ ├── cmmlu_llm_judge_gen.py
│ │ │ │ ├── cmmlu_llmjudge_gen_e1cd9a.py
│ │ │ │ ├── cmmlu_llmjudge_rawprompt_gen_9f9c31.py
│ │ │ │ ├── cmmlu_ppl.py
│ │ │ │ ├── cmmlu_ppl_041cbf.py
│ │ │ │ ├── cmmlu_ppl_8b9c76.py
│ │ │ │ ├── cmmlu_stem_0shot_nocot_gen_3653db.py
│ │ │ │ ├── cmmlu_stem_0shot_nocot_llmjudge_gen_3653db.py
│ │ │ │ └── cmmlu_stem_0shot_nocot_xml_gen_3653db.py
│ │ │ ├── cmo_fib/
│ │ │ │ ├── README.md
│ │ │ │ ├── cmo_fib_0shot_notcot_gen_4c6c29.py
│ │ │ │ ├── cmo_fib_gen.py
│ │ │ │ ├── cmo_fib_gen_2783e5.py
│ │ │ │ └── cmo_fib_gen_ace24b.py
│ │ │ ├── codecompass/
│ │ │ │ └── codecompass_gen_079a6c.py
│ │ │ ├── collections/
│ │ │ │ ├── base_core.py
│ │ │ │ ├── base_medium.py
│ │ │ │ ├── base_medium_llama.py
│ │ │ │ ├── base_small.py
│ │ │ │ ├── chat_core.py
│ │ │ │ ├── chat_medium.py
│ │ │ │ ├── chat_small.py
│ │ │ │ ├── example.py
│ │ │ │ └── leaderboard/
│ │ │ │ ├── qwen.py
│ │ │ │ └── qwen_chat.py
│ │ │ ├── commonsenseqa/
│ │ │ │ ├── commonsenseqa_7shot_cot_gen_734a22.py
│ │ │ │ ├── commonsenseqa_gen.py
│ │ │ │ ├── commonsenseqa_gen_1da2d0.py
│ │ │ │ ├── commonsenseqa_gen_c946f2.py
│ │ │ │ ├── commonsenseqa_ppl.py
│ │ │ │ ├── commonsenseqa_ppl_3e9f2d.py
│ │ │ │ ├── commonsenseqa_ppl_5545e2.py
│ │ │ │ ├── commonsenseqa_ppl_716f78.py
│ │ │ │ ├── commonsenseqa_ppl_c49e77.py
│ │ │ │ └── commonsenseqa_ppl_e51e32.py
│ │ │ ├── commonsenseqa_cn/
│ │ │ │ ├── commonsenseqacn_gen.py
│ │ │ │ ├── commonsenseqacn_gen_d380d0.py
│ │ │ │ ├── commonsenseqacn_ppl.py
│ │ │ │ └── commonsenseqacn_ppl_971f48.py
│ │ │ ├── compassbench_20_v1_1/
│ │ │ │ ├── agent/
│ │ │ │ │ ├── cibench_template_gen_e6b12a.py
│ │ │ │ │ └── mus_teval_gen_105c48.py
│ │ │ │ ├── code/
│ │ │ │ │ └── compassbench_v1_1_code_gen_986f01.py
│ │ │ │ ├── knowledge/
│ │ │ │ │ └── compassbench_v1_knowledge_gen_bd74e0.py
│ │ │ │ ├── language/
│ │ │ │ │ └── compassbench_v1_language_gen_7aa06d.py
│ │ │ │ ├── math/
│ │ │ │ │ ├── compassbench_v1_1_math_gen_1dc21d.py
│ │ │ │ │ └── mathbench_prompt.py
│ │ │ │ └── reason/
│ │ │ │ └── compassbench_v1_reason_gen_d26d08.py
│ │ │ ├── compassbench_20_v1_1_public/
│ │ │ │ ├── agent/
│ │ │ │ │ ├── cibench_template_gen_e6b12a.py
│ │ │ │ │ └── mus_teval_gen_105c48.py
│ │ │ │ ├── code/
│ │ │ │ │ └── compassbench_v1_1_code_gen_986f01.py
│ │ │ │ ├── knowledge/
│ │ │ │ │ └── compassbench_v1_knowledge_gen_bd74e0.py
│ │ │ │ ├── language/
│ │ │ │ │ └── compassbench_v1_language_gen_7aa06d.py
│ │ │ │ ├── math/
│ │ │ │ │ ├── compassbench_v1_1_math_gen_1dc21d.py
│ │ │ │ │ └── mathbench_prompt.py
│ │ │ │ └── reason/
│ │ │ │ └── compassbench_v1_reason_gen_d26d08.py
│ │ │ ├── compassbench_v1_3/
│ │ │ │ ├── compassbench_v1_3_code_gen_c8c3aa.py
│ │ │ │ ├── compassbench_v1_3_knowledge.py
│ │ │ │ ├── compassbench_v1_3_math.py
│ │ │ │ ├── compassbench_v1_3_objective_gen.py
│ │ │ │ ├── compassbench_v1_3_objective_gen_068af0.py
│ │ │ │ └── compassbench_v1_3_prompt.py
│ │ │ ├── contamination/
│ │ │ │ ├── ceval_contamination_ppl_810ec6.py
│ │ │ │ ├── mbpp_contamination_ppl_f01cb6.py
│ │ │ │ └── mmlu_contamination_ppl_810ec6.py
│ │ │ ├── crowspairs/
│ │ │ │ ├── crowspairs_gen.py
│ │ │ │ ├── crowspairs_gen_02b6c1.py
│ │ │ │ ├── crowspairs_gen_381af0.py
│ │ │ │ ├── crowspairs_ppl.py
│ │ │ │ ├── crowspairs_ppl_47f211.py
│ │ │ │ └── crowspairs_ppl_e811e1.py
│ │ │ ├── crowspairs_cn/
│ │ │ │ ├── crowspairscn_gen.py
│ │ │ │ ├── crowspairscn_gen_556dc9.py
│ │ │ │ ├── crowspairscn_ppl.py
│ │ │ │ └── crowspairscn_ppl_f53575.py
│ │ │ ├── cvalues/
│ │ │ │ ├── cvalues_responsibility_gen.py
│ │ │ │ └── cvalues_responsibility_gen_543378.py
│ │ │ ├── demo/
│ │ │ │ ├── demo_cmmlu_base_ppl.py
│ │ │ │ ├── demo_cmmlu_chat_gen.py
│ │ │ │ ├── demo_gsm8k_base_gen.py
│ │ │ │ ├── demo_gsm8k_chat_gen.py
│ │ │ │ ├── demo_math_base_gen.py
│ │ │ │ └── demo_math_chat_gen.py
│ │ │ ├── dingo/
│ │ │ │ └── dingo_gen.py
│ │ │ ├── drop/
│ │ │ │ ├── deprecated_drop_gen_8a9ed9.py
│ │ │ │ ├── drop_examples.py
│ │ │ │ ├── drop_gen.py
│ │ │ │ ├── drop_gen_a2697c.py
│ │ │ │ ├── drop_gen_eb14af.py
│ │ │ │ ├── drop_llm_judge_gen.py
│ │ │ │ ├── drop_llmjudge_gen_3857b0.py
│ │ │ │ └── drop_openai_simple_evals_gen_3857b0.py
│ │ │ ├── ds1000/
│ │ │ │ ├── ds1000_compl_gen_cbc84f.py
│ │ │ │ ├── ds1000_compl_service_eval_gen_cbc84f.py
│ │ │ │ ├── ds1000_gen_5c4bec.py
│ │ │ │ ├── ds1000_gen_cbc84f.py
│ │ │ │ └── ds1000_service_eval_gen_cbc84f.py
│ │ │ ├── eese/
│ │ │ │ └── eese_llm_judge_gen.py
│ │ │ ├── flores/
│ │ │ │ ├── flores_gen.py
│ │ │ │ ├── flores_gen_806ede.py
│ │ │ │ └── flores_gen_aad4fd.py
│ │ │ ├── game24/
│ │ │ │ ├── game24_gen.py
│ │ │ │ └── game24_gen_52a460.py
│ │ │ ├── gaokao_math/
│ │ │ │ ├── README.md
│ │ │ │ └── gaokao_math_gen_f5fd28.py
│ │ │ ├── govrepcrs/
│ │ │ │ ├── govrepcrs_gen.py
│ │ │ │ ├── govrepcrs_gen_aa5eb3.py
│ │ │ │ └── govrepcrs_gen_db7930.py
│ │ │ ├── gpqa/
│ │ │ │ ├── README.md
│ │ │ │ ├── gpqa_0shot_nocot_gen_772ea0.py
│ │ │ │ ├── gpqa_0shot_nocot_genericllmeval_gen_772ea0.py
│ │ │ │ ├── gpqa_0shot_nocot_genericllmeval_xml_gen_772ea0.py
│ │ │ │ ├── gpqa_0shot_nocot_llmjudge_gen_772ea0.py
│ │ │ │ ├── gpqa_cascade_eval_academic.py
│ │ │ │ ├── gpqa_cascade_eval_gen_772ea0.py
│ │ │ │ ├── gpqa_cascade_eval_rawprompt_gen_706039.py
│ │ │ │ ├── gpqa_few_shot_ppl_4b5a83.py
│ │ │ │ ├── gpqa_gen.py
│ │ │ │ ├── gpqa_gen_015262.py
│ │ │ │ ├── gpqa_gen_4baadb.py
│ │ │ │ ├── gpqa_llm_judge_gen.py
│ │ │ │ ├── gpqa_openai_simple_evals_gen_5aeece.py
│ │ │ │ └── gpqa_ppl_6bf57a.py
│ │ │ ├── gsm8k/
│ │ │ │ ├── README.md
│ │ │ │ ├── deprecated_gsm8k_agent_gen_be1606.py
│ │ │ │ ├── gsm8k_0shot_gen_a58960.py
│ │ │ │ ├── gsm8k_0shot_nocot_gen_6cbf22.py
│ │ │ │ ├── gsm8k_0shot_v2_gen_17d799.py
│ │ │ │ ├── gsm8k_0shot_v2_gen_6e39a4.py
│ │ │ │ ├── gsm8k_0shot_v2_gen_a58960.py
│ │ │ │ ├── gsm8k_agent_gen_c3dff3.py
│ │ │ │ ├── gsm8k_gen.py
│ │ │ │ ├── gsm8k_gen_17d0dc.py
│ │ │ │ ├── gsm8k_gen_1d7fe4.py
│ │ │ │ ├── gsm8k_gen_1dce88.py
│ │ │ │ ├── gsm8k_gen_3309bd.py
│ │ │ │ ├── gsm8k_gen_57b0b1.py
│ │ │ │ ├── gsm8k_gen_701491.py
│ │ │ │ ├── gsm8k_gen_a3e34a.py
│ │ │ │ ├── gsm8k_gen_d6de81.py
│ │ │ │ ├── gsm8k_gen_e9e91e.py
│ │ │ │ ├── gsm8k_gen_ee684f.py
│ │ │ │ ├── gsm8k_model_postprocess_gen_a58960.py
│ │ │ │ └── gsm8k_xfinder_gen_a58960.py
│ │ │ ├── gsm8k_contamination/
│ │ │ │ └── gsm8k_contamination_ppl_ecdd22.py
│ │ │ ├── gsm_hard/
│ │ │ │ ├── gsmhard_gen.py
│ │ │ │ └── gsmhard_gen_8a1400.py
│ │ │ ├── hellaswag/
│ │ │ │ ├── README.md
│ │ │ │ ├── hellaswag_10shot_gen_e42710.py
│ │ │ │ ├── hellaswag_10shot_ppl_59c85e.py
│ │ │ │ ├── hellaswag_clean_ppl.py
│ │ │ │ ├── hellaswag_gen.py
│ │ │ │ ├── hellaswag_gen_6faab5.py
│ │ │ │ ├── hellaswag_llm_judge_gen.py
│ │ │ │ ├── hellaswag_llmjudge_gen_809ef1.py
│ │ │ │ ├── hellaswag_ppl.py
│ │ │ │ ├── hellaswag_ppl_47bff9.py
│ │ │ │ ├── hellaswag_ppl_7d7f2d.py
│ │ │ │ ├── hellaswag_ppl_9dbb12.py
│ │ │ │ └── hellaswag_ppl_a6e128.py
│ │ │ ├── hmmt2026/
│ │ │ │ ├── hmmt2026_cascade_eval_gen_6ff468.py
│ │ │ │ └── hmmt2026_cascade_eval_rawprompt_gen_0970dd.py
│ │ │ ├── humaneval/
│ │ │ │ ├── README.md
│ │ │ │ ├── deprecated_humaneval_gen_4a6eef.py
│ │ │ │ ├── deprecated_humaneval_gen_6d1cc2.py
│ │ │ │ ├── deprecated_humaneval_gen_a82cae.py
│ │ │ │ ├── deprecated_humaneval_gen_d2537e.py
│ │ │ │ ├── deprecated_humaneval_gen_fd5822.py
│ │ │ │ ├── deprecated_humaneval_gen_ff7054.py
│ │ │ │ ├── humaneval_gen.py
│ │ │ │ ├── humaneval_gen_66a7f4.py
│ │ │ │ ├── humaneval_gen_8e312c.py
│ │ │ │ ├── humaneval_openai_sample_evals_gen_159614.py
│ │ │ │ ├── humaneval_openai_sample_evals_gen_dcae0e.py
│ │ │ │ ├── humaneval_openai_sample_evals_o1_gen_5e7b00.py
│ │ │ │ ├── humaneval_openai_sample_evals_rawprompt_gen_6ce2ca.py
│ │ │ │ ├── humaneval_openai_sample_evals_repeat_gen_dcae0e.py
│ │ │ │ ├── humaneval_passk_gen_8e312c.py
│ │ │ │ ├── humaneval_repeat10_gen_8e312c.py
│ │ │ │ ├── internal_humaneval_gen_ce6b06.py
│ │ │ │ └── internal_humaneval_gen_d2537e.py
│ │ │ ├── humaneval_cn/
│ │ │ │ ├── humaneval_cn_gen.py
│ │ │ │ ├── humaneval_cn_gen_6313aa.py
│ │ │ │ ├── humaneval_cn_passk_gen_6313aa.py
│ │ │ │ └── humaneval_cn_repeat10_gen_6313aa.py
│ │ │ ├── humaneval_multi/
│ │ │ │ ├── humaneval_multi_gen.py
│ │ │ │ └── humaneval_multi_gen_82cf85.py
│ │ │ ├── humaneval_plus/
│ │ │ │ ├── humaneval_plus_gen.py
│ │ │ │ ├── humaneval_plus_gen_66a7f4.py
│ │ │ │ ├── humaneval_plus_gen_8e312c.py
│ │ │ │ ├── humaneval_plus_openai_simple_evals_gen_159614.py
│ │ │ │ ├── humaneval_plus_passk_gen_8e312c.py
│ │ │ │ ├── humaneval_plus_repeat10_gen_8e312c.py
│ │ │ │ └── humaneval_plus_repeat_gen_41b01c.py
│ │ │ ├── humaneval_pro/
│ │ │ │ ├── README.md
│ │ │ │ ├── humaneval_pro_gen.py
│ │ │ │ ├── humaneval_pro_gen_3dc067.py
│ │ │ │ └── humaneval_pro_repeat_gen_3dc067.py
│ │ │ ├── humanevalx/
│ │ │ │ ├── humanevalx_0shot_nocot_gen_3e4bbd.py
│ │ │ │ ├── humanevalx_gen.py
│ │ │ │ ├── humanevalx_gen_0af626.py
│ │ │ │ ├── humanevalx_gen_3d84a3.py
│ │ │ │ ├── humanevalx_gen_620cfa.py
│ │ │ │ └── humanevalx_repeat_gen_3d84a3.py
│ │ │ ├── hungarian_exam/
│ │ │ │ ├── hungarian_exam_gen.py
│ │ │ │ └── hungarian_exam_gen_8a1435.py
│ │ │ ├── inference_ppl/
│ │ │ │ ├── README.md
│ │ │ │ └── inference_ppl.py
│ │ │ ├── infinitebench/
│ │ │ │ ├── infinitebench.py
│ │ │ │ ├── infinitebenchcodedebug/
│ │ │ │ │ ├── infinitebench_codedebug_gen.py
│ │ │ │ │ └── infinitebench_codedebug_gen_276a42.py
│ │ │ │ ├── infinitebenchcoderun/
│ │ │ │ │ ├── infinitebench_coderun_gen.py
│ │ │ │ │ └── infinitebench_coderun_gen_1a76bd.py
│ │ │ │ ├── infinitebenchendia/
│ │ │ │ │ ├── infinitebench_endia_gen.py
│ │ │ │ │ └── infinitebench_endia_gen_c96eb5.py
│ │ │ │ ├── infinitebenchenmc/
│ │ │ │ │ ├── infinitebench_enmc_gen.py
│ │ │ │ │ └── infinitebench_enmc_gen_3a4102.py
│ │ │ │ ├── infinitebenchenqa/
│ │ │ │ │ ├── infinitebench_enqa_gen.py
│ │ │ │ │ └── infinitebench_enqa_gen_a1640c.py
│ │ │ │ ├── infinitebenchensum/
│ │ │ │ │ ├── infinitebench_ensum_gen.py
│ │ │ │ │ └── infinitebench_ensum_gen_cfbc08.py
│ │ │ │ ├── infinitebenchmathcalc/
│ │ │ │ │ ├── infinitebench_mathcalc_gen.py
│ │ │ │ │ └── infinitebench_mathcalc_gen_78d17e.py
│ │ │ │ ├── infinitebenchmathfind/
│ │ │ │ │ ├── infinitebench_mathfind_gen.py
│ │ │ │ │ └── infinitebench_mathfind_gen_6d799e.py
│ │ │ │ ├── infinitebenchretrievekv/
│ │ │ │ │ ├── infinitebench_retrievekv_gen.py
│ │ │ │ │ └── infinitebench_retrievekv_gen_06b3ac.py
│ │ │ │ ├── infinitebenchretrievenumber/
│ │ │ │ │ ├── infinitebench_retrievenumber_gen.py
│ │ │ │ │ └── infinitebench_retrievenumber_gen_047436.py
│ │ │ │ ├── infinitebenchretrievepasskey/
│ │ │ │ │ ├── infinitebench_retrievepasskey_gen.py
│ │ │ │ │ └── infinitebench_retrievepasskey_gen_62ff68.py
│ │ │ │ └── infinitebenchzhqa/
│ │ │ │ ├── infinitebench_zhqa_gen.py
│ │ │ │ └── infinitebench_zhqa_gen_1e5293.py
│ │ │ ├── internsandbox/
│ │ │ │ ├── internsandbox_gen.py
│ │ │ │ └── internsandbox_gen_44b982.py
│ │ │ ├── iwslt2017/
│ │ │ │ ├── iwslt2017_gen.py
│ │ │ │ ├── iwslt2017_gen_69ce16.py
│ │ │ │ ├── iwslt2017_gen_b4a814.py
│ │ │ │ └── iwslt2017_gen_d0ebd1.py
│ │ │ ├── jigsawmultilingual/
│ │ │ │ ├── jigsawmultilingual_clp.py
│ │ │ │ ├── jigsawmultilingual_clp_1af0ae.py
│ │ │ │ └── jigsawmultilingual_clp_fe50d8.py
│ │ │ ├── judge/
│ │ │ │ ├── judgebench.py
│ │ │ │ ├── judgerbenchv2.py
│ │ │ │ ├── rewardbench.py
│ │ │ │ └── rmb.py
│ │ │ ├── kaoshi/
│ │ │ │ ├── kaoshi_gen.py
│ │ │ │ └── kaoshi_gen_86aca2.py
│ │ │ ├── kcle/
│ │ │ │ ├── kcle_llm_judge_gen.py
│ │ │ │ ├── kcle_llm_judge_gen_60327a.py
│ │ │ │ └── kcle_llm_judge_rawprompt_gen_16e383.py
│ │ │ ├── korbench/
│ │ │ │ ├── korbench_gen.py
│ │ │ │ ├── korbench_llm_judge_gen.py
│ │ │ │ ├── korbench_llmjudge_gen_17854d.py
│ │ │ │ ├── korbench_llmjudge_gen_56cf43.py
│ │ │ │ ├── korbench_mixed_gen_d00bdd.py
│ │ │ │ ├── korbench_single_0_shot_gen.py
│ │ │ │ ├── korbench_single_0shot_cascade_eval_gen_56cf43.py
│ │ │ │ ├── korbench_single_0shot_cascade_eval_rawprompt_gen_c048da.py
│ │ │ │ ├── korbench_single_0shot_genericllmeval_gen_17854d.py
│ │ │ │ ├── korbench_single_0shot_llmjudge_gen.py
│ │ │ │ ├── korbench_single_3_shot_gen.py
│ │ │ │ └── readme.md
│ │ │ ├── lambada/
│ │ │ │ ├── lambada_gen.py
│ │ │ │ ├── lambada_gen_217e11.py
│ │ │ │ └── lambada_gen_8b48a5.py
│ │ │ ├── lawbench/
│ │ │ │ ├── lawbench_one_shot_gen_002588.py
│ │ │ │ └── lawbench_zero_shot_gen_002588.py
│ │ │ ├── lcsts/
│ │ │ │ ├── lcsts_gen.py
│ │ │ │ ├── lcsts_gen_8ee1fe.py
│ │ │ │ └── lcsts_gen_9b0b89.py
│ │ │ ├── leval/
│ │ │ │ ├── leval.py
│ │ │ │ ├── levalcoursera/
│ │ │ │ │ ├── leval_coursera_gen.py
│ │ │ │ │ └── leval_coursera_gen_36a006.py
│ │ │ │ ├── levalfinancialqa/
│ │ │ │ │ ├── leval_financialqa_gen.py
│ │ │ │ │ └── leval_financialqa_gen_b03798.py
│ │ │ │ ├── levalgovreportsumm/
│ │ │ │ │ ├── leval_gov_report_summ_gen.py
│ │ │ │ │ └── leval_gov_report_summ_gen_b03798.py
│ │ │ │ ├── levalgsm100/
│ │ │ │ │ ├── leval_gsm100_gen.py
│ │ │ │ │ └── leval_gsm100_gen_77dd94.py
│ │ │ │ ├── levallegalcontractqa/
│ │ │ │ │ ├── leval_legalcontractqa_gen.py
│ │ │ │ │ └── leval_legalcontractqa_gen_68a2ac.py
│ │ │ │ ├── levalmeetingsumm/
│ │ │ │ │ ├── leval_meetingsumm_gen.py
│ │ │ │ │ └── leval_meetingsumm_gen_b03798.py
│ │ │ │ ├── levalmultidocqa/
│ │ │ │ │ ├── leval_multidocqa_gen.py
│ │ │ │ │ └── leval_multidocqa_gen_96bf3f.py
│ │ │ │ ├── levalnarrativeqa/
│ │ │ │ │ ├── leval_narrativeqa_gen.py
│ │ │ │ │ └── leval_narrativeqa_gen_766dd0.py
│ │ │ │ ├── levalnaturalquestion/
│ │ │ │ │ ├── leval_naturalquestion_gen.py
│ │ │ │ │ └── leval_naturalquestion_gen_52c33f.py
│ │ │ │ ├── levalnewssumm/
│ │ │ │ │ ├── leval_newssumm_gen.py
│ │ │ │ │ └── leval_newssumm_gen_b03798.py
│ │ │ │ ├── levalpaperassistant/
│ │ │ │ │ ├── leval_paper_assistant_gen.py
│ │ │ │ │ └── leval_paper_assistant_gen_b03798.py
│ │ │ │ ├── levalpatentsumm/
│ │ │ │ │ ├── leval_patent_summ_gen.py
│ │ │ │ │ └── leval_patent_summ_gen_b03798.py
│ │ │ │ ├── levalquality/
│ │ │ │ │ ├── leval_quality_gen.py
│ │ │ │ │ └── leval_quality_gen_36a006.py
│ │ │ │ ├── levalreviewsumm/
│ │ │ │ │ ├── leval_review_summ_gen.py
│ │ │ │ │ └── leval_review_summ_gen_b03798.py
│ │ │ │ ├── levalscientificqa/
│ │ │ │ │ ├── leval_scientificqa_gen.py
│ │ │ │ │ └── leval_scientificqa_gen_96bf3f.py
│ │ │ │ ├── levaltopicretrieval/
│ │ │ │ │ ├── leval_topic_retrieval_gen.py
│ │ │ │ │ └── leval_topic_retrieval_gen_bf433f.py
│ │ │ │ ├── levaltpo/
│ │ │ │ │ ├── leval_tpo_gen.py
│ │ │ │ │ └── leval_tpo_gen_36a006.py
│ │ │ │ └── levaltvshowsumm/
│ │ │ │ ├── leval_tvshow_summ_gen.py
│ │ │ │ └── leval_tvshow_summ_gen_b03798.py
│ │ │ ├── livecodebench/
│ │ │ │ ├── README.md
│ │ │ │ ├── livecodebench_code_generation_repeat_gen_b5b6c5.py
│ │ │ │ ├── livecodebench_gen.py
│ │ │ │ ├── livecodebench_gen_6966bc.py
│ │ │ │ ├── livecodebench_gen_a4f90b.py
│ │ │ │ ├── livecodebench_gen_b2b0fd.py
│ │ │ │ ├── livecodebench_o1_gen_f0ed6c.py
│ │ │ │ ├── livecodebench_rawprompt_gen_c09673.py
│ │ │ │ ├── livecodebench_split_v4_o1_gen_f0ed6c.py
│ │ │ │ ├── livecodebench_split_v4_o1_postprocess_gen_f0ed6c.py
│ │ │ │ ├── livecodebench_time_split_gen_a4f90b.py
│ │ │ │ ├── livecodebench_v1_o1_gen_f0ed6c.py
│ │ │ │ └── livecodebench_v6_academic.py
│ │ │ ├── livecodebench_pro/
│ │ │ │ ├── livecodebench_pro_gen.py
│ │ │ │ └── livecodebench_pro_rawprompt_gen.py
│ │ │ ├── livemathbench/
│ │ │ │ ├── README.md
│ │ │ │ ├── livemathbench_gen.py
│ │ │ │ ├── livemathbench_gen_6eb711.py
│ │ │ │ ├── livemathbench_gen_9befbf.py
│ │ │ │ ├── livemathbench_gen_caed8f.py
│ │ │ │ ├── livemathbench_greedy_gen.py
│ │ │ │ ├── livemathbench_greedy_gen_9befbf.py
│ │ │ │ ├── livemathbench_hard_custom_cascade_eval_gen_4bce59.py
│ │ │ │ ├── livemathbench_hard_custom_cascade_eval_rawprompt_gen_e1ce64.py
│ │ │ │ ├── livemathbench_hard_custom_llmverify_gen_85d0ef.py
│ │ │ │ ├── livemathbench_hard_gen_353ae7.py
│ │ │ │ ├── livemathbench_hard_greedy_gen_353ae7.py
│ │ │ │ ├── livemathbench_hard_llmjudge_gen_71eaf5.py
│ │ │ │ ├── livemathbench_v202505_gen_9befbf.py
│ │ │ │ ├── livemathbench_v202505_greedy_gen_9befbf.py
│ │ │ │ ├── livemathbench_v202505_hard_gen_353ae7.py
│ │ │ │ └── livemathbench_v202505_hard_greedy_gen_353ae7.py
│ │ │ ├── livereasonbench/
│ │ │ │ ├── livereasonbench_gen.py
│ │ │ │ ├── livereasonbench_gen_f990de.py
│ │ │ │ ├── livereasonbench_genericllmeval_gen_f990de.py
│ │ │ │ └── livereasonbench_llmverify_20250428_gen_0484cb.py
│ │ │ ├── livestembench/
│ │ │ │ ├── livestembench_0shot_noncot_gen_2e6d10.py
│ │ │ │ ├── livestembench_0shot_noncot_xml_gen_2e6d10.py
│ │ │ │ ├── livestembench_gen.py
│ │ │ │ └── livestembench_gen_3e3c50.py
│ │ │ ├── llm_compression/
│ │ │ │ ├── README.md
│ │ │ │ └── llm_compression.py
│ │ │ ├── longbench/
│ │ │ │ ├── longbench.py
│ │ │ │ ├── longbench2wikimqa/
│ │ │ │ │ ├── longbench_2wikimqa_gen.py
│ │ │ │ │ └── longbench_2wikimqa_gen_6b3efc.py
│ │ │ │ ├── longbenchdureader/
│ │ │ │ │ ├── longbench_dureader_gen.py
│ │ │ │ │ └── longbench_dureader_gen_c6c7e4.py
│ │ │ │ ├── longbenchgov_report/
│ │ │ │ │ ├── longbench_gov_report_gen.py
│ │ │ │ │ └── longbench_gov_report_gen_54c5b0.py
│ │ │ │ ├── longbenchhotpotqa/
│ │ │ │ │ ├── longbench_hotpotqa_gen.py
│ │ │ │ │ └── longbench_hotpotqa_gen_6b3efc.py
│ │ │ │ ├── longbenchlcc/
│ │ │ │ │ ├── longbench_lcc_gen.py
│ │ │ │ │ └── longbench_lcc_gen_6ba507.py
│ │ │ │ ├── longbenchlsht/
│ │ │ │ │ ├── longbench_lsht_gen.py
│ │ │ │ │ └── longbench_lsht_gen_e8a339.py
│ │ │ │ ├── longbenchmulti_news/
│ │ │ │ │ ├── longbench_multi_news_gen.py
│ │ │ │ │ └── longbench_multi_news_gen_6f9da9.py
│ │ │ │ ├── longbenchmultifieldqa_en/
│ │ │ │ │ ├── longbench_multifieldqa_en_gen.py
│ │ │ │ │ └── longbench_multifieldqa_en_gen_d3838e.py
│ │ │ │ ├── longbenchmultifieldqa_zh/
│ │ │ │ │ ├── longbench_multifieldqa_zh_gen.py
│ │ │ │ │ └── longbench_multifieldqa_zh_gen_e9a7ef.py
│ │ │ │ ├── longbenchmusique/
│ │ │ │ │ ├── longbench_musique_gen.py
│ │ │ │ │ └── longbench_musique_gen_6b3efc.py
│ │ │ │ ├── longbenchnarrativeqa/
│ │ │ │ │ ├── longbench_narrativeqa_gen.py
│ │ │ │ │ └── longbench_narrativeqa_gen_a68305.py
│ │ │ │ ├── longbenchpassage_count/
│ │ │ │ │ ├── longbench_passage_count_gen.py
│ │ │ │ │ └── longbench_passage_count_gen_dcdaab.py
│ │ │ │ ├── longbenchpassage_retrieval_en/
│ │ │ │ │ ├── longbench_passage_retrieval_en_gen.py
│ │ │ │ │ └── longbench_passage_retrieval_en_gen_734db5.py
│ │ │ │ ├── longbenchpassage_retrieval_zh/
│ │ │ │ │ ├── longbench_passage_retrieval_zh_gen.py
│ │ │ │ │ └── longbench_passage_retrieval_zh_gen_01cca2.py
│ │ │ │ ├── longbenchqasper/
│ │ │ │ │ ├── longbench_qasper_gen.py
│ │ │ │ │ └── longbench_qasper_gen_6b3efc.py
│ │ │ │ ├── longbenchqmsum/
│ │ │ │ │ ├── longbench_qmsum_gen.py
│ │ │ │ │ └── longbench_qmsum_gen_d33331.py
│ │ │ │ ├── longbenchrepobench/
│ │ │ │ │ ├── longbench_repobench_gen.py
│ │ │ │ │ └── longbench_repobench_gen_6df953.py
│ │ │ │ ├── longbenchsamsum/
│ │ │ │ │ ├── longbench_samsum_gen.py
│ │ │ │ │ └── longbench_samsum_gen_f4416d.py
│ │ │ │ ├── longbenchtrec/
│ │ │ │ │ ├── longbench_trec_gen.py
│ │ │ │ │ └── longbench_trec_gen_824187.py
│ │ │ │ ├── longbenchtriviaqa/
│ │ │ │ │ ├── longbench_triviaqa_gen.py
│ │ │ │ │ └── longbench_triviaqa_gen_d30cb9.py
│ │ │ │ └── longbenchvcsum/
│ │ │ │ ├── longbench_vcsum_gen.py
│ │ │ │ └── longbench_vcsum_gen_f7a8ac.py
│ │ │ ├── longbenchv2/
│ │ │ │ ├── longbenchv2_gen.py
│ │ │ │ └── longbenchv2_gen_75fbba.py
│ │ │ ├── lveval/
│ │ │ │ ├── lveval.md
│ │ │ │ ├── lveval.py
│ │ │ │ ├── lvevalcmrc_mixup/
│ │ │ │ │ ├── lveval_cmrc_mixup_gen.py
│ │ │ │ │ └── lveval_cmrc_mixup_gen_465823.py
│ │ │ │ ├── lvevaldureader_mixup/
│ │ │ │ │ ├── lveval_dureader_mixup_gen.py
│ │ │ │ │ └── lveval_dureader_mixup_gen_465823.py
│ │ │ │ ├── lvevalfactrecall_en/
│ │ │ │ │ ├── lveval_factrecall_en_gen.py
│ │ │ │ │ └── lveval_factrecall_en_gen_9a836f.py
│ │ │ │ ├── lvevalfactrecall_zh/
│ │ │ │ │ ├── lveval_factrecall_zh_gen.py
│ │ │ │ │ └── lveval_factrecall_zh_gen_dbee70.py
│ │ │ │ ├── lvevalhotpotwikiqa_mixup/
│ │ │ │ │ ├── lveval_hotpotwikiqa_mixup_gen.py
│ │ │ │ │ └── lveval_hotpotwikiqa_mixup_gen_77ce82.py
│ │ │ │ ├── lvevallic_mixup/
│ │ │ │ │ ├── lveval_lic_mixup_gen.py
│ │ │ │ │ └── lveval_lic_mixup_gen_01eb0c.py
│ │ │ │ ├── lvevalloogle_CR_mixup/
│ │ │ │ │ ├── lveval_loogle_CR_mixup_gen.py
│ │ │ │ │ └── lveval_loogle_CR_mixup_gen_d7ea36.py
│ │ │ │ ├── lvevalloogle_MIR_mixup/
│ │ │ │ │ ├── lveval_loogle_MIR_mixup_gen.py
│ │ │ │ │ └── lveval_loogle_MIR_mixup_gen_d7ea36.py
│ │ │ │ ├── lvevalloogle_SD_mixup/
│ │ │ │ │ ├── lveval_loogle_SD_mixup_gen.py
│ │ │ │ │ └── lveval_loogle_SD_mixup_gen_d7ea36.py
│ │ │ │ ├── lvevalmultifieldqa_en_mixup/
│ │ │ │ │ ├── lveval_multifieldqa_en_mixup_gen.py
│ │ │ │ │ └── lveval_multifieldqa_en_mixup_gen_d7ea36.py
│ │ │ │ └── lvevalmultifieldqa_zh_mixup/
│ │ │ │ ├── lveval_multifieldqa_zh_mixup_gen.py
│ │ │ │ └── lveval_multifieldqa_zh_mixup_gen_0fbdad.py
│ │ │ ├── mastermath2024v1/
│ │ │ │ ├── mastermath2024v1_gen.py
│ │ │ │ └── mastermath2024v1_gen_be6318.py
│ │ │ ├── matbench/
│ │ │ │ ├── matbench_gen.py
│ │ │ │ ├── matbench_gen_f71840.py
│ │ │ │ ├── matbench_llm_judge_gen_0e9276.py
│ │ │ │ ├── matbench_llm_judge_rawprompt_gen_c987b6.py
│ │ │ │ └── matbench_regex_judge_gen_0e9276.py
│ │ │ ├── math/
│ │ │ │ ├── README.md
│ │ │ │ ├── deprecated_math_agent_evaluatorv2_gen_861b4f.py
│ │ │ │ ├── deprecated_math_evaluatorv2_gen_265cce.py
│ │ │ │ ├── math_0shot_gen_11c4b5.py
│ │ │ │ ├── math_0shot_gen_393424.py
│ │ │ │ ├── math_0shot_llm_judge_gen_393424.py
│ │ │ │ ├── math_0shot_llm_judge_v2_gen_31d777.py
│ │ │ │ ├── math_4shot_base_gen_43d5b6.py
│ │ │ │ ├── math_4shot_base_gen_db136b.py
│ │ │ │ ├── math_4shot_example_from_google_research.py
│ │ │ │ ├── math_500_cascade_eval_gen_6ff468.py
│ │ │ │ ├── math_500_cascade_eval_rawprompt_gen_0970dd.py
│ │ │ │ ├── math_500_gen.py
│ │ │ │ ├── math_500_llmjudge_gen_6ff468.py
│ │ │ │ ├── math_agent_evaluatorv2_gen_0c1b4e.py
│ │ │ │ ├── math_agent_gen_0c1b4e.py
│ │ │ │ ├── math_agent_gen_861b4f.py
│ │ │ │ ├── math_agent_gen_af2293.py
│ │ │ │ ├── math_evaluatorv2_gen_2f4a71.py
│ │ │ │ ├── math_evaluatorv2_gen_cecb31.py
│ │ │ │ ├── math_gen.py
│ │ │ │ ├── math_gen_0957ff.py
│ │ │ │ ├── math_gen_1ed9c2.py
│ │ │ │ ├── math_gen_265cce.py
│ │ │ │ ├── math_gen_559593.py
│ │ │ │ ├── math_gen_5e8458.py
│ │ │ │ ├── math_gen_736506.py
│ │ │ │ ├── math_gen_78ced2.py
│ │ │ │ ├── math_gen_943d32.py
│ │ │ │ ├── math_gen_a58d9d.py
│ │ │ │ ├── math_intern_evaluator_gen_265cce.py
│ │ │ │ ├── math_llm_judge_gen.py
│ │ │ │ ├── math_llm_judge_gen_56606f.py
│ │ │ │ ├── math_prm800k_500_0shot_cot_academic_gen.py
│ │ │ │ ├── math_prm800k_500_0shot_cot_gen.py
│ │ │ │ ├── math_prm800k_500_0shot_cot_gen_11c4b5.py
│ │ │ │ ├── math_prm800k_500_0shot_nocot_gen_b27274.py
│ │ │ │ ├── math_prm800k_500_0shot_nocot_genericllmeval_gen_63a000.py
│ │ │ │ ├── math_prm800k_500_0shot_nocot_genericllmeval_gen_6ff468.py
│ │ │ │ ├── math_prm800k_500_0shot_nocot_genericllmeval_xml_gen_63a000.py
│ │ │ │ ├── math_prm800k_500_0shot_nocot_llmjudge_gen_63a000.py
│ │ │ │ ├── math_prm800k_500_gen.py
│ │ │ │ ├── math_prm800k_500_gen_393424.py
│ │ │ │ ├── math_prm800k_500_llm_judge_gen.py
│ │ │ │ ├── math_prm800k_500_llmverify_gen_6ff468.py
│ │ │ │ └── math_prm800k_500_llmverify_repeat4_gen_97b203.py
│ │ │ ├── math401/
│ │ │ │ ├── math401_gen.py
│ │ │ │ └── math401_gen_ab5f39.py
│ │ │ ├── mbpp/
│ │ │ │ ├── README.md
│ │ │ │ ├── deprecated_mbpp_gen_1e1056.py
│ │ │ │ ├── deprecated_mbpp_gen_6590b0.py
│ │ │ │ ├── deprecated_mbpp_gen_caa7ab.py
│ │ │ │ ├── deprecated_mbpp_passk_gen_1e1056.py
│ │ │ │ ├── deprecated_mbpp_repeat10_gen_1e1056.py
│ │ │ │ ├── deprecated_sanitized_mbpp_gen_1e1056.py
│ │ │ │ ├── deprecated_sanitized_mbpp_gen_cb43ef.py
│ │ │ │ ├── deprecated_sanitized_mbpp_passk_gen_1e1056.py
│ │ │ │ ├── deprecated_sanitized_mbpp_repeat10_gen_1e1056.py
│ │ │ │ ├── mbpp_gen.py
│ │ │ │ ├── mbpp_gen_830460.py
│ │ │ │ ├── mbpp_passk_gen_830460.py
│ │ │ │ ├── mbpp_repeat10_gen_830460.py
│ │ │ │ ├── mbpp_repeat_gen_18dd1b.py
│ │ │ │ ├── sanitized_mbpp_gen_742f0c.py
│ │ │ │ ├── sanitized_mbpp_gen_830460.py
│ │ │ │ ├── sanitized_mbpp_gen_a0fc46.py
│ │ │ │ ├── sanitized_mbpp_mdblock_0shot_nocot_gen_a2e416.py
│ │ │ │ ├── sanitized_mbpp_mdblock_0shot_nocot_rawprompt_gen_30c1e5.py
│ │ │ │ ├── sanitized_mbpp_mdblock_gen_a447ff.py
│ │ │ │ ├── sanitized_mbpp_passk_gen_830460.py
│ │ │ │ └── sanitized_mbpp_repeat10_gen_830460.py
│ │ │ ├── mbpp_cn/
│ │ │ │ ├── deprecated_mbpp_cn_gen_1d1481.py
│ │ │ │ ├── deprecated_mbpp_cn_passk_gen_1d1481.py
│ │ │ │ ├── deprecated_mbpp_cn_repeat10_gen_1d1481.py
│ │ │ │ ├── mbpp_cn_gen.py
│ │ │ │ └── mbpp_cn_gen_9114d5.py
│ │ │ ├── mbpp_plus/
│ │ │ │ ├── deprecated_mbpp_plus_gen_94815c.py
│ │ │ │ ├── mbpp_plus_gen.py
│ │ │ │ └── mbpp_plus_gen_0b836a.py
│ │ │ ├── mbpp_pro/
│ │ │ │ ├── README.md
│ │ │ │ ├── mbpp_pro_gen.py
│ │ │ │ ├── mbpp_pro_gen_3dc067.py
│ │ │ │ └── mbpp_pro_repeat_gen_3dc067.py
│ │ │ ├── medmcqa/
│ │ │ │ ├── medmcqa_gen.py
│ │ │ │ ├── medmcqa_gen_60c8f5.py
│ │ │ │ ├── medmcqa_llmjudge_gen.py
│ │ │ │ ├── medmcqa_llmjudge_gen_60c8f5.py
│ │ │ │ └── medmcqa_llmjudge_rawprompt_gen_015178.py
│ │ │ ├── mgsm/
│ │ │ │ ├── README.md
│ │ │ │ ├── mgsm_gen.py
│ │ │ │ └── mgsm_gen_d967bc.py
│ │ │ ├── mmlu/
│ │ │ │ ├── README.md
│ │ │ │ ├── mmlu_all_sets.py
│ │ │ │ ├── mmlu_clean_ppl.py
│ │ │ │ ├── mmlu_gen.py
│ │ │ │ ├── mmlu_gen_23a9a9.py
│ │ │ │ ├── mmlu_gen_4d595a.py
│ │ │ │ ├── mmlu_gen_5d1409.py
│ │ │ │ ├── mmlu_gen_79e572.py
│ │ │ │ ├── mmlu_gen_a484b3.py
│ │ │ │ ├── mmlu_llm_judge_gen.py
│ │ │ │ ├── mmlu_llmjudge_gen_f4336b.py
│ │ │ │ ├── mmlu_llmjudge_rawprompt_gen_af67f0.py
│ │ │ │ ├── mmlu_model_postprocess_gen_4d595a.py
│ │ │ │ ├── mmlu_openai_0shot_nocot_llmjudge_gen_216503.py
│ │ │ │ ├── mmlu_openai_simple_evals_gen_b618ea.py
│ │ │ │ ├── mmlu_ppl.py
│ │ │ │ ├── mmlu_ppl_ac766d.py
│ │ │ │ ├── mmlu_stem_0shot_cascade_eval_gen_216503.py
│ │ │ │ ├── mmlu_stem_0shot_gen_216503.py
│ │ │ │ ├── mmlu_stem_0shot_xml_gen_216503.py
│ │ │ │ ├── mmlu_stem_sets.py
│ │ │ │ ├── mmlu_xfinder_gen_4d595a.py
│ │ │ │ └── mmlu_zero_shot_gen_47e2c0.py
│ │ │ ├── mmlu_cf/
│ │ │ │ ├── mmlu_cf_categories.py
│ │ │ │ ├── mmlu_cf_few_shot.py
│ │ │ │ ├── mmlu_cf_gen.py
│ │ │ │ ├── mmlu_cf_gen_040615.py
│ │ │ │ └── mmlu_cf_zero_shot.py
│ │ │ ├── mmlu_pro/
│ │ │ │ ├── mmlu_pro_0shot_cot_gen_08c1de.py
│ │ │ │ ├── mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de.py
│ │ │ │ ├── mmlu_pro_0shot_nocot_genericllmeval_rawprompt_gen_0321fb.py
│ │ │ │ ├── mmlu_pro_biomed_0shot_cot_gen_057927.py
│ │ │ │ ├── mmlu_pro_biomed_0shot_nocot_genericllmeval_gen_057927.py
│ │ │ │ ├── mmlu_pro_categories.py
│ │ │ │ ├── mmlu_pro_few_shot_gen_bfaf90.py
│ │ │ │ ├── mmlu_pro_gen.py
│ │ │ │ ├── mmlu_pro_gen_cdbebf.py
│ │ │ │ └── mmlu_pro_llm_judge_gen.py
│ │ │ ├── mmmlu/
│ │ │ │ ├── README.md
│ │ │ │ ├── mmmlu_5_shot_gen_bcbeb3.py
│ │ │ │ ├── mmmlu_gen.py
│ │ │ │ ├── mmmlu_gen_c51a84.py
│ │ │ │ └── mmmlu_prompt.py
│ │ │ ├── mmmlu_lite/
│ │ │ │ ├── README.md
│ │ │ │ ├── mmmlu_lite_gen.py
│ │ │ │ └── mmmlu_lite_gen_c51a84.py
│ │ │ ├── multipl_e/
│ │ │ │ ├── multiple_gen.py
│ │ │ │ ├── multiple_top_ten_gen_f44aaf.py
│ │ │ │ └── multiple_top_ten_repeat_gen_0cd6ce.py
│ │ │ ├── musr/
│ │ │ │ ├── README.md
│ │ │ │ ├── musr_gen.py
│ │ │ │ ├── musr_gen_3622bb.py
│ │ │ │ ├── musr_gen_3c6e15.py
│ │ │ │ ├── musr_gen_b47fd3.py
│ │ │ │ ├── musr_llm_judge_gen.py
│ │ │ │ └── musr_llmjudge_gen_b47fd3.py
│ │ │ ├── narrativeqa/
│ │ │ │ ├── narrativeqa_gen.py
│ │ │ │ ├── narrativeqa_gen_a2d88a.py
│ │ │ │ └── narrativeqa_gen_db6413.py
│ │ │ ├── needlebench/
│ │ │ │ ├── atc/
│ │ │ │ │ ├── atc.py
│ │ │ │ │ ├── atc_choice.py
│ │ │ │ │ ├── atc_choice_20.py
│ │ │ │ │ ├── atc_choice_50.py
│ │ │ │ │ ├── atc_choice_50_en_reasoning.py
│ │ │ │ │ ├── atc_choice_80.py
│ │ │ │ │ └── atc_choice_80_en_reasoning.py
│ │ │ │ ├── needlebench_1000k/
│ │ │ │ │ ├── needlebench_1000k.py
│ │ │ │ │ ├── needlebench_multi_reasoning_1000k.py
│ │ │ │ │ ├── needlebench_multi_retrieval_1000k.py
│ │ │ │ │ └── needlebench_single_1000k.py
│ │ │ │ ├── needlebench_128k/
│ │ │ │ │ ├── needlebench_128k.py
│ │ │ │ │ ├── needlebench_multi_reasoning_128k.py
│ │ │ │ │ ├── needlebench_multi_retrieval_128k.py
│ │ │ │ │ └── needlebench_single_128k.py
│ │ │ │ ├── needlebench_200k/
│ │ │ │ │ ├── needlebench_200k.py
│ │ │ │ │ ├── needlebench_multi_reasoning_200k.py
│ │ │ │ │ ├── needlebench_multi_retrieval_200k.py
│ │ │ │ │ └── needlebench_single_200k.py
│ │ │ │ ├── needlebench_256k/
│ │ │ │ │ ├── needlebench_256k.py
│ │ │ │ │ ├── needlebench_multi_reasoning_256k.py
│ │ │ │ │ ├── needlebench_multi_retrieval_256k.py
│ │ │ │ │ └── needlebench_single_256k.py
│ │ │ │ ├── needlebench_32k/
│ │ │ │ │ ├── needlebench_32k.py
│ │ │ │ │ ├── needlebench_multi_reasoning_32k.py
│ │ │ │ │ ├── needlebench_multi_retrieval_32k.py
│ │ │ │ │ └── needlebench_single_32k.py
│ │ │ │ ├── needlebench_4k/
│ │ │ │ │ ├── needlebench_4k.py
│ │ │ │ │ ├── needlebench_multi_reasoning_4k.py
│ │ │ │ │ ├── needlebench_multi_retrieval_4k.py
│ │ │ │ │ └── needlebench_single_4k.py
│ │ │ │ ├── needlebench_8k/
│ │ │ │ │ ├── needlebench_8k.py
│ │ │ │ │ ├── needlebench_multi_reasoning_8k.py
│ │ │ │ │ ├── needlebench_multi_retrieval_8k.py
│ │ │ │ │ ├── needlebench_multi_retrieval_compare_batch_8k.py
│ │ │ │ │ └── needlebench_single_8k.py
│ │ │ │ ├── needlebench_base/
│ │ │ │ │ ├── needlebench_base_gen.py
│ │ │ │ │ └── needlebench_single.py
│ │ │ │ ├── readme.md
│ │ │ │ └── readme_zh-CN.md
│ │ │ ├── needlebench_v2/
│ │ │ │ ├── atc/
│ │ │ │ │ └── atc_0shot_nocot_2_power_en.py
│ │ │ │ ├── needlebench_v2_1000k/
│ │ │ │ │ ├── needlebench_v2_1000k.py
│ │ │ │ │ ├── needlebench_v2_multi_reasoning_1000k.py
│ │ │ │ │ ├── needlebench_v2_multi_retrieval_1000k.py
│ │ │ │ │ └── needlebench_v2_single_1000k.py
│ │ │ │ ├── needlebench_v2_128k/
│ │ │ │ │ ├── needlebench_v2_128k.py
│ │ │ │ │ ├── needlebench_v2_multi_reasoning_128k.py
│ │ │ │ │ ├── needlebench_v2_multi_retrieval_128k.py
│ │ │ │ │ └── needlebench_v2_single_128k.py
│ │ │ │ ├── needlebench_v2_200k/
│ │ │ │ │ ├── needlebench_v2_200k.py
│ │ │ │ │ ├── needlebench_v2_multi_reasoning_200k.py
│ │ │ │ │ ├── needlebench_v2_multi_retrieval_200k.py
│ │ │ │ │ └── needlebench_v2_single_200k.py
│ │ │ │ ├── needlebench_v2_256k/
│ │ │ │ │ ├── needlebench_v2_256k.py
│ │ │ │ │ ├── needlebench_v2_multi_reasoning_256k.py
│ │ │ │ │ ├── needlebench_v2_multi_retrieval_256k.py
│ │ │ │ │ └── needlebench_v2_single_256k.py
│ │ │ │ ├── needlebench_v2_32k/
│ │ │ │ │ ├── needlebench_v2_32k.py
│ │ │ │ │ ├── needlebench_v2_multi_reasoning_32k.py
│ │ │ │ │ ├── needlebench_v2_multi_retrieval_32k.py
│ │ │ │ │ └── needlebench_v2_single_32k.py
│ │ │ │ ├── needlebench_v2_4k/
│ │ │ │ │ ├── needlebench_v2_4k.py
│ │ │ │ │ ├── needlebench_v2_multi_reasoning_4k.py
│ │ │ │ │ ├── needlebench_v2_multi_retrieval_4k.py
│ │ │ │ │ └── needlebench_v2_single_4k.py
│ │ │ │ ├── needlebench_v2_8k/
│ │ │ │ │ ├── needlebench_v2_8k.py
│ │ │ │ │ ├── needlebench_v2_multi_reasoning_8k.py
│ │ │ │ │ ├── needlebench_v2_multi_retrieval_8k.py
│ │ │ │ │ ├── needlebench_v2_multi_retrieval_compare_batch_8k.py
│ │ │ │ │ └── needlebench_v2_single_8k.py
│ │ │ │ ├── readme.md
│ │ │ │ └── readme_zh-CN.md
│ │ │ ├── nejm_ai_benchmark/
│ │ │ │ ├── nejmaibench_gen.py
│ │ │ │ ├── nejmaibench_gen_60c8f5.py
│ │ │ │ ├── nejmaibench_llmjudge_gen.py
│ │ │ │ └── nejmaibench_llmjudge_gen_60c8f5.py
│ │ │ ├── nq/
│ │ │ │ ├── README.md
│ │ │ │ ├── nq_gen.py
│ │ │ │ ├── nq_gen_0356ec.py
│ │ │ │ ├── nq_gen_2463e2.py
│ │ │ │ ├── nq_gen_3dcea1.py
│ │ │ │ ├── nq_gen_68c1c6.py
│ │ │ │ ├── nq_gen_c788f6.py
│ │ │ │ ├── nq_open_1shot_gen_01cf41.py
│ │ │ │ ├── nq_open_1shot_gen_20a989.py
│ │ │ │ ├── nq_open_1shot_gen_2e45e5.py
│ │ │ │ ├── nq_open_gen_e93f8a.py
│ │ │ │ └── nq_xfinder_gen_3dcea1.py
│ │ │ ├── nq_cn/
│ │ │ │ ├── nqcn_gen.py
│ │ │ │ └── nqcn_gen_141737.py
│ │ │ ├── obqa/
│ │ │ │ ├── obqa_gen.py
│ │ │ │ ├── obqa_gen_9069e4.py
│ │ │ │ ├── obqa_ppl.py
│ │ │ │ ├── obqa_ppl_1defe8.py
│ │ │ │ ├── obqa_ppl_6aac9e.py
│ │ │ │ └── obqa_ppl_c7c154.py
│ │ │ ├── ojbench/
│ │ │ │ └── ojbench_gen.py
│ │ │ ├── omni_math/
│ │ │ │ ├── README.md
│ │ │ │ ├── omni_math_cascade_eval_gen_ccf9c0.py
│ │ │ │ ├── omni_math_gen.py
│ │ │ │ ├── omni_math_gen_18cc08.py
│ │ │ │ └── omni_math_llmverify_gen_ccf9c0.py
│ │ │ ├── openswi/
│ │ │ │ ├── openswi_gen.py
│ │ │ │ └── openswi_rawprompt_gen.py
│ │ │ ├── piqa/
│ │ │ │ ├── piqa_gen.py
│ │ │ │ ├── piqa_gen_1194eb.py
│ │ │ │ ├── piqa_ppl.py
│ │ │ │ ├── piqa_ppl_0cfff2.py
│ │ │ │ ├── piqa_ppl_1cf9f0.py
│ │ │ │ └── piqa_ppl_3431ea.py
│ │ │ ├── promptbench/
│ │ │ │ ├── promptbench_iwslt2017_gen_cbb8c8.py
│ │ │ │ ├── promptbench_math_gen_abf776.py
│ │ │ │ ├── promptbench_squad20_gen_b15d1c.py
│ │ │ │ └── promptbench_wnli_gen_50662f.py
│ │ │ ├── py150/
│ │ │ │ ├── py150_gen.py
│ │ │ │ └── py150_gen_38b13d.py
│ │ │ ├── qabench/
│ │ │ │ ├── qabench_gen.py
│ │ │ │ └── qabench_gen_353ae7.py
│ │ │ ├── qasper/
│ │ │ │ ├── qasper_gen.py
│ │ │ │ ├── qasper_gen_a2d88a.py
│ │ │ │ └── qasper_gen_db6413.py
│ │ │ ├── qaspercut/
│ │ │ │ ├── qaspercut_gen.py
│ │ │ │ ├── qaspercut_gen_a2d88a.py
│ │ │ │ └── qaspercut_gen_db6413.py
│ │ │ ├── race/
│ │ │ │ ├── README.md
│ │ │ │ ├── race_cot_gen_d95929.py
│ │ │ │ ├── race_few_shot_gen_a498ed.py
│ │ │ │ ├── race_few_shot_ppl.py
│ │ │ │ ├── race_gen.py
│ │ │ │ ├── race_gen_69ee4f.py
│ │ │ │ ├── race_gen_9302a5.py
│ │ │ │ ├── race_ppl.py
│ │ │ │ ├── race_ppl_5831a0.py
│ │ │ │ ├── race_ppl_a138cd.py
│ │ │ │ └── race_ppl_abed12.py
│ │ │ ├── realtoxicprompts/
│ │ │ │ ├── realtoxicprompts_gen.py
│ │ │ │ ├── realtoxicprompts_gen_7605e4.py
│ │ │ │ └── realtoxicprompts_gen_ac723c.py
│ │ │ ├── rolebench/
│ │ │ │ ├── instruction_generalization_eng.py
│ │ │ │ ├── instruction_generalization_zh.py
│ │ │ │ └── role_generalization_eng.py
│ │ │ ├── ruler/
│ │ │ │ ├── README.md
│ │ │ │ ├── ruler_128k_gen.py
│ │ │ │ ├── ruler_16k_gen.py
│ │ │ │ ├── ruler_1m_gen.py
│ │ │ │ ├── ruler_256k_gen.py
│ │ │ │ ├── ruler_32k_gen.py
│ │ │ │ ├── ruler_4k_gen.py
│ │ │ │ ├── ruler_512k_gen.py
│ │ │ │ ├── ruler_64k_gen.py
│ │ │ │ ├── ruler_8k_gen.py
│ │ │ │ ├── ruler_combined_gen.py
│ │ │ │ ├── ruler_cwe_gen.py
│ │ │ │ ├── ruler_fwe_gen.py
│ │ │ │ ├── ruler_niah_gen.py
│ │ │ │ ├── ruler_qa_gen.py
│ │ │ │ └── ruler_vt_gen.py
│ │ │ ├── s3eval/
│ │ │ │ ├── s3eval.md
│ │ │ │ ├── s3eval_gen.py
│ │ │ │ └── s3eval_gen_b8ac80.py
│ │ │ ├── safety/
│ │ │ │ ├── safety_gen.py
│ │ │ │ └── safety_gen_7ce197.py
│ │ │ ├── scibench/
│ │ │ │ ├── scibench_gen.py
│ │ │ │ └── scibench_gen_2b21f3.py
│ │ │ ├── scicode/
│ │ │ │ ├── README.md
│ │ │ │ ├── scicode_gen.py
│ │ │ │ ├── scicode_gen_085b98.py
│ │ │ │ ├── scicode_gen_62c139.py
│ │ │ │ └── scicode_wbg_gen_085b98.py
│ │ │ ├── siqa/
│ │ │ │ ├── siqa_gen.py
│ │ │ │ ├── siqa_gen_18632c.py
│ │ │ │ ├── siqa_gen_e78df3.py
│ │ │ │ ├── siqa_ppl.py
│ │ │ │ ├── siqa_ppl_42bc6e.py
│ │ │ │ ├── siqa_ppl_7845b0.py
│ │ │ │ ├── siqa_ppl_ced5f6.py
│ │ │ │ └── siqa_ppl_e8d8c5.py
│ │ │ ├── squad20/
│ │ │ │ ├── squad20_gen.py
│ │ │ │ └── squad20_gen_1710bc.py
│ │ │ ├── srbench/
│ │ │ │ ├── srbench_gen.py
│ │ │ │ └── srbench_rawprompt_gen.py
│ │ │ ├── storycloze/
│ │ │ │ ├── storycloze_gen.py
│ │ │ │ ├── storycloze_gen_7f656a.py
│ │ │ │ ├── storycloze_ppl.py
│ │ │ │ ├── storycloze_ppl_496661.py
│ │ │ │ └── storycloze_ppl_afd16f.py
│ │ │ ├── strategyqa/
│ │ │ │ ├── strategyqa_gen.py
│ │ │ │ ├── strategyqa_gen_1180a7.py
│ │ │ │ └── strategyqa_gen_934441.py
│ │ │ ├── subjective/
│ │ │ │ ├── alignbench/
│ │ │ │ │ ├── alignbench_judgeby_critiquellm.py
│ │ │ │ │ ├── alignbench_judgeby_critiquellm_new.py
│ │ │ │ │ ├── alignbench_v1_1_judgeby_critiquellm.py
│ │ │ │ │ └── alignbench_v1_1_judgeby_critiquellm_new.py
│ │ │ │ ├── alpaca_eval/
│ │ │ │ │ ├── alpacav2_judgeby_gpt4.py
│ │ │ │ │ ├── alpacav2_judgeby_gpt4_bradleyterry.py
│ │ │ │ │ └── alpacav2_judgeby_gpt4_new.py
│ │ │ │ ├── arena_hard/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── arena_hard_compare.py
│ │ │ │ │ ├── arena_hard_compare_bradleyterry.py
│ │ │ │ │ └── arena_hard_compare_new.py
│ │ │ │ ├── compass_arena_subjective_bench/
│ │ │ │ │ ├── README_pairwise_bt.md
│ │ │ │ │ ├── multiturn/
│ │ │ │ │ │ ├── pairwise_bt_judge.py
│ │ │ │ │ │ ├── pairwise_judge.py
│ │ │ │ │ │ └── pointwise_judge.py
│ │ │ │ │ └── singleturn/
│ │ │ │ │ ├── pairwise_bt_judge.py
│ │ │ │ │ ├── pairwise_judge.py
│ │ │ │ │ └── pointwise_judge.py
│ │ │ │ ├── compassarena/
│ │ │ │ │ ├── compassarena_compare.py
│ │ │ │ │ ├── compassarena_compare_bradleyterry.py
│ │ │ │ │ └── compassarena_compare_new.py
│ │ │ │ ├── compassbench/
│ │ │ │ │ ├── compassbench_checklist.py
│ │ │ │ │ ├── compassbench_compare.py
│ │ │ │ │ ├── compassbench_compare_v11.py
│ │ │ │ │ ├── compassbench_compare_v11_patch.py
│ │ │ │ │ └── compassbench_compare_v12.py
│ │ │ │ ├── flames/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── flames_gen.py
│ │ │ │ │ └── flames_gen_1a58bb.py
│ │ │ │ ├── fofo/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── fofo_bilingual_judge.py
│ │ │ │ │ ├── fofo_bilingual_judge_new.py
│ │ │ │ │ ├── fofo_judge.py
│ │ │ │ │ └── fofo_judge_new.py
│ │ │ │ ├── followbench/
│ │ │ │ │ ├── followbench_llmeval.py
│ │ │ │ │ └── followbench_llmeval_new.py
│ │ │ │ ├── hellobench/
│ │ │ │ │ ├── README.md
│ │ │ │ │ └── hellobench.py
│ │ │ │ ├── judgerbench/
│ │ │ │ │ └── judgerbench.py
│ │ │ │ ├── multiround/
│ │ │ │ │ ├── mtbench101_judge.py
│ │ │ │ │ ├── mtbench101_judge_new.py
│ │ │ │ │ ├── mtbench_single_judge_diff_temp.py
│ │ │ │ │ └── mtbench_single_judge_diff_temp_new.py
│ │ │ │ ├── wildbench/
│ │ │ │ │ ├── wildbench.md
│ │ │ │ │ ├── wildbench_pair_judge.py
│ │ │ │ │ ├── wildbench_pair_judge_bradleyterry.py
│ │ │ │ │ └── wildbench_pair_judge_new.py
│ │ │ │ └── writingbench/
│ │ │ │ └── writingbench_judge.py
│ │ │ ├── summedits/
│ │ │ │ ├── summedits_gen.py
│ │ │ │ ├── summedits_gen_315438.py
│ │ │ │ ├── summedits_gen_4fb38b.py
│ │ │ │ ├── summedits_ppl.py
│ │ │ │ ├── summedits_ppl_1fbeb6.py
│ │ │ │ ├── summedits_ppl_3c30d0.py
│ │ │ │ └── summedits_ppl_fa58ba.py
│ │ │ ├── summscreen/
│ │ │ │ ├── summscreen_gen.py
│ │ │ │ ├── summscreen_gen_653185.py
│ │ │ │ └── summscreen_gen_aa5eb3.py
│ │ │ ├── supergpqa/
│ │ │ │ ├── supergpqa_cascade_gen_1545c1.py
│ │ │ │ ├── supergpqa_cascade_rawprompt_gen_ca8345.py
│ │ │ │ ├── supergpqa_gen.py
│ │ │ │ ├── supergpqa_llmjudge_field_gen_1545c1.py
│ │ │ │ └── supergpqa_llmjudge_gen_12b8bc.py
│ │ │ ├── taco/
│ │ │ │ ├── README.md
│ │ │ │ ├── taco_gen.py
│ │ │ │ ├── taco_gen_c7893a.py
│ │ │ │ └── taco_levels_gen_411572.py
│ │ │ ├── teval/
│ │ │ │ ├── README.md
│ │ │ │ ├── teval_en_gen.py
│ │ │ │ ├── teval_en_gen_1ac254.py
│ │ │ │ ├── teval_zh_gen.py
│ │ │ │ └── teval_zh_gen_1ac254.py
│ │ │ ├── triviaqa/
│ │ │ │ ├── README.md
│ │ │ │ ├── triviaqa_gen.py
│ │ │ │ ├── triviaqa_gen_0356ec.py
│ │ │ │ ├── triviaqa_gen_2121ce.py
│ │ │ │ ├── triviaqa_gen_3e39a5.py
│ │ │ │ ├── triviaqa_gen_429db5.py
│ │ │ │ ├── triviaqa_gen_d297bb.py
│ │ │ │ ├── triviaqa_wiki_1shot_gen_20a989.py
│ │ │ │ ├── triviaqa_wiki_1shot_gen_bc5f21.py
│ │ │ │ ├── triviaqa_wiki_1shot_gen_c87d61.py
│ │ │ │ ├── triviaqa_wiki_1shot_gen_eaf81e.py
│ │ │ │ └── triviaqa_wiki_gen_d18bf4.py
│ │ │ ├── triviaqarc/
│ │ │ │ ├── triviaqarc_gen.py
│ │ │ │ ├── triviaqarc_gen_a2d88a.py
│ │ │ │ └── triviaqarc_gen_db6413.py
│ │ │ ├── truthfulqa/
│ │ │ │ ├── truthfulqa_gen.py
│ │ │ │ ├── truthfulqa_gen_1e7d8d.py
│ │ │ │ └── truthfulqa_gen_5ddc62.py
│ │ │ ├── tydiqa/
│ │ │ │ ├── tydiqa_gen.py
│ │ │ │ └── tydiqa_gen_978d2a.py
│ │ │ ├── wikibench/
│ │ │ │ ├── wikibench_few_shot_ppl_c23d79.py
│ │ │ │ ├── wikibench_gen.py
│ │ │ │ ├── wikibench_gen_0978ad.py
│ │ │ │ └── wikibench_gen_f96ece.py
│ │ │ ├── wikitext/
│ │ │ │ ├── wikitext_103_raw_ppl.py
│ │ │ │ ├── wikitext_103_raw_ppl_752e2a.py
│ │ │ │ ├── wikitext_2_raw_ppl.py
│ │ │ │ └── wikitext_2_raw_ppl_752e2a.py
│ │ │ ├── winograd/
│ │ │ │ ├── winograd_ppl.py
│ │ │ │ ├── winograd_ppl_8f3049.py
│ │ │ │ └── winograd_ppl_b6c7ed.py
│ │ │ ├── winogrande/
│ │ │ │ ├── README.md
│ │ │ │ ├── deprecated_winogrande_gen_a9ede5.py
│ │ │ │ ├── winogrande_5shot_gen_6447e6.py
│ │ │ │ ├── winogrande_5shot_gen_b36770.py
│ │ │ │ ├── winogrande_5shot_ll_252f01.py
│ │ │ │ ├── winogrande_gen.py
│ │ │ │ ├── winogrande_gen_458220.py
│ │ │ │ ├── winogrande_gen_a027b6.py
│ │ │ │ ├── winogrande_ll.py
│ │ │ │ ├── winogrande_ll_c5cf57.py
│ │ │ │ ├── winogrande_ppl_55a66e.py
│ │ │ │ └── winogrande_ppl_9307fd.py
│ │ │ └── xiezhi/
│ │ │ ├── xiezhi_gen.py
│ │ │ ├── xiezhi_gen_b86cf5.py
│ │ │ ├── xiezhi_ppl.py
│ │ │ └── xiezhi_ppl_ea6bd7.py
│ │ ├── models/
│ │ │ ├── accessory/
│ │ │ │ ├── accessory_llama2_7b.py
│ │ │ │ ├── accessory_mixtral_8x7b.py
│ │ │ │ └── accessory_sphinx_v2_1k.py
│ │ │ ├── alaya/
│ │ │ │ └── alaya.py
│ │ │ ├── aquila/
│ │ │ │ ├── hf_aquila2_34b.py
│ │ │ │ ├── hf_aquila2_7b.py
│ │ │ │ ├── hf_aquilachat2_34b.py
│ │ │ │ ├── hf_aquilachat2_34b_16k.py
│ │ │ │ ├── hf_aquilachat2_7b.py
│ │ │ │ └── hf_aquilachat2_7b_16k.py
│ │ │ ├── baichuan/
│ │ │ │ ├── hf_baichuan2_13b_base.py
│ │ │ │ ├── hf_baichuan2_13b_chat.py
│ │ │ │ ├── hf_baichuan2_7b_base.py
│ │ │ │ ├── hf_baichuan2_7b_chat.py
│ │ │ │ ├── hf_baichuan_13b_base.py
│ │ │ │ ├── hf_baichuan_13b_chat.py
│ │ │ │ ├── hf_baichuan_7b.py
│ │ │ │ ├── hf_baichuan_m1_14b_base.py
│ │ │ │ └── hf_baichuan_m1_14b_instruct.py
│ │ │ ├── bailing_api/
│ │ │ │ ├── bailing-lite-1116.py
│ │ │ │ └── bailing-pro-1120.py
│ │ │ ├── bluelm/
│ │ │ │ ├── bluelm_3b.py
│ │ │ │ ├── hf_bluelm_7b_base.py
│ │ │ │ ├── hf_bluelm_7b_base_32k.py
│ │ │ │ ├── hf_bluelm_7b_chat.py
│ │ │ │ └── hf_bluelm_7b_chat_32k.py
│ │ │ ├── chatglm/
│ │ │ │ ├── hf_chatglm2_6b.py
│ │ │ │ ├── hf_chatglm3_6b.py
│ │ │ │ ├── hf_chatglm3_6b_32k.py
│ │ │ │ ├── hf_chatglm3_6b_base.py
│ │ │ │ ├── hf_chatglm_6b.py
│ │ │ │ ├── hf_glm4_9b.py
│ │ │ │ ├── hf_glm4_9b_chat.py
│ │ │ │ ├── lmdeploy_glm4_9b.py
│ │ │ │ ├── lmdeploy_glm4_9b_chat.py
│ │ │ │ ├── vllm_chatglm3_6b.py
│ │ │ │ ├── vllm_chatglm3_6b_32k.py
│ │ │ │ └── vllm_glm4_9b_chat.py
│ │ │ ├── claude/
│ │ │ │ ├── claude.py
│ │ │ │ └── claude2.py
│ │ │ ├── codegeex2/
│ │ │ │ └── hf_codegeex2_6b.py
│ │ │ ├── codellama/
│ │ │ │ ├── hf_codellama_13b.py
│ │ │ │ ├── hf_codellama_13b_instruct.py
│ │ │ │ ├── hf_codellama_13b_python.py
│ │ │ │ ├── hf_codellama_34b.py
│ │ │ │ ├── hf_codellama_34b_instruct.py
│ │ │ │ ├── hf_codellama_34b_python.py
│ │ │ │ ├── hf_codellama_70b.py
│ │ │ │ ├── hf_codellama_70b_instruct.py
│ │ │ │ ├── hf_codellama_70b_python.py
│ │ │ │ ├── hf_codellama_7b.py
│ │ │ │ ├── hf_codellama_7b_instruct.py
│ │ │ │ └── hf_codellama_7b_python.py
│ │ │ ├── deepseek/
│ │ │ │ ├── deepseek_r1_streaming.py
│ │ │ │ ├── hf_deepseek_67b_base.py
│ │ │ │ ├── hf_deepseek_67b_chat.py
│ │ │ │ ├── hf_deepseek_7b_base.py
│ │ │ │ ├── hf_deepseek_7b_chat.py
│ │ │ │ ├── hf_deepseek_coder_1_3b_instruct.py
│ │ │ │ ├── hf_deepseek_coder_33b_instruct.py
│ │ │ │ ├── hf_deepseek_coder_6_7b_instruct.py
│ │ │ │ ├── hf_deepseek_moe_16b_base.py
│ │ │ │ ├── hf_deepseek_moe_16b_chat.py
│ │ │ │ ├── hf_deepseek_r1_distill_llama_70b.py
│ │ │ │ ├── hf_deepseek_r1_distill_llama_8b.py
│ │ │ │ ├── hf_deepseek_r1_distill_qwen_14b.py
│ │ │ │ ├── hf_deepseek_r1_distill_qwen_1_5b.py
│ │ │ │ ├── hf_deepseek_r1_distill_qwen_32b.py
│ │ │ │ ├── hf_deepseek_r1_distill_qwen_7b.py
│ │ │ │ ├── hf_deepseek_v2.py
│ │ │ │ ├── hf_deepseek_v2_chat.py
│ │ │ │ ├── hf_deepseek_v2_lite.py
│ │ │ │ ├── hf_deepseek_v2_lite_chat.py
│ │ │ │ ├── lmdeploy_deepseek_67b_base.py
│ │ │ │ ├── lmdeploy_deepseek_67b_chat.py
│ │ │ │ ├── lmdeploy_deepseek_7b_base.py
│ │ │ │ ├── lmdeploy_deepseek_7b_chat.py
│ │ │ │ ├── lmdeploy_deepseek_r1_distill_llama_70b.py
│ │ │ │ ├── lmdeploy_deepseek_r1_distill_llama_8b.py
│ │ │ │ ├── lmdeploy_deepseek_r1_distill_qwen_14b.py
│ │ │ │ ├── lmdeploy_deepseek_r1_distill_qwen_1_5b.py
│ │ │ │ ├── lmdeploy_deepseek_r1_distill_qwen_32b.py
│ │ │ │ ├── lmdeploy_deepseek_r1_distill_qwen_7b.py
│ │ │ │ ├── lmdeploy_deepseek_series.py
│ │ │ │ ├── lmdeploy_deepseek_v2.py
│ │ │ │ ├── lmdeploy_deepseek_v2_5.py
│ │ │ │ ├── lmdeploy_deepseek_v2_5_1210.py
│ │ │ │ ├── lmdeploy_deepseek_v2_lite.py
│ │ │ │ ├── vllm_deepseek_67b_chat.py
│ │ │ │ ├── vllm_deepseek_7b_chat.py
│ │ │ │ ├── vllm_deepseek_moe_16b_base.py
│ │ │ │ └── vllm_deepseek_moe_16b_chat.py
│ │ │ ├── falcon/
│ │ │ │ ├── hf_falcon_40b.py
│ │ │ │ └── hf_falcon_7b.py
│ │ │ ├── gemini/
│ │ │ │ ├── gemini_1_5_flash.py
│ │ │ │ ├── gemini_1_5_pro.py
│ │ │ │ └── gemini_pro.py
│ │ │ ├── gemma/
│ │ │ │ ├── hf_gemma2_27b.py
│ │ │ │ ├── hf_gemma2_27b_it.py
│ │ │ │ ├── hf_gemma2_2b.py
│ │ │ │ ├── hf_gemma2_2b_it.py
│ │ │ │ ├── hf_gemma2_9b.py
│ │ │ │ ├── hf_gemma2_9b_it.py
│ │ │ │ ├── hf_gemma_2b.py
│ │ │ │ ├── hf_gemma_2b_it.py
│ │ │ │ ├── hf_gemma_7b.py
│ │ │ │ ├── hf_gemma_7b_it.py
│ │ │ │ ├── lmdeploy_gemma_27b.py
│ │ │ │ ├── lmdeploy_gemma_27b_it.py
│ │ │ │ ├── lmdeploy_gemma_9b.py
│ │ │ │ ├── lmdeploy_gemma_9b_it.py
│ │ │ │ ├── vllm_gemma_2b.py
│ │ │ │ ├── vllm_gemma_2b_it.py
│ │ │ │ ├── vllm_gemma_3_12b_it.py
│ │ │ │ ├── vllm_gemma_3_27b_it.py
│ │ │ │ ├── vllm_gemma_3_4b_it.py
│ │ │ │ ├── vllm_gemma_7b.py
│ │ │ │ └── vllm_gemma_7b_it.py
│ │ │ ├── hf_internlm/
│ │ │ │ ├── README.md
│ │ │ │ ├── hf_internlm2_1_8b.py
│ │ │ │ ├── hf_internlm2_20b.py
│ │ │ │ ├── hf_internlm2_5_1_8b_chat.py
│ │ │ │ ├── hf_internlm2_5_20b_chat.py
│ │ │ │ ├── hf_internlm2_5_7b.py
│ │ │ │ ├── hf_internlm2_5_7b_chat.py
│ │ │ │ ├── hf_internlm2_7b.py
│ │ │ │ ├── hf_internlm2_base_20b.py
│ │ │ │ ├── hf_internlm2_base_7b.py
│ │ │ │ ├── hf_internlm2_chat_1_8b.py
│ │ │ │ ├── hf_internlm2_chat_1_8b_sft.py
│ │ │ │ ├── hf_internlm2_chat_20b.py
│ │ │ │ ├── hf_internlm2_chat_20b_sft.py
│ │ │ │ ├── hf_internlm2_chat_20b_with_system.py
│ │ │ │ ├── hf_internlm2_chat_7b.py
│ │ │ │ ├── hf_internlm2_chat_7b_sft.py
│ │ │ │ ├── hf_internlm2_chat_7b_with_system.py
│ │ │ │ ├── hf_internlm2_chat_math_20b.py
│ │ │ │ ├── hf_internlm2_chat_math_20b_with_system.py
│ │ │ │ ├── hf_internlm2_chat_math_7b.py
│ │ │ │ ├── hf_internlm2_chat_math_7b_with_system.py
│ │ │ │ ├── hf_internlm2_math_20b.py
│ │ │ │ ├── hf_internlm2_math_7b.py
│ │ │ │ ├── hf_internlm3_8b_instruct.py
│ │ │ │ ├── hf_internlm_20b.py
│ │ │ │ ├── hf_internlm_7b.py
│ │ │ │ ├── hf_internlm_chat_20b.py
│ │ │ │ ├── hf_internlm_chat_7b.py
│ │ │ │ ├── lmdeploy_internlm2_1_8b.py
│ │ │ │ ├── lmdeploy_internlm2_20b.py
│ │ │ │ ├── lmdeploy_internlm2_5_1_8b_chat.py
│ │ │ │ ├── lmdeploy_internlm2_5_20b_chat.py
│ │ │ │ ├── lmdeploy_internlm2_5_7b.py
│ │ │ │ ├── lmdeploy_internlm2_5_7b_chat.py
│ │ │ │ ├── lmdeploy_internlm2_5_7b_chat_1m.py
│ │ │ │ ├── lmdeploy_internlm2_7b.py
│ │ │ │ ├── lmdeploy_internlm2_base_20b.py
│ │ │ │ ├── lmdeploy_internlm2_base_7b.py
│ │ │ │ ├── lmdeploy_internlm2_chat_1_8b.py
│ │ │ │ ├── lmdeploy_internlm2_chat_1_8b_sft.py
│ │ │ │ ├── lmdeploy_internlm2_chat_20b.py
│ │ │ │ ├── lmdeploy_internlm2_chat_20b_sft.py
│ │ │ │ ├── lmdeploy_internlm2_chat_7b.py
│ │ │ │ ├── lmdeploy_internlm2_chat_7b_sft.py
│ │ │ │ ├── lmdeploy_internlm2_series.py
│ │ │ │ ├── lmdeploy_internlm3_8b_instruct.py
│ │ │ │ ├── lmdeploy_internlm3_8b_instruct_128k.py
│ │ │ │ ├── lmdeploy_internlm_20b.py
│ │ │ │ ├── lmdeploy_internlm_7b.py
│ │ │ │ ├── lmdeploy_internlm_chat_20b.py
│ │ │ │ ├── lmdeploy_internlm_chat_7b.py
│ │ │ │ ├── lmdeploy_oreal_32b.py
│ │ │ │ ├── vllm_internlm2_chat_1_8b.py
│ │ │ │ ├── vllm_internlm2_chat_1_8b_sft.py
│ │ │ │ ├── vllm_internlm2_chat_20b.py
│ │ │ │ ├── vllm_internlm2_chat_20b_sft.py
│ │ │ │ ├── vllm_internlm2_chat_7b.py
│ │ │ │ ├── vllm_internlm2_chat_7b_sft.py
│ │ │ │ └── vllm_internlm2_series.py
│ │ │ ├── hf_llama/
│ │ │ │ ├── hf_llama2_13b.py
│ │ │ │ ├── hf_llama2_13b_chat.py
│ │ │ │ ├── hf_llama2_70b.py
│ │ │ │ ├── hf_llama2_70b_chat.py
│ │ │ │ ├── hf_llama2_7b.py
│ │ │ │ ├── hf_llama2_7b_chat.py
│ │ │ │ ├── hf_llama3_1_70b_instruct.py
│ │ │ │ ├── hf_llama3_1_8b.py
│ │ │ │ ├── hf_llama3_1_8b_instruct.py
│ │ │ │ ├── hf_llama3_2_3b_instruct.py
│ │ │ │ ├── hf_llama3_70b.py
│ │ │ │ ├── hf_llama3_70b_instruct.py
│ │ │ │ ├── hf_llama3_8b.py
│ │ │ │ ├── hf_llama3_8b_instruct.py
│ │ │ │ ├── hf_llama_13b.py
│ │ │ │ ├── hf_llama_30b.py
│ │ │ │ ├── hf_llama_65b.py
│ │ │ │ ├── hf_llama_7b.py
│ │ │ │ ├── lmdeploy_llama2_13b.py
│ │ │ │ ├── lmdeploy_llama2_13b_chat.py
│ │ │ │ ├── lmdeploy_llama2_70b.py
│ │ │ │ ├── lmdeploy_llama2_70b_chat.py
│ │ │ │ ├── lmdeploy_llama2_7b.py
│ │ │ │ ├── lmdeploy_llama2_7b_chat.py
│ │ │ │ ├── lmdeploy_llama3_1_70b_instruct.py
│ │ │ │ ├── lmdeploy_llama3_1_8b.py
│ │ │ │ ├── lmdeploy_llama3_1_8b_instruct.py
│ │ │ │ ├── lmdeploy_llama3_2_3b_instruct.py
│ │ │ │ ├── lmdeploy_llama3_3_70b_instruct.py
│ │ │ │ ├── lmdeploy_llama3_70b.py
│ │ │ │ ├── lmdeploy_llama3_70b_instruct.py
│ │ │ │ ├── lmdeploy_llama3_8b.py
│ │ │ │ ├── lmdeploy_llama3_8b_instruct.py
│ │ │ │ ├── lmdeploy_llama_13b.py
│ │ │ │ ├── lmdeploy_llama_30b.py
│ │ │ │ ├── lmdeploy_llama_65b.py
│ │ │ │ ├── lmdeploy_llama_7b.py
│ │ │ │ └── vllm_llama_series.py
│ │ │ ├── huatuogpt/
│ │ │ │ ├── hf_huatuogpt2_13b.py
│ │ │ │ ├── hf_huatuogpt2_7b.py
│ │ │ │ ├── hf_huatuogpt_o1_7b.py
│ │ │ │ └── hf_huatuogpt_o1_8b.py
│ │ │ ├── internlm/
│ │ │ │ └── internlm_7b.py
│ │ │ ├── interns1/
│ │ │ │ └── intern_s1.py
│ │ │ ├── internvl/
│ │ │ │ ├── lmdeploy_internvl_2_5_38b.py
│ │ │ │ └── lmdeploy_internvl_2_5_8b.py
│ │ │ ├── judge_llm/
│ │ │ │ ├── auto_j/
│ │ │ │ │ ├── hf_autoj_bilingual_6b.py
│ │ │ │ │ ├── hf_autoj_eng_13b.py
│ │ │ │ │ ├── hf_autoj_eng_13b_4bit.py
│ │ │ │ │ └── hf_autoj_scen_classifier.py
│ │ │ │ ├── judgelm/
│ │ │ │ │ ├── hf_judgelm_13b_v1.py
│ │ │ │ │ ├── hf_judgelm_33b_v1.py
│ │ │ │ │ └── hf_judgelm_7b_v1.py
│ │ │ │ └── pandalm/
│ │ │ │ ├── hf_alpaca_pandalm_7b_v1.py
│ │ │ │ └── hf_pandalm_7b_v1.py
│ │ │ ├── lemur/
│ │ │ │ └── lemur_70b_chat.py
│ │ │ ├── lingowhale/
│ │ │ │ └── hf_lingowhale_8b.py
│ │ │ ├── mistral/
│ │ │ │ ├── hf_ministral_8b_instruct_2410.py
│ │ │ │ ├── hf_mistral_7b_instruct_v0_1.py
│ │ │ │ ├── hf_mistral_7b_instruct_v0_2.py
│ │ │ │ ├── hf_mistral_7b_instruct_v0_3.py
│ │ │ │ ├── hf_mistral_7b_v0_1.py
│ │ │ │ ├── hf_mistral_7b_v0_2.py
│ │ │ │ ├── hf_mistral_7b_v0_3.py
│ │ │ │ ├── hf_mistral_nemo_instruct_2407.py
│ │ │ │ ├── hf_mistral_small_instruct_2409.py
│ │ │ │ ├── hf_mixtral_8x22b_instruct_v0_1.py
│ │ │ │ ├── hf_mixtral_8x22b_v0_1.py
│ │ │ │ ├── hf_mixtral_8x7b_instruct_v0_1.py
│ │ │ │ ├── hf_mixtral_8x7b_v0_1.py
│ │ │ │ ├── lmdeploy_ministral_8b_instruct_2410.py
│ │ │ │ ├── lmdeploy_mistral_7b_instruct_v0_3.py
│ │ │ │ ├── lmdeploy_mistral_large_instruct_2411.py
│ │ │ │ ├── lmdeploy_mistral_nemo_instruct_2407.py
│ │ │ │ ├── lmdeploy_mistral_small_instruct_2409.py
│ │ │ │ ├── lmdeploy_mixtral_8x22b_instruct_v0_1.py
│ │ │ │ ├── lmdeploy_mixtral_large_instruct_2407.py
│ │ │ │ ├── mixtral_8x7b_32k.py
│ │ │ │ ├── vllm_mistral_7b_instruct_v0_1.py
│ │ │ │ ├── vllm_mistral_7b_instruct_v0_2.py
│ │ │ │ ├── vllm_mistral_7b_v0_1.py
│ │ │ │ ├── vllm_mistral_7b_v0_2.py
│ │ │ │ ├── vllm_mixtral_8x22b_instruct_v0_1.py
│ │ │ │ ├── vllm_mixtral_8x22b_v0_1.py
│ │ │ │ ├── vllm_mixtral_8x7b_instruct_v0_1.py
│ │ │ │ ├── vllm_mixtral_8x7b_v0_1.py
│ │ │ │ └── vllm_mixtral_large_instruct_2407.py
│ │ │ ├── moonshot/
│ │ │ │ ├── kimi_k2.py
│ │ │ │ └── kimi_k2_streaming.py
│ │ │ ├── moss/
│ │ │ │ ├── hf_moss_moon_003_base.py
│ │ │ │ └── hf_moss_moon_003_sft.py
│ │ │ ├── mpt/
│ │ │ │ ├── hf_mpt_7b.py
│ │ │ │ └── hf_mpt_instruct_7b.py
│ │ │ ├── ms_internlm/
│ │ │ │ └── ms_internlm_chat_7b_8k.py
│ │ │ ├── nanbeige/
│ │ │ │ ├── hf_nanbeige2_16b_chat.py
│ │ │ │ ├── hf_nanbeige2_8b_chat.py
│ │ │ │ └── hf_nanbeige_16b_chat.py
│ │ │ ├── nvidia/
│ │ │ │ └── lmdeploy_nemotron_70b_instruct_hf.py
│ │ │ ├── openai/
│ │ │ │ ├── gpt_3_5_turbo.py
│ │ │ │ ├── gpt_3_5_turbo_0125.py
│ │ │ │ ├── gpt_4.py
│ │ │ │ ├── gpt_4o_2024_05_13.py
│ │ │ │ ├── o1_mini_2024_09_12.py
│ │ │ │ └── o1_preview_2024_09_12.py
│ │ │ ├── openbmb/
│ │ │ │ ├── hf_minicpm3_4b.py
│ │ │ │ ├── hf_minicpm_2b_dpo_fp32.py
│ │ │ │ ├── hf_minicpm_2b_sft_bf16.py
│ │ │ │ └── hf_minicpm_2b_sft_fp32.py
│ │ │ ├── opt/
│ │ │ │ ├── hf_opt_125m.py
│ │ │ │ └── hf_opt_350m.py
│ │ │ ├── others/
│ │ │ │ ├── hf_abel_7b_001.py
│ │ │ │ ├── hf_abel_7b_002.py
│ │ │ │ ├── hf_arithmo_mistral_7b.py
│ │ │ │ ├── hf_command_r_plus.py
│ │ │ │ ├── hf_dbrx_base.py
│ │ │ │ ├── hf_dbrx_instruct.py
│ │ │ │ ├── hf_dolphin_21_mistral_7b.py
│ │ │ │ ├── hf_fashiongpt_70b_v11.py
│ │ │ │ ├── hf_gsm8k_rft_llama7b2_u13b.py
│ │ │ │ ├── hf_metamath_7b_v1_0.py
│ │ │ │ ├── hf_metamath_llemma_7b.py
│ │ │ │ ├── hf_metamath_mistral_7b.py
│ │ │ │ ├── hf_openchat_35_0106.py
│ │ │ │ ├── hf_openchat_35_1210.py
│ │ │ │ ├── hf_orionstar_14b_base.py
│ │ │ │ ├── hf_orionstar_yi_34b_chat.py
│ │ │ │ ├── hf_phi_2.py
│ │ │ │ ├── hf_telechat_12b_v2.py
│ │ │ │ ├── hf_telechat_52b.py
│ │ │ │ ├── hf_telechat_7b.py
│ │ │ │ ├── hf_yayi2_30b_base.py
│ │ │ │ ├── vllm_dbrx_instruct.py
│ │ │ │ └── vllm_orionstar_14b_longchat.py
│ │ │ ├── phi/
│ │ │ │ ├── hf_phi_3_5_MoE_instruct.py
│ │ │ │ ├── hf_phi_3_5_mini_instruct.py
│ │ │ │ ├── hf_phi_3_medium_4k_instruct.py
│ │ │ │ ├── hf_phi_3_mini_4k_instruct.py
│ │ │ │ ├── hf_phi_3_small_8k_instruct.py
│ │ │ │ └── hf_phi_4.py
│ │ │ ├── pulse/
│ │ │ │ └── hf_pulse_7b.py
│ │ │ ├── qwen/
│ │ │ │ ├── README.md
│ │ │ │ ├── hf_qwen1_5_0_5b.py
│ │ │ │ ├── hf_qwen1_5_0_5b_chat.py
│ │ │ │ ├── hf_qwen1_5_110b.py
│ │ │ │ ├── hf_qwen1_5_110b_chat.py
│ │ │ │ ├── hf_qwen1_5_14b.py
│ │ │ │ ├── hf_qwen1_5_14b_chat.py
│ │ │ │ ├── hf_qwen1_5_1_8b.py
│ │ │ │ ├── hf_qwen1_5_1_8b_chat.py
│ │ │ │ ├── hf_qwen1_5_32b.py
│ │ │ │ ├── hf_qwen1_5_32b_chat.py
│ │ │ │ ├── hf_qwen1_5_4b.py
│ │ │ │ ├── hf_qwen1_5_4b_chat.py
│ │ │ │ ├── hf_qwen1_5_72b.py
│ │ │ │ ├── hf_qwen1_5_72b_chat.py
│ │ │ │ ├── hf_qwen1_5_7b.py
│ │ │ │ ├── hf_qwen1_5_7b_chat.py
│ │ │ │ ├── hf_qwen1_5_moe_a2_7b.py
│ │ │ │ ├── hf_qwen1_5_moe_a2_7b_chat.py
│ │ │ │ ├── hf_qwen2_0_5b.py
│ │ │ │ ├── hf_qwen2_0_5b_instruct.py
│ │ │ │ ├── hf_qwen2_1_5b.py
│ │ │ │ ├── hf_qwen2_1_5b_instruct.py
│ │ │ │ ├── hf_qwen2_57b_a14b.py
│ │ │ │ ├── hf_qwen2_72b.py
│ │ │ │ ├── hf_qwen2_7b.py
│ │ │ │ ├── hf_qwen2_7b_instruct.py
│ │ │ │ ├── hf_qwen_14b.py
│ │ │ │ ├── hf_qwen_14b_chat.py
│ │ │ │ ├── hf_qwen_1_8b.py
│ │ │ │ ├── hf_qwen_1_8b_chat.py
│ │ │ │ ├── hf_qwen_72b.py
│ │ │ │ ├── hf_qwen_72b_chat.py
│ │ │ │ ├── hf_qwen_7b.py
│ │ │ │ ├── hf_qwen_7b_chat.py
│ │ │ │ ├── lmdeploy_qwen1_5_110b.py
│ │ │ │ ├── lmdeploy_qwen1_5_110b_chat.py
│ │ │ │ ├── lmdeploy_qwen1_5_14b.py
│ │ │ │ ├── lmdeploy_qwen1_5_14b_chat.py
│ │ │ │ ├── lmdeploy_qwen1_5_1_8b.py
│ │ │ │ ├── lmdeploy_qwen1_5_1_8b_chat.py
│ │ │ │ ├── lmdeploy_qwen1_5_32b.py
│ │ │ │ ├── lmdeploy_qwen1_5_32b_chat.py
│ │ │ │ ├── lmdeploy_qwen1_5_4b.py
│ │ │ │ ├── lmdeploy_qwen1_5_4b_chat.py
│ │ │ │ ├── lmdeploy_qwen1_5_72b.py
│ │ │ │ ├── lmdeploy_qwen1_5_72b_chat.py
│ │ │ │ ├── lmdeploy_qwen1_5_7b.py
│ │ │ │ ├── lmdeploy_qwen1_5_7b_chat.py
│ │ │ │ ├── lmdeploy_qwen1_5_series.py
│ │ │ │ ├── lmdeploy_qwen2_1_5b.py
│ │ │ │ ├── lmdeploy_qwen2_1_5b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_72b.py
│ │ │ │ ├── lmdeploy_qwen2_72b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_7b.py
│ │ │ │ ├── lmdeploy_qwen2_7b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_series.py
│ │ │ │ ├── lmdeploy_qwen_14b.py
│ │ │ │ ├── lmdeploy_qwen_14b_chat.py
│ │ │ │ ├── lmdeploy_qwen_1_8b.py
│ │ │ │ ├── lmdeploy_qwen_1_8b_chat.py
│ │ │ │ ├── lmdeploy_qwen_72b.py
│ │ │ │ ├── lmdeploy_qwen_72b_chat.py
│ │ │ │ ├── lmdeploy_qwen_7b.py
│ │ │ │ ├── lmdeploy_qwen_7b_chat.py
│ │ │ │ ├── lmdeploy_qwen_series.py
│ │ │ │ ├── ms_qwen_7b_chat.py
│ │ │ │ ├── vllm_qwen1_5_0_5b.py
│ │ │ │ ├── vllm_qwen1_5_0_5b_chat.py
│ │ │ │ ├── vllm_qwen1_5_110b.py
│ │ │ │ ├── vllm_qwen1_5_110b_chat.py
│ │ │ │ ├── vllm_qwen1_5_14b.py
│ │ │ │ ├── vllm_qwen1_5_14b_chat.py
│ │ │ │ ├── vllm_qwen1_5_1_8b.py
│ │ │ │ ├── vllm_qwen1_5_1_8b_chat.py
│ │ │ │ ├── vllm_qwen1_5_32b.py
│ │ │ │ ├── vllm_qwen1_5_32b_chat.py
│ │ │ │ ├── vllm_qwen1_5_4b.py
│ │ │ │ ├── vllm_qwen1_5_4b_chat.py
│ │ │ │ ├── vllm_qwen1_5_72b.py
│ │ │ │ ├── vllm_qwen1_5_72b_chat.py
│ │ │ │ ├── vllm_qwen1_5_7b.py
│ │ │ │ ├── vllm_qwen1_5_7b_chat.py
│ │ │ │ ├── vllm_qwen1_5_moe_a2_7b.py
│ │ │ │ ├── vllm_qwen1_5_moe_a2_7b_chat.py
│ │ │ │ ├── vllm_qwen1_5_series.py
│ │ │ │ ├── vllm_qwen2_0_5b.py
│ │ │ │ ├── vllm_qwen2_0_5b_instruct.py
│ │ │ │ ├── vllm_qwen2_1_5b.py
│ │ │ │ ├── vllm_qwen2_1_5b_instruct.py
│ │ │ │ ├── vllm_qwen2_57b_a14b_instruct.py
│ │ │ │ ├── vllm_qwen2_72b.py
│ │ │ │ ├── vllm_qwen2_72b_instruct.py
│ │ │ │ ├── vllm_qwen2_7b.py
│ │ │ │ ├── vllm_qwen2_7b_instruct.py
│ │ │ │ ├── vllm_qwen2_series.py
│ │ │ │ ├── vllm_qwen_14b.py
│ │ │ │ ├── vllm_qwen_14b_chat.py
│ │ │ │ ├── vllm_qwen_1_8b.py
│ │ │ │ ├── vllm_qwen_1_8b_chat.py
│ │ │ │ ├── vllm_qwen_72b.py
│ │ │ │ ├── vllm_qwen_72b_chat.py
│ │ │ │ ├── vllm_qwen_7b.py
│ │ │ │ ├── vllm_qwen_7b_chat.py
│ │ │ │ └── vllm_qwen_series.py
│ │ │ ├── qwen2_5/
│ │ │ │ ├── hf_qwen2_5_0_5b_instruct.py
│ │ │ │ ├── hf_qwen2_5_14b_instruct.py
│ │ │ │ ├── hf_qwen2_5_1_5b_instruct.py
│ │ │ │ ├── hf_qwen2_5_32b_instruct.py
│ │ │ │ ├── hf_qwen2_5_3b_instruct.py
│ │ │ │ ├── hf_qwen2_5_72b_instruct.py
│ │ │ │ ├── hf_qwen2_5_7b_instruct.py
│ │ │ │ ├── hf_qwen_2_5_14b.py
│ │ │ │ ├── hf_qwen_2_5_32b.py
│ │ │ │ ├── hf_qwen_2_5_7b.py
│ │ │ │ ├── lmdeploy_qwen2_5_0_5b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_5_14b.py
│ │ │ │ ├── lmdeploy_qwen2_5_14b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_5_1_5b.py
│ │ │ │ ├── lmdeploy_qwen2_5_1_5b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_5_32b.py
│ │ │ │ ├── lmdeploy_qwen2_5_32b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_5_3b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_5_72b.py
│ │ │ │ ├── lmdeploy_qwen2_5_72b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_5_7b.py
│ │ │ │ ├── lmdeploy_qwen2_5_7b_instruct.py
│ │ │ │ ├── vllm_qwen2_5_0_5b_instruct.py
│ │ │ │ ├── vllm_qwen2_5_14b_instruct.py
│ │ │ │ ├── vllm_qwen2_5_14b_instruct_128k.py
│ │ │ │ ├── vllm_qwen2_5_1_5b_instruct.py
│ │ │ │ ├── vllm_qwen2_5_32b_instruct.py
│ │ │ │ ├── vllm_qwen2_5_32b_instruct_128k.py
│ │ │ │ ├── vllm_qwen2_5_3b_instruct.py
│ │ │ │ ├── vllm_qwen2_5_72b_instruct.py
│ │ │ │ ├── vllm_qwen2_5_72b_instruct_128k.py
│ │ │ │ ├── vllm_qwen2_5_7b_instruct.py
│ │ │ │ └── vllm_qwen2_5_7b_instruct_128k.py
│ │ │ ├── qwen3/
│ │ │ │ └── lmdeploy_qwen3_0_6b.py
│ │ │ ├── qwq/
│ │ │ │ ├── lmdeploy_qwq_32b.py
│ │ │ │ └── lmdeploy_qwq_32b_preview.py
│ │ │ ├── rwkv/
│ │ │ │ └── rwkv5_3b.py
│ │ │ ├── skywork/
│ │ │ │ ├── hf_skywork_13b.py
│ │ │ │ └── lmdeploy_skywork_o1_open_llama3_1_8b_instruct.py
│ │ │ ├── telechat/
│ │ │ │ ├── telechat_thinking_streaming_v1.py
│ │ │ │ └── telechat_thinking_v1.py
│ │ │ ├── tigerbot/
│ │ │ │ ├── hf_tigerbot_13b_base_v1.py
│ │ │ │ ├── hf_tigerbot_13b_base_v2.py
│ │ │ │ ├── hf_tigerbot_13b_chat_v1.py
│ │ │ │ ├── hf_tigerbot_13b_chat_v2.py
│ │ │ │ ├── hf_tigerbot_70b_base.py
│ │ │ │ ├── hf_tigerbot_70b_chat_v2.py
│ │ │ │ ├── hf_tigerbot_70b_chat_v3.py
│ │ │ │ ├── hf_tigerbot_7b_base.py
│ │ │ │ ├── hf_tigerbot_7b_base_v3.py
│ │ │ │ ├── hf_tigerbot_7b_chat_v3.py
│ │ │ │ └── hf_tigerbot_7b_sft.py
│ │ │ ├── vicuna/
│ │ │ │ ├── hf_vicuna_13b_v13.py
│ │ │ │ ├── hf_vicuna_13b_v15.py
│ │ │ │ ├── hf_vicuna_13b_v15_16k.py
│ │ │ │ ├── hf_vicuna_33b_v13.py
│ │ │ │ ├── hf_vicuna_7b_v13.py
│ │ │ │ ├── hf_vicuna_7b_v15.py
│ │ │ │ ├── hf_vicuna_7b_v15_16k.py
│ │ │ │ ├── vllm_vicuna_13b_v15_16k.py
│ │ │ │ └── vllm_vicuna_7b_v15_16k.py
│ │ │ ├── wizardcoder/
│ │ │ │ ├── hf_wizardcoder_15b.py
│ │ │ │ ├── hf_wizardcoder_1b.py
│ │ │ │ ├── hf_wizardcoder_3b.py
│ │ │ │ ├── hf_wizardcoder_python_13b.py
│ │ │ │ └── hf_wizardcoder_python_34b.py
│ │ │ ├── wizardlm/
│ │ │ │ ├── hf_wizardlm_13b_v1_2.py
│ │ │ │ ├── hf_wizardlm_70b_v1_0.py
│ │ │ │ ├── hf_wizardlm_7b_v1_0.py
│ │ │ │ ├── hf_wizardmath_7b_v1_0.py
│ │ │ │ ├── hf_wizardmath_7b_v1_1.py
│ │ │ │ ├── vllm_wizardlm_13b_v1_2.py
│ │ │ │ ├── vllm_wizardlm_70b_v1_0.py
│ │ │ │ └── vllm_wizardlm_7b_v1_0.py
│ │ │ ├── yi/
│ │ │ │ ├── hf_yi_1_5_34b.py
│ │ │ │ ├── hf_yi_1_5_34b_chat.py
│ │ │ │ ├── hf_yi_1_5_6b.py
│ │ │ │ ├── hf_yi_1_5_6b_chat.py
│ │ │ │ ├── hf_yi_1_5_9b.py
│ │ │ │ ├── hf_yi_1_5_9b_chat.py
│ │ │ │ ├── hf_yi_34b.py
│ │ │ │ ├── hf_yi_34b_chat.py
│ │ │ │ ├── hf_yi_6b.py
│ │ │ │ ├── hf_yi_6b_chat.py
│ │ │ │ ├── lmdeploy_yi_1_5_34b_chat.py
│ │ │ │ ├── lmdeploy_yi_1_5_6b_chat.py
│ │ │ │ ├── lmdeploy_yi_1_5_9b.py
│ │ │ │ ├── lmdeploy_yi_1_5_9b_chat.py
│ │ │ │ ├── lmdeploy_yi_34b_chat.py
│ │ │ │ ├── lmdeploy_yi_6b_chat.py
│ │ │ │ └── lmdeploy_yi_series.py
│ │ │ └── zephyr/
│ │ │ ├── hf_zephyr_7b_beta.py
│ │ │ └── vllm_zephyr_7b_beta.py
│ │ └── summarizers/
│ │ ├── OlympiadBench.py
│ │ ├── PMMEval.py
│ │ ├── agent_bench.py
│ │ ├── charm_reason.py
│ │ ├── chat_OC15.py
│ │ ├── chat_OC15_multi_faceted.py
│ │ ├── cibench.py
│ │ ├── code_passk.py
│ │ ├── compassbench_v1_1_objective.py
│ │ ├── compassbench_v1_1_objective_public.py
│ │ ├── compassbench_v1_3_objective.py
│ │ ├── compassbench_v1_objective.py
│ │ ├── contamination.py
│ │ ├── example.py
│ │ ├── groups/
│ │ │ ├── GaokaoBench.py
│ │ │ ├── MMLUArabic.py
│ │ │ ├── OlympiadBench.py
│ │ │ ├── PHYSICS.py
│ │ │ ├── PMMEval.py
│ │ │ ├── agieval.py
│ │ │ ├── babilong.py
│ │ │ ├── bbeh.py
│ │ │ ├── bbh.py
│ │ │ ├── biodata.py
│ │ │ ├── calm.py
│ │ │ ├── ceval.py
│ │ │ ├── charm_reason.py
│ │ │ ├── cibench.py
│ │ │ ├── cmmlu.py
│ │ │ ├── ds1000.py
│ │ │ ├── flores.py
│ │ │ ├── humanevalx.py
│ │ │ ├── infinitebench.py
│ │ │ ├── jigsaw_multilingual.py
│ │ │ ├── korbench.py
│ │ │ ├── lawbench.py
│ │ │ ├── lcbench.py
│ │ │ ├── legacy/
│ │ │ │ └── cibench.py
│ │ │ ├── leval.py
│ │ │ ├── longbench.py
│ │ │ ├── lveval.py
│ │ │ ├── mathbench.py
│ │ │ ├── mathbench_2024.py
│ │ │ ├── mathbench_agent.py
│ │ │ ├── mathbench_v1.py
│ │ │ ├── mathbench_v1_2024.py
│ │ │ ├── mathbench_v1_2024_lang.py
│ │ │ ├── mgsm.py
│ │ │ ├── mmlu.py
│ │ │ ├── mmlu_cf.py
│ │ │ ├── mmlu_pro.py
│ │ │ ├── mmmlu.py
│ │ │ ├── multipl_e.py
│ │ │ ├── musr_average.py
│ │ │ ├── plugineval.py
│ │ │ ├── ruler.py
│ │ │ ├── scibench.py
│ │ │ ├── scicode.py
│ │ │ ├── supergpqa.py
│ │ │ ├── teval.py
│ │ │ ├── tydiqa.py
│ │ │ └── xiezhi.py
│ │ ├── infinitebench.py
│ │ ├── internlm2_keyset.py
│ │ ├── judgedataset_all.py
│ │ ├── judgerbenchv2.py
│ │ ├── lawbench.py
│ │ ├── leaderboard.py
│ │ ├── leval.py
│ │ ├── longbench.py
│ │ ├── longeval_v2.py
│ │ ├── lveval.py
│ │ ├── math_agent.py
│ │ ├── math_baseline.py
│ │ ├── mathbench.py
│ │ ├── mathbench_v1.py
│ │ ├── medium.py
│ │ ├── mmlu_cf.py
│ │ ├── mmlu_pro.py
│ │ ├── mmmlu.py
│ │ ├── mmmlu_lite.py
│ │ ├── needlebench.py
│ │ ├── plugineval.py
│ │ ├── rewardbench.py
│ │ ├── ruler.py
│ │ ├── scicode.py
│ │ ├── scireasoner.py
│ │ ├── simpleqa.py
│ │ ├── small.py
│ │ ├── subjective.py
│ │ ├── teval.py
│ │ └── tiny.py
│ ├── datasets/
│ │ ├── CARDBiomedBench.py
│ │ ├── ClinicBench.py
│ │ ├── Earth_Silver.py
│ │ ├── FinanceIQ.py
│ │ ├── GaokaoBench.py
│ │ ├── IFBench/
│ │ │ ├── evaluation_lib.py
│ │ │ ├── ifbench.py
│ │ │ ├── instructions.py
│ │ │ ├── instructions_registry.py
│ │ │ └── instructions_util.py
│ │ ├── IFEval/
│ │ │ ├── __init__.py
│ │ │ ├── evaluation_main.py
│ │ │ ├── ifeval.py
│ │ │ ├── instructions.py
│ │ │ ├── instructions_registry.py
│ │ │ └── instructions_util.py
│ │ ├── LCBench.py
│ │ ├── MMLUArabic.py
│ │ ├── MedCalc_Bench.py
│ │ ├── MedQA.py
│ │ ├── MedXpertQA.py
│ │ ├── Medbullets.py
│ │ ├── NPHardEval/
│ │ │ ├── __init__.py
│ │ │ ├── cmp_GCP_D.py
│ │ │ ├── cmp_KSP.py
│ │ │ ├── cmp_TSP_D.py
│ │ │ ├── hard_GCP.py
│ │ │ ├── hard_MSP.py
│ │ │ ├── hard_TSP.py
│ │ │ ├── p_BSP.py
│ │ │ ├── p_EDP.py
│ │ │ ├── p_SPP.py
│ │ │ ├── prompts.py
│ │ │ └── utils.py
│ │ ├── OlympiadBench.py
│ │ ├── OpenFinData.py
│ │ ├── OpenSWI.py
│ │ ├── PI_LLM.py
│ │ ├── PMMEval/
│ │ │ ├── __init__.py
│ │ │ ├── flores.py
│ │ │ ├── humanevalxl.py
│ │ │ ├── mgsm.py
│ │ │ ├── mhellaswag.py
│ │ │ ├── mifeval.py
│ │ │ ├── mifeval_utils/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── combination_checker.py
│ │ │ │ ├── detectable_content_checker.py
│ │ │ │ ├── detectable_format_checker.py
│ │ │ │ ├── keywords_checker.py
│ │ │ │ ├── length_constraints_checker.py
│ │ │ │ ├── punctuation_checker.py
│ │ │ │ └── startend_checker.py
│ │ │ ├── mlogiqa.py
│ │ │ ├── mmmlu.py
│ │ │ └── xnli.py
│ │ ├── ProcessBench.py
│ │ ├── ProteinLMBench.py
│ │ ├── PubMedQA.py
│ │ ├── QuALITY.py
│ │ ├── SciEval.py
│ │ ├── SciKnowEval.py
│ │ ├── SciReasoner/
│ │ │ ├── GUE.py
│ │ │ ├── LLM4Chem/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── config.py
│ │ │ │ ├── evaluator.py
│ │ │ │ ├── retrosynthesis_evaluator.py
│ │ │ │ └── utils/
│ │ │ │ ├── __input__.py
│ │ │ │ ├── chat_generation.py
│ │ │ │ ├── core_tagger.py
│ │ │ │ ├── general_prompter.py
│ │ │ │ ├── metrics.py
│ │ │ │ └── smiles_canonicalization.py
│ │ │ ├── LLM4Mat.py
│ │ │ ├── Mol_Instructions/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── biotext.py
│ │ │ │ ├── molecule.py
│ │ │ │ ├── normalized_SW_score.py
│ │ │ │ └── protein.py
│ │ │ ├── PEER.py
│ │ │ ├── __init__.py
│ │ │ ├── bio_instruction.py
│ │ │ ├── bulk_modulus_material.py
│ │ │ ├── composition_material.py
│ │ │ ├── opi/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── config.py
│ │ │ │ ├── evaluator.py
│ │ │ │ ├── process_ec_numbers.py
│ │ │ │ └── utils/
│ │ │ │ ├── accuracy4fold_type.py
│ │ │ │ └── metrics4all.py
│ │ │ ├── uncond_RNA.py
│ │ │ ├── uncond_material.py
│ │ │ ├── unconditional_molecule_generation/
│ │ │ │ ├── UMG.py
│ │ │ │ └── __init__.py
│ │ │ └── unconditional_protein_generation/
│ │ │ ├── UPG.py
│ │ │ ├── __init__.py
│ │ │ ├── main.py
│ │ │ └── omegafold/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── confidence.py
│ │ │ ├── config.py
│ │ │ ├── decode.py
│ │ │ ├── embedders.py
│ │ │ ├── geoformer.py
│ │ │ ├── model.py
│ │ │ ├── modules.py
│ │ │ ├── omegaplm.py
│ │ │ ├── pipeline.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ ├── protein_utils/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── aaframe.py
│ │ │ │ ├── functions.py
│ │ │ │ └── residue_constants.py
│ │ │ └── torch_utils.py
│ │ ├── ScienceQA.py
│ │ ├── SeedBench.py
│ │ ├── TheoremQA/
│ │ │ ├── __init__.py
│ │ │ ├── legacy.py
│ │ │ ├── main.py
│ │ │ ├── number_utils.py
│ │ │ └── utils.py
│ │ ├── __init__.py
│ │ ├── advglue.py
│ │ ├── afqmcd.py
│ │ ├── agieval/
│ │ │ ├── __init__.py
│ │ │ ├── agieval.py
│ │ │ ├── constructions.py
│ │ │ ├── dataset_loader.py
│ │ │ ├── evaluation.py
│ │ │ ├── math_equivalence.py
│ │ │ ├── post_process.py
│ │ │ └── utils.py
│ │ ├── aime2024.py
│ │ ├── anli.py
│ │ ├── anthropics_evals.py
│ │ ├── apps.py
│ │ ├── arc.py
│ │ ├── arc_prize_public_evaluation.py
│ │ ├── atlas/
│ │ │ ├── dataset_loader.py
│ │ │ ├── evaluation.py
│ │ │ └── prompt.py
│ │ ├── ax.py
│ │ ├── babilong/
│ │ │ ├── __init__.py
│ │ │ ├── babilong.py
│ │ │ ├── babilong_utils.py
│ │ │ └── prompts.py
│ │ ├── base.py
│ │ ├── bbeh.py
│ │ ├── bbh.py
│ │ ├── benbench.py
│ │ ├── beyondaime.py
│ │ ├── bigcodebench/
│ │ │ ├── __init__.py
│ │ │ ├── bigcodebench.py
│ │ │ └── extractor.py
│ │ ├── biodata.py
│ │ ├── boolq.py
│ │ ├── bustum.py
│ │ ├── c3.py
│ │ ├── calm/
│ │ │ ├── __init__.py
│ │ │ ├── calm.py
│ │ │ ├── data_processing/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── generate_questions.py
│ │ │ │ ├── prompt/
│ │ │ │ │ ├── AC-B_causal_judgement.py
│ │ │ │ │ ├── AR-B_CaLM-AR.py
│ │ │ │ │ ├── ATE.py
│ │ │ │ │ ├── BAS-B_backadj.py
│ │ │ │ │ ├── BAS-C_max-BAS.py
│ │ │ │ │ ├── BAS-C_min-BAS.py
│ │ │ │ │ ├── BAS-C_mix-BAS.py
│ │ │ │ │ ├── CA-B_FA.py
│ │ │ │ │ ├── CA-B_FP.py
│ │ │ │ │ ├── CB-B_collider-bias.py
│ │ │ │ │ ├── CDE.py
│ │ │ │ │ ├── CEG-O_E-CARE.py
│ │ │ │ │ ├── CEI-B.py
│ │ │ │ │ ├── CORR-B_correlation.py
│ │ │ │ │ ├── CR-B_det-counterfactual.py
│ │ │ │ │ ├── CR-C_CRASS.py
│ │ │ │ │ ├── EAE-B_exp-away.py
│ │ │ │ │ ├── ECI-B_CTB.py
│ │ │ │ │ ├── ECI-B_ESC.py
│ │ │ │ │ ├── ECI-B_MAVEN-ERE.py
│ │ │ │ │ ├── ETT.py
│ │ │ │ │ ├── FAS-C_FAS.py
│ │ │ │ │ ├── IV-C_CaLM-IV.py
│ │ │ │ │ ├── NDE.py
│ │ │ │ │ ├── NIE.py
│ │ │ │ │ ├── PCD-B_COPA.py
│ │ │ │ │ ├── PCD-B_E-CARE.py
│ │ │ │ │ ├── PCD-C_COPA.py
│ │ │ │ │ ├── PCD-C_E-CARE.py
│ │ │ │ │ ├── PN.py
│ │ │ │ │ └── PS.py
│ │ │ │ └── task_hiearchy.py
│ │ │ ├── evaluation/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── accuracy/
│ │ │ │ │ ├── choice.py
│ │ │ │ │ ├── open-ended.py
│ │ │ │ │ └── prob.py
│ │ │ │ ├── core_metrics.py
│ │ │ │ ├── error/
│ │ │ │ │ └── basic_adversarial/
│ │ │ │ │ ├── AC-B_causal_judgement.py
│ │ │ │ │ ├── AR-B_CaLM-AR.py
│ │ │ │ │ ├── AS.py
│ │ │ │ │ ├── CA-B.py
│ │ │ │ │ ├── CEI-B.py
│ │ │ │ │ ├── CLADDER.py
│ │ │ │ │ ├── CR-C_CRASS.py
│ │ │ │ │ ├── ECI.py
│ │ │ │ │ ├── Natural.py
│ │ │ │ │ ├── PCD-B.py
│ │ │ │ │ ├── PCD-C.py
│ │ │ │ │ └── Probability.py
│ │ │ │ ├── errors.py
│ │ │ │ └── labeling/
│ │ │ │ ├── AC-B_causal_judgement.py
│ │ │ │ ├── AR-B_CaLM-AR.py
│ │ │ │ ├── AS.py
│ │ │ │ ├── CA-B_FA.py
│ │ │ │ ├── CA-B_FP.py
│ │ │ │ ├── CEG-O_E-CARE.py
│ │ │ │ ├── CEI-B.py
│ │ │ │ ├── CLADDER.py
│ │ │ │ ├── CR-C_CRASS.py
│ │ │ │ ├── ECI.py
│ │ │ │ ├── Natural.py
│ │ │ │ ├── PCD-B.py
│ │ │ │ ├── PCD-C.py
│ │ │ │ ├── Probability.py
│ │ │ │ └── common_answers.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ └── load_items.py
│ │ ├── cb.py
│ │ ├── ceval.py
│ │ ├── charm.py
│ │ ├── chatml/
│ │ │ ├── __init__.py
│ │ │ ├── chatml.py
│ │ │ └── verification.py
│ │ ├── chem_exam.py
│ │ ├── chembench.py
│ │ ├── chid.py
│ │ ├── chinese_simpleqa.py
│ │ ├── cibench.py
│ │ ├── circular.py
│ │ ├── civilcomments.py
│ │ ├── climaqa.py
│ │ ├── clozeTest_maxmin.py
│ │ ├── cluewsc.py
│ │ ├── cmb.py
│ │ ├── cmmlu.py
│ │ ├── cmnli.py
│ │ ├── cmo_fib.py
│ │ ├── cmphysbench/
│ │ │ ├── SEED/
│ │ │ │ ├── README.md
│ │ │ │ ├── SEED.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── extended_zss.py
│ │ │ │ ├── latex_pre_process.py
│ │ │ │ └── test.py
│ │ │ ├── __init__.py
│ │ │ └── cmphysbench.py
│ │ ├── cmrc.py
│ │ ├── codecompass/
│ │ │ ├── CodeCompass.py
│ │ │ ├── __init__.py
│ │ │ ├── codecompass_runner.py
│ │ │ ├── evaluator.py
│ │ │ ├── executor.py
│ │ │ ├── metrics.py
│ │ │ └── utils.py
│ │ ├── commonsenseqa.py
│ │ ├── commonsenseqa_cn.py
│ │ ├── compassbench_obj.py
│ │ ├── copa.py
│ │ ├── crowspairs.py
│ │ ├── crowspairs_cn.py
│ │ ├── csl.py
│ │ ├── custom.py
│ │ ├── cvalues.py
│ │ ├── dingo.py
│ │ ├── drcd.py
│ │ ├── drop.py
│ │ ├── drop_simple_eval.py
│ │ ├── ds1000.py
│ │ ├── ds1000_interpreter.py
│ │ ├── eese/
│ │ │ ├── eese.py
│ │ │ ├── eese_postprocessors.py
│ │ │ └── utils.py
│ │ ├── eprstmt.py
│ │ ├── flores.py
│ │ ├── game24.py
│ │ ├── gaokao_math.py
│ │ ├── generic.py
│ │ ├── govrepcrs.py
│ │ ├── gpqa.py
│ │ ├── gsm8k.py
│ │ ├── gsm_hard.py
│ │ ├── healthbench/
│ │ │ ├── healthbench.py
│ │ │ ├── sampler/
│ │ │ │ └── chat_completion_sampler.py
│ │ │ └── types.py
│ │ ├── hellaswag.py
│ │ ├── hle.py
│ │ ├── huggingface.py
│ │ ├── humaneval.py
│ │ ├── humaneval_multi.py
│ │ ├── humaneval_pro.py
│ │ ├── humanevalx.py
│ │ ├── hungarian_math.py
│ │ ├── inference_ppl.py
│ │ ├── infinitebench/
│ │ │ ├── __init__.py
│ │ │ ├── infinitebench_codedebug.py
│ │ │ ├── infinitebench_coderun.py
│ │ │ ├── infinitebench_endia.py
│ │ │ ├── infinitebench_enmc.py
│ │ │ ├── infinitebench_enqa.py
│ │ │ ├── infinitebench_ensum.py
│ │ │ ├── infinitebench_mathcalc.py
│ │ │ ├── infinitebench_mathfind.py
│ │ │ ├── infinitebench_retrievekv.py
│ │ │ ├── infinitebench_retrievenumber.py
│ │ │ ├── infinitebench_retrievepasskey.py
│ │ │ ├── infinitebench_zhqa.py
│ │ │ └── utils.py
│ │ ├── internsandbox.py
│ │ ├── iwslt2017.py
│ │ ├── jigsawmultilingual.py
│ │ ├── jsonl.py
│ │ ├── judge/
│ │ │ ├── __init__.py
│ │ │ ├── judgebench.py
│ │ │ ├── judgerbenchv2.py
│ │ │ ├── rewardbench.py
│ │ │ └── rmb.py
│ │ ├── kaoshi.py
│ │ ├── kcle.py
│ │ ├── korbench/
│ │ │ ├── __init__.py
│ │ │ ├── korbench.py
│ │ │ ├── korbench_dataset_config/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── config.yaml
│ │ │ │ ├── config_wrapper.py
│ │ │ │ └── prompt/
│ │ │ │ ├── 0_shot.yaml
│ │ │ │ ├── 3_shot.yaml
│ │ │ │ ├── __init__.py
│ │ │ │ ├── mixed.yaml
│ │ │ │ ├── self-correction.yaml
│ │ │ │ └── trick.yaml
│ │ │ └── korbench_utils.py
│ │ ├── lambada.py
│ │ ├── lawbench/
│ │ │ ├── __init__.py
│ │ │ ├── evaluation_functions/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cjft.py
│ │ │ │ ├── flzx.py
│ │ │ │ ├── ftcs.py
│ │ │ │ ├── jdzy.py
│ │ │ │ ├── jec_ac.py
│ │ │ │ ├── jec_kd.py
│ │ │ │ ├── jetq.py
│ │ │ │ ├── lblj.py
│ │ │ │ ├── ljp_accusation.py
│ │ │ │ ├── ljp_article.py
│ │ │ │ ├── ljp_imprison.py
│ │ │ │ ├── sjjc.py
│ │ │ │ ├── wbfl.py
│ │ │ │ ├── wsjd.py
│ │ │ │ ├── xxcq.py
│ │ │ │ ├── ydlj.py
│ │ │ │ ├── yqzy.py
│ │ │ │ └── zxfl.py
│ │ │ ├── lawbench.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ ├── char_smi.py
│ │ │ ├── compare_m2_for_evaluation.py
│ │ │ ├── comprehension_scores.py
│ │ │ ├── function_utils.py
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── alignment.py
│ │ │ │ ├── annotator.py
│ │ │ │ ├── classifier.py
│ │ │ │ ├── merger.py
│ │ │ │ ├── tokenization.py
│ │ │ │ └── tokenizer.py
│ │ │ ├── parallel_to_m2.py
│ │ │ └── rc_f1.py
│ │ ├── lcsts.py
│ │ ├── leval/
│ │ │ ├── __init__.py
│ │ │ ├── evaluators.py
│ │ │ ├── leval_coursera.py
│ │ │ ├── leval_financial_qa.py
│ │ │ ├── leval_gov_report_summ.py
│ │ │ ├── leval_gsm100.py
│ │ │ ├── leval_legal_contract_qa.py
│ │ │ ├── leval_meeting_summ.py
│ │ │ ├── leval_multidoc_qa.py
│ │ │ ├── leval_narrattive_qa.py
│ │ │ ├── leval_natural_question.py
│ │ │ ├── leval_news_summ.py
│ │ │ ├── leval_paper_assistant.py
│ │ │ ├── leval_patent_summ.py
│ │ │ ├── leval_quality.py
│ │ │ ├── leval_review_summ.py
│ │ │ ├── leval_scientific_qa.py
│ │ │ ├── leval_topic_retrieval.py
│ │ │ ├── leval_tpo.py
│ │ │ └── leval_tvshow_summ.py
│ │ ├── livecodebench/
│ │ │ ├── __init__.py
│ │ │ ├── evaluator.py
│ │ │ ├── execute_utils.py
│ │ │ ├── extract_utils.py
│ │ │ ├── livecodebench.py
│ │ │ ├── pass_k_utils.py
│ │ │ ├── prompts.py
│ │ │ └── testing_util.py
│ │ ├── livecodebench_pro/
│ │ │ ├── __init__.py
│ │ │ ├── livecodebench_pro.py
│ │ │ └── livecodebench_pro_evaluator.py
│ │ ├── livemathbench/
│ │ │ ├── __init__.py
│ │ │ ├── livemathbench.py
│ │ │ ├── prompts.py
│ │ │ └── utils.py
│ │ ├── livereasonbench/
│ │ │ ├── __init__.py
│ │ │ └── livereasonbench.py
│ │ ├── livestembench.py
│ │ ├── llm_compression.py
│ │ ├── lmeval.py
│ │ ├── longbench/
│ │ │ ├── __init__.py
│ │ │ ├── evaluators.py
│ │ │ ├── longbench_2wikim_qa.py
│ │ │ ├── longbench_dureader.py
│ │ │ ├── longbench_gov_report.py
│ │ │ ├── longbench_hotpot_qa.py
│ │ │ ├── longbench_lcc.py
│ │ │ ├── longbench_lsht.py
│ │ │ ├── longbench_multi_news.py
│ │ │ ├── longbench_multifieldqa_en.py
│ │ │ ├── longbench_multifieldqa_zh.py
│ │ │ ├── longbench_musique.py
│ │ │ ├── longbench_narrative_qa.py
│ │ │ ├── longbench_passage_count.py
│ │ │ ├── longbench_passage_retrieval_en.py
│ │ │ ├── longbench_passage_retrieval_zh.py
│ │ │ ├── longbench_qasper.py
│ │ │ ├── longbench_qmsum.py
│ │ │ ├── longbench_repobench.py
│ │ │ ├── longbench_samsum.py
│ │ │ ├── longbench_trec.py
│ │ │ ├── longbench_trivia_qa.py
│ │ │ └── longbench_vcsum.py
│ │ ├── longbenchv2.py
│ │ ├── lveval/
│ │ │ ├── __init__.py
│ │ │ ├── evaluators.py
│ │ │ ├── lveval_cmrc_mixup.py
│ │ │ ├── lveval_dureader_mixup.py
│ │ │ ├── lveval_factrecall_en.py
│ │ │ ├── lveval_factrecall_zh.py
│ │ │ ├── lveval_hotpotwikiqa_mixup.py
│ │ │ ├── lveval_lic_mixup.py
│ │ │ ├── lveval_loogle_CR_mixup.py
│ │ │ ├── lveval_loogle_MIR_mixup.py
│ │ │ ├── lveval_loogle_SD_mixup.py
│ │ │ ├── lveval_multifieldqa_en_mixup.py
│ │ │ └── lveval_multifieldqa_zh_mixup.py
│ │ ├── mastermath2024v1.py
│ │ ├── matbench/
│ │ │ ├── __init__.py
│ │ │ ├── matbench.py
│ │ │ └── post_process.py
│ │ ├── math.py
│ │ ├── math401.py
│ │ ├── math_intern.py
│ │ ├── mathbench.py
│ │ ├── mbpp.py
│ │ ├── mbpp_pro.py
│ │ ├── medbench/
│ │ │ ├── __init__.py
│ │ │ ├── constructions.py
│ │ │ ├── dataset_loader.py
│ │ │ ├── evaluation.py
│ │ │ ├── math_equivalence.py
│ │ │ ├── medbench.py
│ │ │ ├── post_process.py
│ │ │ └── utils.py
│ │ ├── medmcqa.py
│ │ ├── mgsm.py
│ │ ├── mmlu.py
│ │ ├── mmlu_cf.py
│ │ ├── mmlu_pro.py
│ │ ├── mmmlu.py
│ │ ├── mol_instructions_chem.py
│ │ ├── multipl_e.py
│ │ ├── multirc.py
│ │ ├── musr/
│ │ │ ├── __init__.py
│ │ │ ├── murder_mystery_solved_ex.py
│ │ │ ├── musr.py
│ │ │ ├── object_placements_solved_ex.py
│ │ │ ├── team_allocation_solved_ex.py
│ │ │ └── tree.py
│ │ ├── narrativeqa.py
│ │ ├── natural_question.py
│ │ ├── natural_question_cn.py
│ │ ├── needlebench/
│ │ │ ├── __init__.py
│ │ │ ├── atc.py
│ │ │ ├── atc_choice.py
│ │ │ ├── multi.py
│ │ │ ├── origin.py
│ │ │ └── parallel.py
│ │ ├── needlebench_v2/
│ │ │ ├── __init__.py
│ │ │ ├── atc.py
│ │ │ ├── atc_elder_only.py
│ │ │ ├── multi.py
│ │ │ ├── origin.py
│ │ │ └── parallel.py
│ │ ├── nejmaibench.py
│ │ ├── obqa.py
│ │ ├── ojbench.py
│ │ ├── olymmath.py
│ │ ├── omni_math.py
│ │ ├── phybench/
│ │ │ ├── EED.py
│ │ │ ├── __init__.py
│ │ │ ├── box_extract.py
│ │ │ ├── extended_zss.py
│ │ │ ├── latex_pre_process.py
│ │ │ └── phybench.py
│ │ ├── physics.py
│ │ ├── piqa.py
│ │ ├── py150.py
│ │ ├── qasper.py
│ │ ├── qaspercut.py
│ │ ├── race.py
│ │ ├── rbench.py
│ │ ├── realtoxicprompts.py
│ │ ├── reasonbench/
│ │ │ ├── ReasonBenchDataset.py
│ │ │ └── __init__.py
│ │ ├── record.py
│ │ ├── rolebench.py
│ │ ├── ruler/
│ │ │ ├── __init__.py
│ │ │ ├── ruler_cwe.py
│ │ │ ├── ruler_fwe.py
│ │ │ ├── ruler_niah.py
│ │ │ ├── ruler_qa.py
│ │ │ └── ruler_vt.py
│ │ ├── s3eval.py
│ │ ├── safety.py
│ │ ├── scibench.py
│ │ ├── scicode.py
│ │ ├── simpleqa.py
│ │ ├── siqa.py
│ │ ├── smolinstruct.py
│ │ ├── squad20.py
│ │ ├── srbench.py
│ │ ├── storycloze.py
│ │ ├── strategyqa.py
│ │ ├── subjective/
│ │ │ ├── __init__.py
│ │ │ ├── alignbench.py
│ │ │ ├── alpacaeval.py
│ │ │ ├── arena_hard.py
│ │ │ ├── commonbench.py
│ │ │ ├── compass_arena.py
│ │ │ ├── compass_arena_subjective_bench.py
│ │ │ ├── compassbench.py
│ │ │ ├── compassbench_checklist.py
│ │ │ ├── compassbench_control_length_bias.py
│ │ │ ├── corev2.py
│ │ │ ├── creationbench.py
│ │ │ ├── flames.py
│ │ │ ├── fofo.py
│ │ │ ├── followbench.py
│ │ │ ├── hellobench.py
│ │ │ ├── judgerbench.py
│ │ │ ├── mtbench.py
│ │ │ ├── mtbench101.py
│ │ │ ├── multiround.py
│ │ │ ├── subjective_cmp.py
│ │ │ ├── utils.py
│ │ │ ├── wildbench.py
│ │ │ └── writingbench.py
│ │ ├── summedits.py
│ │ ├── summscreen.py
│ │ ├── supergpqa/
│ │ │ ├── __init__.py
│ │ │ ├── supergpqa.py
│ │ │ ├── supergpqa_dataset_config/
│ │ │ │ ├── config_default.yaml
│ │ │ │ ├── config_reasoning_models.yaml
│ │ │ │ ├── config_wrapper.py
│ │ │ │ └── prompt/
│ │ │ │ ├── five-shot.yaml
│ │ │ │ ├── robustness-exp.yaml
│ │ │ │ ├── zero-shot-with-subfield.yaml
│ │ │ │ └── zero-shot.yaml
│ │ │ ├── supergpqa_eval.py
│ │ │ └── supergpqa_utils.py
│ │ ├── svamp.py
│ │ ├── tabmwp.py
│ │ ├── taco.py
│ │ ├── teval/
│ │ │ ├── __init__.py
│ │ │ ├── evaluators/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── instruct_evaluator.py
│ │ │ │ ├── planning_evaluator.py
│ │ │ │ ├── reason_retrieve_understand_evaluator.py
│ │ │ │ └── review_evaluator.py
│ │ │ ├── schema.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ ├── convert_results.py
│ │ │ ├── format_load.py
│ │ │ ├── meta_template.py
│ │ │ └── template.py
│ │ ├── tnews.py
│ │ ├── triviaqa.py
│ │ ├── triviaqarc.py
│ │ ├── truthfulqa.py
│ │ ├── tydiqa.py
│ │ ├── wic.py
│ │ ├── wikibench.py
│ │ ├── winograd.py
│ │ ├── winogrande.py
│ │ ├── wnli.py
│ │ ├── wsc.py
│ │ ├── xcopa.py
│ │ ├── xiezhi.py
│ │ ├── xlsum.py
│ │ └── xsum.py
│ ├── evaluator/
│ │ ├── __init__.py
│ │ ├── cascade_evaluator.py
│ │ ├── generic_llm_evaluator.py
│ │ └── math_evaluator.py
│ ├── lagent/
│ │ ├── actions/
│ │ │ ├── ipython_interpreter.py
│ │ │ └── python_interpreter.py
│ │ └── agents/
│ │ └── react.py
│ ├── metrics/
│ │ ├── __init__.py
│ │ ├── dump_results.py
│ │ ├── mme_score.py
│ │ └── seedbench.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── accessory.py
│ │ ├── ai360_api.py
│ │ ├── alaya.py
│ │ ├── baichuan_api.py
│ │ ├── baidu_api.py
│ │ ├── bailing_api_oc.py
│ │ ├── base.py
│ │ ├── base_api.py
│ │ ├── bluelm_api.py
│ │ ├── bytedance_api.py
│ │ ├── claude_allesapin.py
│ │ ├── claude_api/
│ │ │ ├── __init__.py
│ │ │ ├── claude_api.py
│ │ │ └── postprocessors.py
│ │ ├── claude_sdk_api.py
│ │ ├── deepseek_api.py
│ │ ├── doubao.py
│ │ ├── doubao_api.py
│ │ ├── gemini_api.py
│ │ ├── glm.py
│ │ ├── huggingface.py
│ │ ├── huggingface_above_v4_33.py
│ │ ├── hunyuan_api.py
│ │ ├── intern_model.py
│ │ ├── interntrain.py
│ │ ├── krgpt_api.py
│ │ ├── lagent.py
│ │ ├── langchain.py
│ │ ├── lightllm_api.py
│ │ ├── llama2.py
│ │ ├── minimax_api.py
│ │ ├── mistral_api.py
│ │ ├── mixtral.py
│ │ ├── modelscope.py
│ │ ├── moonshot_api.py
│ │ ├── nanbeige_api.py
│ │ ├── openai_api.py
│ │ ├── openai_streaming.py
│ │ ├── pangu_api.py
│ │ ├── qwen_api.py
│ │ ├── rendu_api.py
│ │ ├── sensetime_api.py
│ │ ├── stepfun_api.py
│ │ ├── telechat_api/
│ │ │ ├── __init__.py
│ │ │ ├── telechat_api.py
│ │ │ ├── telechat_api_streaming.py
│ │ │ └── telechat_auth_sdk.py
│ │ ├── turbomind.py
│ │ ├── turbomind_api.py
│ │ ├── turbomind_with_tf_above_v4_33.py
│ │ ├── unigpt_api.py
│ │ ├── vllm.py
│ │ ├── vllm_with_tf_above_v4_33.py
│ │ ├── xunfei_api.py
│ │ ├── yayi_api.py
│ │ ├── yi_api.py
│ │ ├── zhipuai_api.py
│ │ └── zhipuai_v2_api.py
│ ├── openicl/
│ │ ├── __init__.py
│ │ ├── icl_dataset_reader.py
│ │ ├── icl_evaluator/
│ │ │ ├── __init__.py
│ │ │ ├── code_evaluator.py
│ │ │ ├── hf_metrics/
│ │ │ │ ├── accuracy.py
│ │ │ │ ├── rouge.py
│ │ │ │ ├── sacrebleu.py
│ │ │ │ └── squad.py
│ │ │ ├── icl_agent_evaluator.py
│ │ │ ├── icl_aucroc_evaluator.py
│ │ │ ├── icl_base_evaluator.py
│ │ │ ├── icl_bpc_evaluator.py
│ │ │ ├── icl_circular_evaluator.py
│ │ │ ├── icl_em_evaluator.py
│ │ │ ├── icl_hf_evaluator.py
│ │ │ ├── icl_jieba_rouge_evaluator.py
│ │ │ ├── icl_judge_evaluator.py
│ │ │ ├── icl_korbench_evaluator.py
│ │ │ ├── icl_misc_evaluator.py
│ │ │ ├── icl_plugin_evaluator.py
│ │ │ ├── icl_toxic_evaluator.py
│ │ │ ├── lm_evaluator.py
│ │ │ └── pi_llm_evaluator.py
│ │ ├── icl_inferencer/
│ │ │ ├── __init__.py
│ │ │ ├── icl_agent_inferencer.py
│ │ │ ├── icl_attack_inferencer.py
│ │ │ ├── icl_base_inferencer.py
│ │ │ ├── icl_chat_inferencer.py
│ │ │ ├── icl_chat_inferencer_parallel.py
│ │ │ ├── icl_chatml_inferencer.py
│ │ │ ├── icl_chatml_inferencer_parallel.py
│ │ │ ├── icl_clp_inferencer.py
│ │ │ ├── icl_gen_inferencer.py
│ │ │ ├── icl_gen_inferencer_parallel.py
│ │ │ ├── icl_inference_ppl_only_inferencer.py
│ │ │ ├── icl_ll_inferencer.py
│ │ │ ├── icl_mink_percent_inferencer.py
│ │ │ ├── icl_ppl_inferencer.py
│ │ │ ├── icl_ppl_only_inferencer.py
│ │ │ ├── icl_sc_inferencer.py
│ │ │ ├── icl_sw_ce_loss_inferencer.py
│ │ │ └── icl_tot_inferencer.py
│ │ ├── icl_prompt_template.py
│ │ ├── icl_raw_prompt_template.py
│ │ ├── icl_retriever/
│ │ │ ├── __init__.py
│ │ │ ├── icl_base_retriever.py
│ │ │ ├── icl_bm25_retriever.py
│ │ │ ├── icl_dpp_retriever.py
│ │ │ ├── icl_fix_k_retriever.py
│ │ │ ├── icl_mdl_retriever.py
│ │ │ ├── icl_random_retriever.py
│ │ │ ├── icl_sliding_k_retriever.py
│ │ │ ├── icl_topk_retriever.py
│ │ │ ├── icl_votek_retriever.py
│ │ │ └── icl_zero_retriever.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ └── logging.py
│ ├── partitioners/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── naive.py
│ │ ├── num_worker.py
│ │ ├── size.py
│ │ ├── sub_naive.py
│ │ ├── sub_num_worker.py
│ │ └── sub_size.py
│ ├── registry.py
│ ├── runners/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── dlc.py
│ │ ├── local.py
│ │ ├── local_api.py
│ │ ├── rjob.py
│ │ ├── slurm.py
│ │ ├── slurm_sequential.py
│ │ └── volc.py
│ ├── summarizers/
│ │ ├── __init__.py
│ │ ├── circular.py
│ │ ├── default.py
│ │ ├── default_subjective.py
│ │ ├── llm_compression.py
│ │ ├── multi_faceted.py
│ │ ├── multi_model.py
│ │ ├── needlebench.py
│ │ ├── subjective/
│ │ │ ├── __init__.py
│ │ │ ├── alignmentbench.py
│ │ │ ├── all_obj.py
│ │ │ ├── alpacaeval.py
│ │ │ ├── arenahard.py
│ │ │ ├── charm.py
│ │ │ ├── common_summarizer.py
│ │ │ ├── compass_arena.py
│ │ │ ├── compass_arena_bradley_terry.py
│ │ │ ├── compassbench.py
│ │ │ ├── compassbench_v13.py
│ │ │ ├── corev2.py
│ │ │ ├── creationbench.py
│ │ │ ├── flames.py
│ │ │ ├── fofo.py
│ │ │ ├── followbench.py
│ │ │ ├── mtbench.py
│ │ │ ├── mtbench101.py
│ │ │ ├── multiround.py
│ │ │ ├── qacompassbench.py
│ │ │ ├── subjective.py
│ │ │ ├── subjective_post_process.py
│ │ │ ├── utils.py
│ │ │ └── wildbench.py
│ │ └── summarizer_pretrain.py
│ ├── tasks/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── llm_eval.py
│ │ ├── openicl_attack.py
│ │ ├── openicl_eval.py
│ │ ├── openicl_eval_watch.py
│ │ ├── openicl_infer.py
│ │ ├── openicl_infer_concurrent.py
│ │ ├── outer_eval/
│ │ │ └── alpacaeval.py
│ │ └── subjective_eval.py
│ └── utils/
│ ├── __init__.py
│ ├── abbr.py
│ ├── auxiliary.py
│ ├── build.py
│ ├── collect_env.py
│ ├── datasets.py
│ ├── datasets_info.py
│ ├── dependency.py
│ ├── dict_postprocessors.py
│ ├── file.py
│ ├── fileio.py
│ ├── heartbeat.py
│ ├── infer_status.py
│ ├── lark.py
│ ├── logging.py
│ ├── menu.py
│ ├── network.py
│ ├── prompt.py
│ ├── result_station.py
│ ├── run.py
│ ├── text_postprocessors.py
│ └── types.py
├── run.py
├── setup.py
├── tests/
│ ├── TESTING_GUIDE.md
│ ├── TESTING_GUIDE_zh-CN.md
│ ├── datasets/
│ │ ├── test_aime2025.py
│ │ ├── test_aime2025_eval.py
│ │ ├── test_beyondaime.py
│ │ ├── test_humaneval.py
│ │ └── test_local_datasets.py
│ ├── models/
│ │ ├── test_base_model.py
│ │ ├── test_huggingface.py
│ │ ├── test_huggingface_above_v4_33.py
│ │ ├── test_openai_api.py
│ │ ├── test_openai_streaming.py
│ │ ├── test_turbomind.py
│ │ ├── test_turbomind_with_tf_above_v4_33.py
│ │ ├── test_vllm.py
│ │ └── test_vllm_with_tf_above_v4_33.py
│ ├── openicl/
│ │ ├── test_icl_chat_inferencer_parallel.py
│ │ ├── test_icl_chatml_inferencer_parallel.py
│ │ ├── test_icl_gen_inferencer_parallel.py
│ │ ├── test_prompt_template.py
│ │ └── test_raw_prompt_template.py
│ ├── partitioners/
│ │ ├── test_base_partitioner.py
│ │ └── test_naive.py
│ ├── prompt/
│ │ ├── test_api_template_parser.py
│ │ ├── test_lm_template_parser.py
│ │ └── test_prompt_list.py
│ ├── pytest.ini
│ ├── summarizers/
│ │ └── test_default.py
│ ├── tasks/
│ │ ├── test_base_task.py
│ │ ├── test_openicl_eval_watch.py
│ │ └── test_openicl_infer_concurrent.py
│ └── utils/
│ ├── test_heartbeat.py
│ ├── test_infer_status.py
│ └── test_text_postprocessors.py
└── tools/
├── case_analyzer.py
├── chatml_format_test.py
├── collect_code_preds.py
├── compare_configs.py
├── convert_alignmentbench.py
├── list_configs.py
├── prediction_merger.py
├── prompt_viewer.py
├── test_api_model.py
├── update_dataset_suffix.py
└── viz_multi_model.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .codespellrc
================================================
[codespell]
skip = *.ipynb
count =
quiet-level = 3
ignore-words-list = nd, ans, ques, rouge, softwares, wit
================================================
FILE: .github/ISSUE_TEMPLATE/1_bug-report.yml
================================================
name: 🐞 Bug report
description: Create a report to help us improve
labels: ["bug"]
title: "[Bug] "
body:
- type: markdown
attributes:
value: |
For general questions or idea discussions, please post it to our [**Forum**](https://github.com/open-compass/opencompass/discussions).
If you have already identified the reason, we strongly appreciate you creating a new PR according to [the tutorial](https://opencompass.readthedocs.io/en/master/community/CONTRIBUTING.html)!
If you need our help, please fill in the following form to help us to identify the bug.
- type: checkboxes
attributes:
label: Prerequisite
description: Please check the following items before creating a new issue.
options:
- label: I have searched [Issues](https://github.com/open-compass/opencompass/issues/) and [Discussions](https://github.com/open-compass/opencompass/discussions) but cannot get the expected help.
required: true
- label: The bug has not been fixed in the [latest version](https://github.com/open-compass/opencompass).
required: true
- type: dropdown
id: task
attributes:
label: Type
description: The problem arises when
options:
- I'm evaluating with the officially supported tasks/models/datasets.
- I have modified the code (config is not considered code), or I'm working on my own tasks/models/datasets.
validations:
required: true
- type: textarea
id: environment
validations:
required: true
attributes:
label: Environment
description: |
Please run `python -c "import opencompass.utils;import pprint;pprint.pprint(dict(opencompass.utils.collect_env()))"` to collect necessary environment information and paste it here.
placeholder: |
```python
# The output of the above command
```
- type: textarea
attributes:
label: Reproduces the problem - code/configuration sample
description: |
Please provide a code or configuration sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
placeholder: |
```python
# Sample code to reproduce the problem
```
validations:
required: true
- type: textarea
attributes:
label: Reproduces the problem - command or script
description: |
What command or script did you run?
placeholder: |
```shell
The command or script you run.
```
validations:
required: true
- type: textarea
attributes:
label: Reproduces the problem - error message
description: |
Please provide the error message or logs you got, with the full traceback.
Tip: You can attach images or log files by dragging them into the text area.
placeholder: |
```
The error message or logs you got, with the full traceback.
```
validations:
required: true
- type: textarea
id: other
attributes:
label: Other information
description: |
Tell us anything else you think we should know.
1. What's your expected result?
2. What dataset did you use?
3. What do you think might be the reason?
================================================
FILE: .github/ISSUE_TEMPLATE/2_feature-request.yml
================================================
name: 🚀 Feature request
description: Suggest an idea for this project
labels: ["enhancement"]
title: "[Feature] "
body:
- type: markdown
attributes:
value: |
For general questions or idea discussions, please post it to our [**Forum**](https://github.com/open-compass/opencompass/discussions).
If you have already implemented the feature, we strongly appreciate you creating a new PR according to [the tutorial](https://opencompass.readthedocs.io/en/master/community/CONTRIBUTING.html)!
- type: textarea
id: describe
validations:
required: true
attributes:
label: Describe the feature
description: |
What kind of feature do you want OpenCompass to add? If there is an official code release or third-party implementation, please also provide the information here, which would be very helpful.
placeholder: |
A clear and concise description of the motivation of the feature.
Ex1. It is inconvenient when \[....\].
Ex2. There is a recent paper \[....\], which is very helpful for \[....\].
- type: checkboxes
id: pr
attributes:
label: Will you implement it?
options:
- label: I would like to implement this feature and create a PR!
================================================
FILE: .github/ISSUE_TEMPLATE/3_bug-report_zh.yml
================================================
name: 🐞 报告 Bug
description: 报告你在使用中遇到的不合预期的情况
labels: ["bug"]
title: "[Bug] "
body:
- type: markdown
attributes:
value: |
我们推荐使用英语模板 Bug report,以便你的问题帮助更多人。
如果需要询问一般性的问题或者想法,请在我们的[**论坛**](https://github.com/open-compass/opencompass/discussions)讨论。
如果你已经有了解决方案,我们非常欢迎你直接创建一个新的 PR 来解决这个问题。创建 PR 的流程可以参考[文档](https://opencompass.readthedocs.io/zh_CN/master/community/CONTRIBUTING.html)。
如果你需要我们的帮助,请填写以下内容帮助我们定位 Bug。
- type: checkboxes
attributes:
label: 先决条件
description: 在创建新问题之前,请检查以下项目。
options:
- label: 我已经搜索过 [问题](https://github.com/open-compass/opencompass/issues/) 和 [讨论](https://github.com/open-compass/opencompass/discussions) 但未得到预期的帮助。
required: true
- label: 错误在 [最新版本](https://github.com/open-compass/opencompass) 中尚未被修复。
required: true
- type: dropdown
id: task
attributes:
label: 问题类型
description: 问题出现时
options:
- 我正在使用官方支持的任务/模型/数据集进行评估。
- 我修改了代码(配置不视为代码),或者我正在处理我自己的任务/模型/数据集。
validations:
required: true
- type: textarea
id: environment
validations:
required: true
attributes:
label: 环境
description: |
请运行 `python -c "import opencompass.utils;import pprint;pprint.pprint(dict(opencompass.utils.collect_env()))"` 来收集必要的环境信息并粘贴在此处。
placeholder: |
```python
# 上述命令的输出
```
- type: textarea
attributes:
label: 重现问题 - 代码/配置示例
description: |
请提供重现您遇到的问题的代码或配置示例。它可以是一个Colab链接或仅仅是一个代码片段。
placeholder: |
```python
# 重现问题的示例代码
```
validations:
required: true
- type: textarea
attributes:
label: 重现问题 - 命令或脚本
description: |
您运行了什么命令或脚本?
placeholder: |
```shell
您运行的命令或脚本。
```
validations:
required: true
- type: textarea
attributes:
label: 重现问题 - 错误信息
description: |
请提供您收到的错误消息或日志,并提供完整的追溯。
提示:您可以通过拖放图片或日志文件到文本区域来附加它们。
placeholder: |
```
您收到的错误消息或日志,带有完整的追溯。
```
validations:
required: true
- type: textarea
id: other
attributes:
label: 其他信息
description: |
告诉我们其他有价值的信息。
1. 你是否对代码或配置文件做了任何改动?
2. 你认为可能的原因是什么?
================================================
FILE: .github/ISSUE_TEMPLATE/4_feature-request_zh.yml
================================================
name: 🚀 功能建议
description: 建议一项新的功能
labels: ["enhancement"]
title: "[Feature] "
body:
- type: markdown
attributes:
value: |
推荐使用英语模板 Feature request,以便你的问题帮助更多人。
如果需要询问一般性的问题或者想法,请在我们的[**论坛**](https://github.com/open-compass/opencompass/discussions)讨论。
如果你已经实现了该功能,我们非常欢迎你直接创建一个新的 PR 来解决这个问题。创建 PR 的流程可以参考[文档](https://opencompass.readthedocs.io/zh_CN/master/community/CONTRIBUTING.html)。
- type: textarea
id: describe
validations:
required: true
attributes:
label: 描述该功能
description: |
你希望 OpenCompass 添加什么功能?如果存在相关的论文、官方实现或者第三方实现,请同时贴出链接,这将非常有帮助。
placeholder: |
简要说明该功能,及为什么需要该功能
例 1. 现在进行 xxx 的时候不方便
例 2. 最近的论文中提出了有一个很有帮助的 xx
- type: checkboxes
id: pr
attributes:
label: 是否希望自己实现该功能?
options:
- label: 我希望自己来实现这一功能,并向 OpenCompass 贡献代码!
================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false
contact_links:
- name: 📚 OpenCompass Documentation (官方文档)
url: https://opencompass.readthedocs.io/en/latest/
about: Check if your question is answered in docs
- name: 💬 General questions (寻求帮助)
url: https://github.com/open-compass/opencompass/discussions
about: Ask general usage questions and discuss with other OpenCompass community members
- name: 🌐 Explore OpenCompass (官网)
url: https://opencompass.org.cn/
about: Get to know more about OpenCompass
================================================
FILE: .github/pull_request_template.md
================================================
Thanks for your contribution; we appreciate it a lot. The following instructions will make your pull request healthier and help it get feedback more easily. If you do not understand some items, don't worry, just make the pull request and seek help from maintainers.
## Motivation
Please describe the motivation of this PR and the goal you want to achieve through this PR.
## Modification
Please briefly describe what modification is made in this PR.
## BC-breaking (Optional)
Does the modification introduce changes that break the backward compatibility of the downstream repositories?
If so, please describe how it breaks the compatibility and how the downstream projects should modify their code to keep compatibility with this PR.
## Use cases (Optional)
If this PR introduces a new feature, it is better to list some use cases here and update the documentation.
## Checklist
**Before PR**:
- [ ] Pre-commit or other linting tools are used to fix the potential lint issues.
- [ ] Bug fixes are fully covered by unit tests; the case that causes the bug should be added to the unit tests.
- [ ] The modification is covered by complete unit tests. If not, please add more unit tests to ensure the correctness.
- [ ] The documentation has been modified accordingly, like docstring or example tutorials.
**After PR**:
- [ ] If the modification has potential influence on downstream or other related projects, this PR should be tested with those projects.
- [ ] CLA has been signed and all committers have signed the CLA in this PR.
================================================
FILE: .github/workflows/daily-ete-test.yml
================================================
name: daily_ete_test
on:
workflow_dispatch:
inputs:
repo_org:
required: false
description: 'Tested repository organization name. Default is open-compass/opencompass'
type: string
default: 'open-compass/opencompass'
repo_ref:
required: false
description: 'Set branch or tag or commit id. Default is "main"'
type: string
default: 'main'
regression_type:
required: true
description: 'regression types'
type: string
default: "['model', 'eval', 'cmd', 'cluster']"
baseline_result:
required: true
description: 'baseline result'
type: string
default: "0.5.0-baseline"
schedule:
- cron: '15 14 * * 2'
env:
HF_DATASETS_OFFLINE: 1
HF_EVALUATE_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1
VLLM_USE_MODELSCOPE: false
LMDEPLOY_USE_MODELSCOPE: false
HF_HUB_OFFLINE: 1
OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
CONDA_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3
REPORT_ROOT: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/eval_report/ete_regression
COMPASS_DATA_CACHE: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/compass_data_cache
HF_DATASETS_CACHE: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache
HF_HUB_CACHE: /mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub
HF_DATASETS_DISABLE_LOCKFILES: 1
HF_ENDPOINT: https://hf-mirror.com
PIP_CACHE_DIR: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/pip_cache_new
CONDA_ENV: ete_regression
VLLM_WORKER_MULTIPROC_METHOD: spawn
KUBEBRAIN_CLUSTER_ENTRY: https://h.pjlab.org.cn
KUBEBRAIN_NAMESPACE: ailab-opencompass
JOB_NAME: daily-test-${{ github.run_id }}-${{ github.run_attempt }}
BASELINE_DIR: ${{github.event.inputs.baseline_result || '0.5.0-baseline' }}
TEST_MODEL: Qwen/Qwen3-8B
jobs:
build-pypi:
runs-on: ubuntu-latest
env:
http_proxy: ''
https_proxy: ''
steps:
- uses: actions/checkout@v5
with:
repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Set up Python 3.10
uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Build opencompass
run: |
pip install wheel setuptools
python setup.py sdist bdist_wheel
- name: Upload Artifacts
uses: actions/upload-artifact@v4
with:
if-no-files-found: error
path: dist/*
retention-days: 1
name: my-artifact-${{ github.run_id }}
# Recreate the conda environment named by CONDA_ENV on the self-hosted runner
# and install the freshly built wheel together with the pinned inference stack.
prepare_env:
  if: ${{!cancelled()}}
  needs:
    - build-pypi
  runs-on: yidian_cu12_ete
  timeout-minutes: 180  # 3 hours
  steps:
    - name: Clean workdir
      run: sudo git clean -ffdx
    - name: Clone repository
      uses: actions/checkout@v5
      with:
        repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
        ref: ${{github.event.inputs.repo_ref || 'main'}}
    # Fetch the wheel/sdist uploaded by build-pypi into the workspace.
    - name: Download Artifacts
      uses: actions/download-artifact@v4
      with:
        name: my-artifact-${{ github.run_id }}
    # Drop the previous environment so the install below starts clean.
    - name: Remove Conda Env
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda env remove -y --name ${{env.CONDA_ENV}}
        conda info --envs
    # The whole install script is retried up to three times.
    - name: Prepare - create conda env and install torch - cu12
      uses: nick-fields/retry@v3
      with:
        max_attempts: 3
        timeout_minutes: 120
        command: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda create -y --name ${{env.CONDA_ENV}} python=3.10
          conda activate ${{env.CONDA_ENV}}
          export PIP_CACHE_DIR=${{env.PIP_CACHE_DIR}}
          pip install -r /mnt/shared-storage-user/qa-llm-cicd/oc_test_resource/requirements.txt
          pip install opencompass*.whl
          pip install opencompass[lmdeploy]
          pip install opencompass[vllm]
          pip install opencompass[full]
          pip install opencompass[api]
          pip install xformers
          pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0
          pip install transformers==4.56.0 vllm==0.11.0 lmdeploy==0.11.0
          pip install fire pyyaml pytest
          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/packages/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
          cp -r /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/packages/nltk_data /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3/envs/${{env.CONDA_ENV}}/nltk_data
    # Print the resulting environment and package list into the job log.
    - name: conda env
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        pip list
# Inference regression: one matrix entry per backend. Each entry submits an
# rjob that runs `opencompass ... -m infer`, polls the job until it reaches a
# terminal state, and then compares the predictions with the baseline directory.
# Runs on schedule, or on manual dispatch when regression_type contains 'model'.
daily_model_test:
  if: ${{!cancelled() && contains(needs.prepare_env.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_type), 'model'))}}
  needs: prepare_env
  strategy:
    fail-fast: false
    matrix:
      model_func: ["api", "api_rollout", "lmdeploy_base","lmdeploy_chat","vllm_chat", "vllm_base", "transformers_base", "transformers_chat"]
  runs-on: yidian_cu12_ete
  timeout-minutes: 240 #4hours
  steps:
    - name: Clean workdir
      run: sudo git clean -ffdx
    - name: Clone repository
      uses: actions/checkout@v5
      with:
        repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
        ref: ${{github.event.inputs.repo_ref || 'main'}}
    - name: conda env
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        pip list
    # Matches both 'api' and 'api_rollout' (substring test). The remote command
    # starts an lmdeploy api_server in the background, runs health_check.py and
    # then the inference config.
    - name: Run test - api
      if: contains(matrix.model_func, 'api')
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        echo ${{github.workspace}}
        JOB_NAME=model-${{ github.run_id }}-${{ matrix.model_func }}-${{ github.run_attempt }}
        # Underscores from the matrix value are replaced with dashes in the job name.
        JOB_NAME=${JOB_NAME//_/-}
        rjob submit --metadata-name=$JOB_NAME --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=1 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:v0.12.0-cu12.8 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public --host-network=True -- bash -exc 'source ${{env.CONDA_PATH}}/bin/activate; conda activate ${{env.CONDA_ENV}}; conda env list; cd ${{github.workspace}}; lmdeploy serve api_server ${{env.TEST_MODEL}} --session-len 146000 --max-batch-size 1 & python autotest/utils/health_check.py; opencompass autotest/model/infer_${{matrix.model_func}}.py -m infer --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.model_func}} --reuse --dump-res-length'
        # Poll the job status every 10 seconds, at most 600 times.
        for i in {1..600}; do
          current_status=$(rjob get $JOB_NAME | grep -oP 'rjob [^:]+: \K[^ ]+')
          if [[ $current_status == "Succeeded" ]]; then
            echo "Task succeeded"
            exit 0
          elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
            echo "Task failed or stopped, fetching logs"
            rjob logs job $JOB_NAME
            exit 1
          fi
          sleep 10
        done
        # Reaching this point means the job never hit a terminal state within
        # the polling budget; fail the step instead of passing silently.
        echo "Timed out waiting for rjob $JOB_NAME to reach a terminal state"
        exit 1
    - name: Run test - other
      if: matrix.model_func != 'api' && matrix.model_func != 'api_rollout'
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        echo ${{github.workspace}}
        JOB_NAME=model-${{ github.run_id }}-${{ matrix.model_func }}-${{ github.run_attempt }}
        JOB_NAME=${JOB_NAME//_/-}
        rjob submit --metadata-name=$JOB_NAME --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=2 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:v0.12.0-cu12.8 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public --host-network=True -- bash -exc 'source ${{env.CONDA_PATH}}/bin/activate; conda activate ${{env.CONDA_ENV}}; conda env list; cd ${{github.workspace}}; opencompass autotest/model/infer_${{matrix.model_func}}.py -m infer --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.model_func}} --max-num-workers 2 --reuse --dump-res-length'
        # Poll the job status every 10 seconds, at most 600 times.
        for i in {1..600}; do
          current_status=$(rjob get $JOB_NAME | grep -oP 'rjob [^:]+: \K[^ ]+')
          if [[ $current_status == "Succeeded" ]]; then
            echo "Task succeeded"
            exit 0
          elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
            echo "Task failed or stopped, fetching logs"
            rjob logs job $JOB_NAME
            exit 1
          fi
          sleep 10
        done
        # Reaching this point means the job never hit a terminal state within
        # the polling budget; fail the step instead of passing silently.
        echo "Timed out waiting for rjob $JOB_NAME to reach a terminal state"
        exit 1
    # Compare this run's predictions with the baseline directory.
    - name: Assert result
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        python autotest/utils/compare_results.py compare_results ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.model_func}} ${{env.REPORT_ROOT}}/${{env.BASELINE_DIR}}/${{matrix.model_func}} predictions
    # Always stop the remote job, including after failure or timeout above.
    - name: stop job
      if: always()
      run: |
        JOB_NAME=model-${{ github.run_id }}-${{ matrix.model_func }}-${{ github.run_attempt }}
        JOB_NAME=${JOB_NAME//_/-}
        rjob stop job $JOB_NAME
# Evaluation regression: one matrix entry per eval config. Each entry copies the
# baseline predictions into this run's report directory, submits an rjob that
# runs `opencompass ... -m eval` on them, polls the job until it reaches a
# terminal state, and then compares the results with the baseline.
# Exactly one of the five "Run test" steps matches any given matrix value.
daily_eval_test:
  if: ${{!cancelled() && contains(needs.prepare_env.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_type), 'eval'))}}
  needs: prepare_env
  strategy:
    fail-fast: false
    matrix:
      eval_func: ["chat_obj_fullbench_v5", "chat_obj_fullbench_v6", "chat_obj_fullbench_v7", "chat_obj_fullbench_v8", "chat_obj_v8", "chat_obj_fullbench_other", "chat_sub_fullbench", "base_fullbench","base_longtext_fullbench","chat_longtext_fullbench"]
  runs-on: yidian_cu12_daily
  timeout-minutes: 240 #4hours
  steps:
    - name: Clean workdir
      run: sudo git clean -ffdx
    - name: Clone repository
      uses: actions/checkout@v5
      with:
        repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
        ref: ${{github.event.inputs.repo_ref || 'main'}}
    - name: conda env
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        pip list
    # Submitted with --gpu=0.
    - name: Run test - without judger
      if: matrix.eval_func == 'chat_obj_fullbench_v5' || matrix.eval_func == 'base_fullbench' || matrix.eval_func == 'base_longtext_fullbench' || matrix.eval_func == 'chat_longtext_fullbench'
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        mkdir ${{env.REPORT_ROOT}}/${{ github.run_id }} -p
        cp -r ${{env.REPORT_ROOT}}/${{ env.BASELINE_DIR }}/predictions/eval_${{matrix.eval_func}} ${{env.REPORT_ROOT}}/${{ github.run_id }}
        echo ${{github.workspace}}
        JOB_NAME=eval-${{ github.run_id }}-${{ matrix.eval_func }}-${{ github.run_attempt }}
        # Underscores from the matrix value are replaced with dashes in the job name.
        JOB_NAME=${JOB_NAME//_/-}
        rjob submit --metadata-name=$JOB_NAME --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=0 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:v0.12.0-cu12.8 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public --host-network=True -- bash -exc 'source ${{env.CONDA_PATH}}/bin/activate; conda activate ${{env.CONDA_ENV}}; conda env list; cd ${{github.workspace}}; opencompass autotest/eval/eval_${{matrix.eval_func}}.py -m eval --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/eval_${{matrix.eval_func}} --reuse --dump-res-length'
        # Poll the job status every 10 seconds, at most 600 times.
        for i in {1..600}; do
          current_status=$(rjob get $JOB_NAME | grep -oP 'rjob [^:]+: \K[^ ]+')
          if [[ $current_status == "Succeeded" ]]; then
            echo "Task succeeded"
            exit 0
          elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
            echo "Task failed or stopped, fetching logs"
            rjob logs job $JOB_NAME
            exit 1
          fi
          sleep 10
        done
        # Reaching this point means the job never hit a terminal state within
        # the polling budget; fail the step instead of passing silently.
        echo "Timed out waiting for rjob $JOB_NAME to reach a terminal state"
        exit 1
    # Uses the compass_data_cache_sub data cache instead of compass_data_cache.
    - name: Run test - other subdatasets
      if: matrix.eval_func == 'chat_sub_fullbench'
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        mkdir ${{env.REPORT_ROOT}}/${{ github.run_id }} -p
        cp -r ${{env.REPORT_ROOT}}/${{ env.BASELINE_DIR }}/predictions/eval_${{matrix.eval_func}} ${{env.REPORT_ROOT}}/${{ github.run_id }}
        echo ${{github.workspace}}
        JOB_NAME=eval-${{ github.run_id }}-${{ matrix.eval_func }}-${{ github.run_attempt }}
        JOB_NAME=${JOB_NAME//_/-}
        rjob submit --metadata-name=$JOB_NAME --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=1 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:v0.12.0-cu12.8 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/compass_data_cache_sub --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public --host-network=True -- bash -exc 'source ${{env.CONDA_PATH}}/bin/activate; conda activate ${{env.CONDA_ENV}}; conda env list; cd ${{github.workspace}}; opencompass autotest/eval/eval_${{matrix.eval_func}}.py -m eval --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/eval_${{matrix.eval_func}} --reuse --dump-res-length'
        # Poll the job status every 10 seconds, at most 600 times.
        for i in {1..600}; do
          current_status=$(rjob get $JOB_NAME | grep -oP 'rjob [^:]+: \K[^ ]+')
          if [[ $current_status == "Succeeded" ]]; then
            echo "Task succeeded"
            exit 0
          elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
            echo "Task failed or stopped, fetching logs"
            rjob logs job $JOB_NAME
            exit 1
          fi
          sleep 10
        done
        # Reaching this point means the job never hit a terminal state within
        # the polling budget; fail the step instead of passing silently.
        echo "Timed out waiting for rjob $JOB_NAME to reach a terminal state"
        exit 1
    # Every matrix value not claimed by one of the other four "Run test" steps.
    - name: Run test - with judger
      if: matrix.eval_func != 'chat_obj_fullbench_v5' && matrix.eval_func != 'base_fullbench' && matrix.eval_func != 'base_longtext_fullbench' && matrix.eval_func != 'chat_longtext_fullbench' && matrix.eval_func != 'chat_sub_fullbench' && matrix.eval_func != 'chat_obj_fullbench_other'
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        mkdir ${{env.REPORT_ROOT}}/${{ github.run_id }} -p
        cp -r ${{env.REPORT_ROOT}}/${{ env.BASELINE_DIR }}/predictions/eval_${{matrix.eval_func}} ${{env.REPORT_ROOT}}/${{ github.run_id }}
        echo ${{github.workspace}}
        JOB_NAME=eval-${{ github.run_id }}-${{ matrix.eval_func }}-${{ github.run_attempt }}
        JOB_NAME=${JOB_NAME//_/-}
        rjob submit --metadata-name=$JOB_NAME --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=1 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:v0.12.0-cu12.8 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public --host-network=True -- bash -exc 'source ${{env.CONDA_PATH}}/bin/activate; conda activate ${{env.CONDA_ENV}}; conda env list; cd ${{github.workspace}}; opencompass autotest/eval/eval_${{matrix.eval_func}}.py -m eval --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/eval_${{matrix.eval_func}} --reuse --dump-res-length'
        # Poll the job status every 10 seconds, at most 600 times.
        for i in {1..600}; do
          current_status=$(rjob get $JOB_NAME | grep -oP 'rjob [^:]+: \K[^ ]+')
          if [[ $current_status == "Succeeded" ]]; then
            echo "Task succeeded"
            exit 0
          elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
            echo "Task failed or stopped, fetching logs"
            rjob logs job $JOB_NAME
            exit 1
          fi
          sleep 10
        done
        # Reaching this point means the job never hit a terminal state within
        # the polling budget; fail the step instead of passing silently.
        echo "Timed out waiting for rjob $JOB_NAME to reach a terminal state"
        exit 1
    # Same as the judger step but additionally passes --env=DATASET_SOURCE=HF.
    - name: Run test - with judger HF source
      if: matrix.eval_func == 'chat_obj_fullbench_other'
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        mkdir ${{env.REPORT_ROOT}}/${{ github.run_id }} -p
        cp -r ${{env.REPORT_ROOT}}/${{ env.BASELINE_DIR }}/predictions/eval_${{matrix.eval_func}} ${{env.REPORT_ROOT}}/${{ github.run_id }}
        echo ${{github.workspace}}
        JOB_NAME=eval-${{ github.run_id }}-${{ matrix.eval_func }}-${{ github.run_attempt }}
        JOB_NAME=${JOB_NAME//_/-}
        rjob submit --metadata-name=$JOB_NAME --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=1 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:v0.12.0-cu12.8 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --env=DATASET_SOURCE=HF --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public --host-network=True -- bash -exc 'source ${{env.CONDA_PATH}}/bin/activate; conda activate ${{env.CONDA_ENV}}; conda env list; cd ${{github.workspace}}; opencompass autotest/eval/eval_${{matrix.eval_func}}.py -m eval --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/eval_${{matrix.eval_func}} --reuse --dump-res-length'
        # Poll the job status every 10 seconds, at most 600 times.
        for i in {1..600}; do
          current_status=$(rjob get $JOB_NAME | grep -oP 'rjob [^:]+: \K[^ ]+')
          if [[ $current_status == "Succeeded" ]]; then
            echo "Task succeeded"
            exit 0
          elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
            echo "Task failed or stopped, fetching logs"
            rjob logs job $JOB_NAME
            exit 1
          fi
          sleep 10
        done
        # Reaching this point means the job never hit a terminal state within
        # the polling budget; fail the step instead of passing silently.
        echo "Timed out waiting for rjob $JOB_NAME to reach a terminal state"
        exit 1
    # Compare this run's results with the baseline directory.
    - name: Assert result
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        python autotest/utils/compare_results.py compare_results ${{env.REPORT_ROOT}}/${{ github.run_id }}/eval_${{matrix.eval_func}} ${{env.REPORT_ROOT}}/${{env.BASELINE_DIR}}/eval_${{matrix.eval_func}} results
    # Always stop the remote job, including after failure or timeout above.
    - name: stop job
      if: always()
      run: |
        JOB_NAME=eval-${{ github.run_id }}-${{ matrix.eval_func }}-${{ github.run_attempt }}
        JOB_NAME=${JOB_NAME//_/-}
        rjob stop job $JOB_NAME
# Command-line regression: submits an rjob that runs the shared
# daily_cmd_test.sh script, polls the job until it reaches a terminal state,
# and then runs the pytest score assertions for case1..case5 against the
# summaries under cmd1..cmd5.
# Runs on schedule, or on manual dispatch when regression_type contains 'cmd'.
daily_run_cmd:
  if: ${{!cancelled() && contains(needs.prepare_env.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_type), 'cmd'))}}
  needs: prepare_env
  runs-on: yidian_cu12_daily
  timeout-minutes: 240 #4hours
  steps:
    - name: Clean workdir
      run: sudo git clean -ffdx
    - name: Clone repository
      uses: actions/checkout@v5
      with:
        repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
        ref: ${{github.event.inputs.repo_ref || 'main'}}
    - name: conda env
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        pip list
    - name: Run test
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        JOB_NAME=cmd-${{ github.run_id }}-${{ github.run_attempt }}
        JOB_NAME=${JOB_NAME//_/-}
        rjob submit --metadata-name=$JOB_NAME --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=2 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:v0.12.0-cu12.8 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public --host-network=True -- bash -exc '/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/daily_cmd_test.sh ${{env.REPORT_ROOT}}/${{ github.run_id }}'
        # Poll the job status every 10 seconds, at most 600 times.
        for i in {1..600}; do
          current_status=$(rjob get $JOB_NAME | grep -oP 'rjob [^:]+: \K[^ ]+')
          if [[ $current_status == "Succeeded" ]]; then
            echo "Task succeeded"
            exit 0
          elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
            echo "Task failed or stopped, fetching logs"
            rjob logs job $JOB_NAME
            exit 1
          fi
          sleep 10
        done
        # Reaching this point means the job never hit a terminal state within
        # the polling budget; fail the step instead of passing silently.
        echo "Timed out waiting for rjob $JOB_NAME to reach a terminal state"
        exit 1
    # For each case, repoint the regression_result_daily symlink at the matching
    # cmdN summary directory and run the pytest marker for that case.
    - name: Assert result
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
        python -m pytest -m case1 -s -v --color=yes autotest/utils/oc_score_assert.py
        rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
        python -m pytest -m case2 -s -v --color=yes autotest/utils/oc_score_assert.py
        rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
        python -m pytest -m case3 -s -v --color=yes autotest/utils/oc_score_assert.py
        rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
        python -m pytest -m case4 -s -v --color=yes autotest/utils/oc_score_assert.py
        rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily
        python -m pytest -m case5 -s -v --color=yes autotest/utils/oc_score_assert.py
    # Always stop the remote job, including after failure or timeout above.
    - name: stop job
      if: always()
      run: |
        JOB_NAME=cmd-${{ github.run_id }}-${{ github.run_attempt }}
        JOB_NAME=${JOB_NAME//_/-}
        rjob stop job $JOB_NAME
# Cluster regression: runs autotest/cluster/chat_models.py directly from the
# runner (no explicit rjob submit in this job) and compares both predictions
# and results with the baseline directory.
# Runs on schedule, or on manual dispatch when regression_type contains 'cluster'.
daily_run_cluster:
  if: ${{!cancelled() && contains(needs.prepare_env.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_type), 'cluster'))}}
  needs: prepare_env
  runs-on: yidian_cu12_daily
  timeout-minutes: 240  # 4 hours
  steps:
    - name: Clean workdir
      run: sudo git clean -ffdx
    - name: Clone repository
      uses: actions/checkout@v5
      with:
        repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
        ref: ${{github.event.inputs.repo_ref || 'main'}}
    - name: conda env
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        pip list
    # Copy the shared rjob.py into the workspace and substitute this run's id
    # for the TASK_ID='none' placeholder.
    - name: change rjob.py
      run: |
        cp /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/config/rjob.py .
        sed -i "s/TASK_ID='none'/TASK_ID='${{ github.run_id }}'/g" rjob.py
    - name: Run test
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        opencompass autotest/cluster/chat_models.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cluster --reuse --dump-res-length
    # First comparison checks predictions, the second checks results.
    - name: Assert result
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        python autotest/utils/compare_results.py compare_results ${{env.REPORT_ROOT}}/${{ github.run_id }}/cluster ${{env.REPORT_ROOT}}/${{env.BASELINE_DIR}}/cluster predictions
        python autotest/utils/compare_results.py compare_results ${{env.REPORT_ROOT}}/${{ github.run_id }}/cluster ${{env.REPORT_ROOT}}/${{env.BASELINE_DIR}}/cluster results
================================================
FILE: .github/workflows/link-check.yml
================================================
# Crawl the published documentation site with linkchecker once a day.
name: "Link check"
on:
  schedule:
    # Daily at 01:30.
    - cron: "30 1 * * *"
  # Manual runs are allowed as well.
  workflow_dispatch:
jobs:
  link-check:
    runs-on: ubuntu-latest
    steps:
      # Checkout is left disabled; the steps below only crawl the hosted site.
      # - uses: actions/checkout@v3
      - name: Install linkchecker
        run: |
          pip install linkchecker
      # The --ignore-url patterns exclude the listed assets and pages from checking.
      - name: Run linkchecker
        run: |
          linkchecker https://opencompass.readthedocs.io/ --no-robots -t 30 --no-warnings \
          --ignore-url "https://opencompass.readthedocs.io/.*/static/images/opencompass_logo.svg" \
          --ignore-url "https://opencompass.readthedocs.io/.*/_static/images/icon-menu-dots.svg" \
          --ignore-url "https://opencompass.readthedocs.io/policy" \
          --ignore-url "https://opencompass.readthedocs.io/(en|zh_CN)/[0-9a-f]{40}/.*"
================================================
FILE: .github/workflows/lint.yml
================================================
# Run the repository's pre-commit hooks on every push and pull request.
name: lint
on:
  - push
  - pull_request
# A newer run on the same ref cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.10
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
      # pre-commit and mmengine are pinned to fixed versions.
      - name: Install pre-commit hook
        run: |
          pip install pre-commit==3.8.0 mmengine==0.10.5
          pre-commit install
      - name: Linting
        run: pre-commit run --all-files
================================================
FILE: .github/workflows/pr-run-test.yml
================================================
# PR smoke test: triggered by pull requests (except changes limited to the
# ignored paths), by manual dispatch, and by a daily cron.
name: pr_run_test
on:
  pull_request:
    paths-ignore:
      - "README.md"
      - "README_zh-CN.md"
      - "docs/**"
      - "configs/**"
      - "tools/**"
  workflow_dispatch:
  schedule:
    - cron: "56 22 * * *"
# A newer run on the same ref cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
# Workflow-wide environment shared by all jobs below.
env:
  CONDA_ENV: pr_test
  # Offline switches for the Hugging Face libraries.
  HF_DATASETS_OFFLINE: 1
  HF_EVALUATE_OFFLINE: 1
  TRANSFORMERS_OFFLINE: 1
  VLLM_USE_MODELSCOPE: false
  LMDEPLOY_USE_MODELSCOPE: false
  HF_HUB_OFFLINE: 1
  # Shared-storage locations for conda, reports and caches.
  CONDA_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3
  REPORT_ROOT: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/eval_report/prtest
  COMPASS_DATA_CACHE: /mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache
  HF_DATASETS_CACHE: /mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/hf_cache
  HF_HUB_CACHE: /mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub
  KUBEBRAIN_CLUSTER_ENTRY: https://h.pjlab.org.cn
  KUBEBRAIN_NAMESPACE: ailab-opencompass
  # Job name passed to rjob; includes the run id and attempt number.
  JOB_NAME: pr-test-${{ github.run_id }}-${{ github.run_attempt }}
jobs:
# Install the PR's code into the pr_test conda env, run pr_test.sh as an rjob,
# wait for it to reach a terminal state, then require each of the three
# regression summaries to report a score whose integer part is within 75..80.
pr_run_test:
  runs-on: yidian_cu12
  timeout-minutes: 45
  steps:
    - name: Checkout repository
      uses: actions/checkout@v2
    - name: Prepare - Install opencompass
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        python3 -m pip uninstall opencompass -y
        python3 -m pip install .[full]
        conda info --envs
        pip list
        lmdeploy check_env
    # Submit the job, then poll its status every 6 seconds for at most 300
    # rounds; the loop stops at the first terminal status without judging it.
    - name: Run test
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        pip list
        rjob submit --metadata-name=${{ env.JOB_NAME }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=2 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu/xpuyu:torch-2.6.0-45d96d5f-0607 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --host-network=True -- bash -exc '/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/pr_test.sh ${{env.REPORT_ROOT}}/${{ github.run_id }}'
        for i in {1..300}; do
          current_status=$(rjob get ${{ env.JOB_NAME }} | grep -oP 'rjob [^:]+: \K[^ ]+')
          if [[ $current_status == "Succeeded" || $current_status == "Failed" || $current_status == "Stopped" ]]; then
            echo "Current status: $current_status, stop checking"
            break
          fi
          sleep 6
        done
    # The score is the last comma-separated field of the last line of the
    # summary CSV; the check stops at the first regression result out of range.
    - name: Get result
      run: |
        for idx in 1 2 3; do
          score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result${idx}/*/summary/*.csv | awk -F ',' '{print $NF}')
          if (( ${score%.*} >= 75 && ${score%.*} <= 80 )); then
            echo "score is $score between 75 and 80"
          else
            echo "score is $score not between 75 and 80"
            exit 1
          fi
        done
    # Cleanup runs regardless of earlier step results.
    - name: Uninstall opencompass
      if: always()
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        python3 -m pip uninstall opencompass -y
        conda info --envs
        rjob stop job ${{ env.JOB_NAME }}
# Post a failure message to the webhook when a needed job failed and the run
# is on the develop or main ref.
notify_to_feishu:
  if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
  needs:
    - pr_run_test
  timeout-minutes: 5
  runs-on: self-hosted
  steps:
    # The JSON payload carries the branch, the workflow name, a link to this
    # run and an @-mention; user id and webhook URL come from repository secrets.
    - name: notify
      run: |
        curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- pr test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}
================================================
FILE: .github/workflows/pr-stage-check.yml
================================================
# Build-and-dry-run checks for pull requests; changes limited to the ignored
# paths do not trigger the workflow.
name: pr_stage_test
on:
  pull_request:
    paths-ignore:
      - "README.md"
      - "README_zh-CN.md"
      - "docs/**"
      - "configs/**"
      - "tools/**"
# A newer run on the same ref cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
# CPU build on a GitHub-hosted Ubuntu runner: install torch from the CPU wheel
# index, install opencompass in editable mode, fetch the core dataset archive
# and perform a dry run.
build:
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version:
        - '3.10'
      include:
        - torch: 2.9.0
  steps:
    - name: Free disk space
      uses: jlumbroso/free-disk-space@main
      with:
        # Setting tool-cache to "true" frees about 6 GB but may remove tools
        # that are actually needed.
        tool-cache: false
        docker-images: false
        # The options below default to true; switch any of them to "false" if
        # the workflow turns out to need what they remove.
        android: true
        dotnet: true
        haskell: true
        large-packages: true
        swap-storage: false
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    - name: Upgrade pip
      run: python -m pip install --upgrade pip
    - name: Install PyTorch
      run: pip install torch==${{matrix.torch}} -f https://download.pytorch.org/whl/cpu/torch_stable.html
    # Appends an extra apt source line before installing the system packages.
    - name: Install system dependencies
      run: |
        sudo sed -i '$ a deb http://th.archive.ubuntu.com/ubuntu jammy main' /etc/apt/sources.list
        sudo apt-get update && sudo apt-get install -y libc6 libffi-dev libncursesw6 wget unzip
    - name: Upgrade pip
      run: python -m pip install pip --upgrade
    - name: Install opencompass dependencies
      run: |
        python -m pip install -r requirements.txt
    - name: Build and install
      run: python -m pip install -e .
    - name: Prepare dataset
      run: |
        wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
        unzip OpenCompassData-core-20240207.zip
    # --dry-run is passed to run.py, so no real evaluation is requested here.
    - name: Dry run test
      run: |
        python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run
# Same install-and-dry-run sequence as the CPU build, executed inside the
# nvidia/cuda 13.1.1 cudnn-runtime Ubuntu 24.04 container image.
build_cu131:
  runs-on: ubuntu-latest
  container:
    image: nvidia/cuda:13.1.1-cudnn-runtime-ubuntu24.04
  strategy:
    matrix:
      python-version:
        - '3.10'
  steps:
    - name: Free disk space
      uses: jlumbroso/free-disk-space@main
      with:
        # Setting tool-cache to "true" frees about 6 GB but may remove tools
        # that are actually needed.
        tool-cache: false
        docker-images: false
        # The options below default to true; switch any of them to "false" if
        # the workflow turns out to need what they remove.
        android: true
        dotnet: true
        haskell: true
        large-packages: true
        swap-storage: false
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    - name: Fetch GPG keys
      run: |
        apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
        apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
    # Conditional on the matrix python-version differing from 3.10.
    - name: Install Python-dev
      if: ${{matrix.python-version != 3.10}}
      run: apt-get update && apt-get install -y python${{matrix.python-version}}-dev
    # No sudo here, unlike the CPU build job.
    - name: Install system dependencies
      run: |
        apt-get update
        apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev libc6 libc6-dev
        sed -i '$ a deb http://th.archive.ubuntu.com/ubuntu jammy main' /etc/apt/sources.list
        apt-get update && apt-get install -y libc6 libffi-dev libncursesw6 wget unzip
    - name: Upgrade pip
      run: python -m pip install pip --upgrade
    - name: Install opencompass dependencies
      run: |
        python -m pip install -r requirements.txt
    - name: Build and install
      run: python -m pip install -e .
    - name: Prepare dataset
      run: |
        wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
        unzip OpenCompassData-core-20240207.zip
    # --dry-run is passed to run.py, so no real evaluation is requested here.
    - name: Dry run test
      run: |
        python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run
build_windows:
runs-on: windows-2022
strategy:
matrix:
python-version: ['3.10']
platform: [cpu]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Upgrade pip
run: python -m pip install pip --upgrade
- name: Install PyTorch
run: pip install torch==2.5.1 -f https://download.pytorch.org/whl/cpu/torch_stable.html
- name: Install opencompass dependencies
run: |
pip install -r requirements.txt
- name: Build and install
run: pip install -e .
- name: Prepare dataset
run: |
Invoke-WebRequest -Uri https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip -OutFile OpenCompassData-core-20240207.zip
unzip OpenCompassData-core-20240207.zip
- name: Dry run test
run: |
python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run
================================================
FILE: .github/workflows/publish-to-pypi.yml
================================================
name: deploy
on:
push:
workflow_dispatch:
inputs:
confirm_publish:
description: 'Type YES to confirm publishing to PyPI'
required: true
type: string
jobs:
build-n-publish:
runs-on: ubuntu-latest
if: |
github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') ||
(github.event_name == 'workflow_dispatch' && inputs.confirm_publish == 'YES')
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.10
uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Build opencompass
run: |
pip install wheel
python setup.py sdist bdist_wheel
- name: Publish distribution to PyPI
run: |
pip install twine
twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }}
================================================
FILE: .github/workflows/unit-test.yml
================================================
name: unit_test
on:
pull_request:
paths-ignore:
- 'README.md'
- 'README_zh-CN.md'
- 'docs/**'
- 'configs/**'
- 'tools/**'
workflow_dispatch:
schedule:
- cron: '56 22 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
CONDA_ENV: unit_test
CONDA_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3
COMPASS_DATA_CACHE: /mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache
KUBEBRAIN_CLUSTER_ENTRY: https://h.pjlab.org.cn
KUBEBRAIN_NAMESPACE: ailab-opencompass
JOB_NAME: unit-test-${{ github.run_id }}-${{ github.run_attempt }}
jobs:
unit_test:
runs-on: yidian_cu12_ut
timeout-minutes: 45
steps:
- name: Checkout repository
uses: actions/checkout@v2
- name: Prepare - Install opencompass
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
python3 -m pip uninstall opencompass -y
python3 -m pip install .[full]
conda info --envs
pip list
lmdeploy check_env
- name: Run test
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
pip list
coverage run --include="**/opencompass/**/*.py" -m pytest tests -s -vv
coverage report -m
- name: Uninstall opencompass
if: always()
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
python3 -m pip uninstall opencompass -y
conda info --envs
================================================
FILE: .gitignore
================================================
.DS_Store
output_*/
outputs/
scripts/
icl_inference_output/
.vscode/
tmp/
configs/eval_subjective_alignbench_test.py
configs/openai_key.py
configs/secrets.py
configs/datasets/log.json
configs/eval_debug*.py
configs/viz_*.py
configs/**/*_bkup.py
opencompass/**/*_bkup.py
data
work_dirs
outputs
models/*
configs/internal/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
*.ipynb
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
.idea
# Auto generate documentation
docs/en/_build/
docs/zh_cn/_build/
# .zip
*.zip
# sft config ignore list
configs/sft_cfg/*B_*
configs/sft_cfg/1B/*
configs/sft_cfg/7B/*
configs/sft_cfg/20B/*
configs/sft_cfg/60B/*
configs/sft_cfg/100B/*
configs/cky/
configs/_internal_legacy*
# in case llama is cloned inside the opencompass directory
llama/
# in case ilagent is cloned inside the opencompass directory
ilagent/
# ignore the config file for criticbench evaluation
configs/sft_cfg/criticbench_eval/*
# path of turbomind's model after running `lmdeploy.serve.turbomind.deploy`
turbomind/
# cibench output
*.db
*.pth
*.pt
*.onnx
*.gz
*.gz.*
*.png
*.txt
*.jpg
*.json
*.jsonl
*.csv
*.npy
*.c
# aliyun
core.*
================================================
FILE: .owners.yml
================================================
assign:
issues: enabled
pull_requests: disabled
strategy:
# random
daily-shift-based
scedule:
'*/1 * * * *'
assignees:
- bittersweet1999
- liushz
- MaiziXiao
- acylam
- tonysy
================================================
FILE: .pre-commit-config-zh-cn.yaml
================================================
exclude: |
(?x)^(
tests/data/|
tests/datasets/|
tests/models/|
opencompass/models/internal/|
opencompass/utils/internal/|
opencompass/openicl/icl_evaluator/hf_metrics/|
opencompass/datasets/lawbench/utils|
opencompass/datasets/lawbench/evaluation_functions/|
opencompass/datasets/medbench/|
opencompass/datasets/teval/|
opencompass/datasets/NPHardEval/|
opencompass/datasets/TheoremQA|
opencompass/datasets/subjective/mtbench101.py|
docs/zh_cn/advanced_guides/compassbench_intro.md |
docs/zh_cn/advanced_guides/compassbench_v2_0.md |
opencompass/utils/datasets.py |
opencompass/utils/datasets_info.py
)
repos:
- repo: https://gitee.com/openmmlab/mirrors-flake8
rev: 5.0.4
hooks:
- id: flake8
exclude: |
(?x)^(
opencompass/configs/|
examples/
)
- repo: https://gitee.com/openmmlab/mirrors-isort
rev: 5.11.5
hooks:
- id: isort
exclude: |
(?x)^(
opencompass/configs/|
examples/
)
- repo: https://gitee.com/openmmlab/mirrors-yapf
rev: v0.32.0
hooks:
- id: yapf
exclude: |
(?x)^(
opencompass/configs/|
examples/
)
- repo: https://gitee.com/openmmlab/mirrors-codespell
rev: v2.2.1
hooks:
- id: codespell
exclude: |
(?x)^(
.*\.jsonl|
.*\.md.template|
opencompass/configs/ |
examples/
)
- repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
rev: v4.3.0
hooks:
- id: trailing-whitespace
exclude: |
(?x)^(
dicts/|
projects/.*?/dicts/|
)
- id: check-yaml
- id: end-of-file-fixer
exclude: |
(?x)^(
dicts/|
projects/.*?/dicts/|
)
- id: requirements-txt-fixer
- id: double-quote-string-fixer
- id: check-merge-conflict
- id: fix-encoding-pragma
args: ["--remove"]
- id: mixed-line-ending
args: ["--fix=lf"]
- repo: https://gitee.com/openmmlab/mirrors-mdformat
rev: 0.7.9
hooks:
- id: mdformat
args: ["--number", "--table-width", "200"]
additional_dependencies:
- mdformat-openmmlab
- mdformat_frontmatter
- linkify-it-py
exclude: configs/
- repo: https://gitee.com/openmmlab/mirrors-docformatter
rev: v1.3.1
hooks:
- id: docformatter
args: ["--in-place", "--wrap-descriptions", "79"]
exclude: |
(?x)^(
tests
)
- repo: local
hooks:
- id: update-dataset-suffix
name: dataset suffix updater
entry: ./tools/update_dataset_suffix.py
language: script
pass_filenames: true
require_serial: true
files: ^opencompass/configs/datasets
- repo: local
hooks:
- id: update-dataset-suffix-pacakge
name: dataset suffix updater(package)
entry: ./tools/update_dataset_suffix.py
language: script
pass_filenames: false
# require_serial: true
# files: ^opencompass/configs/datasets
args:
- --root_folder
- opencompass/configs/datasets
# - repo: https://github.com/open-mmlab/pre-commit-hooks
# rev: v0.2.0 # Use the ref you want to point at
# hooks:
# - id: check-algo-readme
# - id: check-copyright
# args: ["mmocr", "tests", "tools"] # these directories will be checked
================================================
FILE: .pre-commit-config.yaml
================================================
exclude: |
(?x)^(
tests/data/|
tests/datasets/|
tests/models/|
opencompass/models/internal/|
opencompass/utils/internal/|
opencompass/openicl/icl_evaluator/hf_metrics/|
opencompass/datasets/lawbench/utils|
opencompass/datasets/lawbench/evaluation_functions/|
opencompass/datasets/medbench/|
opencompass/datasets/matbench/|
opencompass/datasets/teval/|
opencompass/datasets/NPHardEval/|
opencompass/datasets/TheoremQA|
opencompass/datasets/subjective/mtbench101.py|
docs/zh_cn/advanced_guides/compassbench_intro.md |
docs/zh_cn/advanced_guides/compassbench_v2_0.md |
opencompass/utils/datasets.py |
opencompass/utils/datasets_info.py
)
repos:
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8
exclude: |
(?x)^(
opencompass/configs/|
examples/
)
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
hooks:
- id: isort
exclude: |
(?x)^(
opencompass/configs/|
examples/
)
- repo: https://github.com/pre-commit/mirrors-yapf
rev: v0.32.0
hooks:
- id: yapf
exclude: |
(?x)^(
opencompass/configs/|
examples/
)
- repo: https://github.com/codespell-project/codespell
rev: v2.2.1
hooks:
- id: codespell
exclude: |
(?x)^(
.*\.jsonl|
.*\.md.template|
opencompass/configs/ |
examples/
)
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: trailing-whitespace
exclude: |
(?x)^(
dicts/|
projects/.*?/dicts/|
)
- id: check-yaml
- id: end-of-file-fixer
exclude: |
(?x)^(
dicts/|
projects/.*?/dicts/|
)
- id: requirements-txt-fixer
- id: double-quote-string-fixer
- id: check-merge-conflict
- id: fix-encoding-pragma
args: ["--remove"]
- id: mixed-line-ending
args: ["--fix=lf"]
- repo: https://github.com/executablebooks/mdformat
rev: 0.7.9
hooks:
- id: mdformat
args: ["--number", "--table-width", "200"]
additional_dependencies:
- mdformat-openmmlab
- mdformat_frontmatter
- linkify-it-py
exclude: configs/
# - repo: https://github.com/myint/docformatter
# rev: v1.3.1
# hooks:
# - id: docformatter
# args: ["--in-place", "--wrap-descriptions", "79"]
- repo: local
hooks:
- id: update-dataset-suffix
name: dataset suffix updater
entry: ./tools/update_dataset_suffix.py
language: script
pass_filenames: true
require_serial: true
files: ^opencompass/configs/datasets
- repo: local
hooks:
- id: update-dataset-suffix-pacakge
name: dataset suffix updater(package)
entry: ./tools/update_dataset_suffix.py
language: script
pass_filenames: false
# require_serial: true
# files: ^opencompass/configs/datasets
args:
- --root_folder
- opencompass/configs/datasets
# - repo: https://github.com/open-mmlab/pre-commit-hooks
# rev: v0.2.0 # Use the ref you want to point at
# hooks:
# - id: check-algo-readme
# - id: check-copyright
# args: ["mmocr", "tests", "tools"] # these directories will be checked
================================================
FILE: LICENSE
================================================
Copyright 2020 OpenCompass Authors. All rights reserved.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2020 OpenCompass Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: MANIFEST.in
================================================
recursive-include opencompass/configs *.py *.yml *.json *.txt *.md
recursive-include opencompass/openicl/icl_evaluator/hf_metrics *.py
recursive-include opencompass/datasets *.py *.yml *.json *.txt *.md *.yaml
================================================
FILE: README.md
================================================
<div align="center">
<img src="docs/en/_static/image/logo.svg" width="500px"/>
<br />
<br />
[![][github-release-shield]][github-release-link]
[![][github-releasedate-shield]][github-releasedate-link]
[![][github-contributors-shield]][github-contributors-link]<br>
[![][github-forks-shield]][github-forks-link]
[![][github-stars-shield]][github-stars-link]
[![][github-issues-shield]][github-issues-link]
[![][github-license-shield]][github-license-link]
<!-- [](https://pypi.org/project/opencompass/) -->
[🌐Website](https://opencompass.org.cn/) |
[📖CompassHub](https://hub.opencompass.org.cn/home) |
[📊CompassRank](https://rank.opencompass.org.cn/home) |
[📘Documentation](https://opencompass.readthedocs.io/en/latest/) |
[🛠️Installation](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) |
[🤔Reporting Issues](https://github.com/open-compass/opencompass/issues/new/choose)
English | [简体中文](README_zh-CN.md)
[![][github-trending-shield]][github-trending-url]
</div>
<p align="center">
👋 join us on <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=opencompass" target="_blank">WeChat</a>
</p>
> \[!IMPORTANT\]
>
> **Star Us**, and you will receive all release notifications from GitHub without any delay ~ ⭐️
<details>
<summary><kbd>Star History</kbd></summary>
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&theme=dark&type=Date">
<img width="100%" src="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&type=Date">
</picture>
</details>
## 🧭 Welcome
to **OpenCompass**!
Just like a compass guides us on our journey, OpenCompass will guide you through the complex landscape of evaluating large language models. With its powerful algorithms and intuitive interface, OpenCompass makes it easy to assess the quality and effectiveness of your NLP models.
🚩🚩🚩 Explore opportunities at OpenCompass! We're currently **hiring full-time researchers/engineers and interns**. If you're passionate about LLM and OpenCompass, don't hesitate to reach out to us via [email](mailto:zhangsongyang@pjlab.org.cn). We'd love to hear from you!
🔥🔥🔥 We are delighted to announce that **OpenCompass has been recommended by Meta AI**. See Llama's [Get Started](https://ai.meta.com/llama/get-started/#validation) page for more information.
> **Attention**<br />
> Breaking Change Notice: In version 0.4.0, we are consolidating all AMOTIC configuration files (previously located in ./configs/datasets, ./configs/models, and ./configs/summarizers) into the opencompass package. Users are advised to update their configuration references to reflect this structural change.
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2026.02.05\]** OpenCompass now supports Intern-S1-Pro related general and scientific evaluation benchmarks. Please check [Example for Evaluating Intern-S1-Pro](examples/eval_intern_s1_pro.py) and [Model Card](https://huggingface.co/internlm/Intern-S1-Pro) for more details! 🔥🔥🔥
- **\[2025.12.08\]** OpenCompass now supports evaluation for SciReasoner. Please check [Example for Evaluating SciReasoner](examples/eval_scireasoner.py) and [Project GitHub Repo](https://github.com/InternScience/SciReason) for more details! 🔥🔥🔥
- **\[2025.07.26\]** OpenCompass now supports Intern-S1 related general and scientific evaluation benchmarks. Please check [Tutorial for Evaluating Intern-S1](https://opencompass.readthedocs.io/en/latest/user_guides/interns1.html) for more details! 🔥🔥🔥
- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, a flexible evaluation mechanism that allows multiple evaluators to work in sequence. This enables creating customized evaluation pipelines for complex assessment scenarios. Check out the [documentation](docs/en/advanced_guides/llm_judge.md) for more details! 🔥🔥🔥
- **\[2025.03.11\]** We have supported evaluation for `SuperGPQA` which is a great benchmark for measuring LLM knowledge ability 🔥🔥🔥
- **\[2025.02.28\]** We have added a tutorial for `DeepSeek-R1` series model, please check [Evaluating Reasoning Model](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
- **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHVerifyEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥
- **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model which has enhanced performance on reasoning and knowledge-intensive tasks.
- **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](examples/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it.
- **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](examples/eval_musr.py) and give it a spin! 🔥🔥🔥
- **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](examples/eval_babilong.py) and give it a try! 🔥🔥🔥
- **\[2024.10.14\]** We now support the OpenAI multilingual QA dataset [MMMLU](https://huggingface.co/datasets/openai/MMMLU). Feel free to give it a try! 🔥🔥🔥
- **\[2024.09.19\]** We now support [Qwen2.5](https://huggingface.co/Qwen) (0.5B to 72B) with multiple backends (huggingface/vllm/lmdeploy). Feel free to give them a try! 🔥🔥🔥
- **\[2024.09.17\]** We now support OpenAI o1(`o1-mini-2024-09-12` and `o1-preview-2024-09-12`). Feel free to give them a try! 🔥🔥🔥
- **\[2024.09.05\]** We now support answer extraction through model post-processing to provide a more accurate representation of the model's capabilities. As part of this update, we have integrated [XFinder](https://github.com/IAAR-Shanghai/xFinder) as our first post-processing model. For more detailed information, please refer to the [documentation](opencompass/utils/postprocessors/xfinder/README.md), and give it a try! 🔥🔥🔥
- **\[2024.08.20\]** OpenCompass now supports the [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists. 🔥🔥🔥
- **\[2024.08.16\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [RULER](https://arxiv.org/pdf/2404.06654). RULER provides an evaluation of long-context including retrieval, multi-hop tracing, aggregation, and question answering through flexible configurations. Check out the [RULER](configs/datasets/ruler/README.md) evaluation config now! 🔥🔥🔥
- **\[2024.08.09\]** We have released the example data and configuration for the CompassBench-202408, welcome to [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) for more details. 🔥🔥🔥
- **\[2024.08.01\]** We supported the [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) models. Welcome to try! 🔥🔥🔥
- **\[2024.07.23\]** We now support [ModelScope](https://www.modelscope.cn) datasets; you can load them on demand without downloading all the data to your local disk. Welcome to try! 🔥🔥🔥
- **\[2024.07.17\]** We are excited to announce the release of NeedleBench's [technical report](http://arxiv.org/abs/2407.11963). We invite you to visit our [support documentation](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html) for detailed evaluation guidelines. 🔥🔥🔥
- **\[2024.07.04\]** OpenCompass now supports InternLM2.5, which has **outstanding reasoning capability**, **1M Context window** and **stronger tool use**, you can try the models in [OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) and [InternLM](https://github.com/InternLM/InternLM). 🔥🔥🔥
- **\[2024.06.20\]** OpenCompass now supports one-click switching between inference acceleration backends, enhancing the efficiency of the evaluation process. In addition to the default HuggingFace inference backend, it now also supports popular backends [LMDeploy](https://github.com/InternLM/lmdeploy) and [vLLM](https://github.com/vllm-project/vllm). This feature is available via a simple command-line switch and through deployment APIs. For detailed usage, see the [documentation](docs/en/advanced_guides/accelerator_intro.md).🔥🔥🔥.
> [More](docs/en/notes/news.md)
## 📊 Leaderboard
We provide [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`.
You can also refer to [Guide to Reproducing CompassAcademic Leaderboard Results](https://opencompass.readthedocs.io/zh-cn/latest/academic.html) to quickly reproduce the leaderboard results.
<p align="right"><a href="#top">🔝Back to top</a></p>
## 🛠️ Installation
Below are the steps for quick installation and datasets preparation.
### 💻 Environment Setup
We highly recommend using conda to manage your python environment.
- #### Create your virtual environment
```bash
conda create --name opencompass python=3.10 -y
conda activate opencompass
```
- #### Install OpenCompass via pip
```bash
pip install -U opencompass
## Full installation (with support for more datasets)
# pip install "opencompass[full]"
## Environment with model acceleration frameworks
## Manage different acceleration frameworks using virtual environments
## since they usually have dependency conflicts with each other.
# pip install "opencompass[lmdeploy]"
# pip install "opencompass[vllm]"
## API evaluation (i.e. Openai, Qwen)
# pip install "opencompass[api]"
```
- #### Install OpenCompass from source
If you want to use opencompass's latest features, or develop new features, you can also build it from source
```bash
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
# pip install -e ".[full]"
# pip install -e ".[vllm]"
```
### 📂 Data Preparation
You can choose one of the following methods to prepare datasets.
#### Offline Preparation
You can download and extract the datasets with the following commands:
```bash
# Download dataset to data/ folder
wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
unzip OpenCompassData-core-20240207.zip
```
#### Automatic Download from OpenCompass
We support automatically downloading datasets from the OpenCompass storage server. You can run the evaluation with the extra `--dry-run` flag to download these datasets.
Currently, the supported datasets are listed [here](https://github.com/open-compass/opencompass/blob/main/opencompass/utils/datasets_info.py#L259). More datasets will be uploaded soon.
#### (Optional) Automatic Download with ModelScope
You can also use [ModelScope](https://www.modelscope.cn) to load the datasets on demand.
Installation:
```bash
pip install modelscope[framework]
export DATASET_SOURCE=ModelScope
```
Then submit the evaluation task without downloading all the data to your local disk. Available datasets include:
```bash
humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ceval, math, LCSTS, Xsum, winogrande, openbookqa, AGIEval, gsm8k, nq, race, siqa, mbpp, mmlu, hellaswag, ARC, BBH, xstory_cloze, summedits, GAOKAO-BENCH, OCNLI, cmnli
```
Some third-party features, like Humaneval and Llama, may require additional steps to work properly. For detailed steps, please refer to the [Installation Guide](https://opencompass.readthedocs.io/en/latest/get_started/installation.html).
<p align="right"><a href="#top">🔝Back to top</a></p>
## 🏗️ ️Evaluation
After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared, you can start your first evaluation using OpenCompass!
### Your first evaluation with OpenCompass!
OpenCompass supports setting your configs via the CLI or a Python script. For simple evaluation settings we recommend using the CLI; for more complex evaluations, we suggest using a script. You can find more example scripts under the examples folder.
```bash
# CLI
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen
# Python scripts
opencompass examples/eval_chat_demo.py
```
You can find more script examples under [examples](./examples) folder.
### API evaluation
OpenCompass, by its design, does not really discriminate between open-source models and API models. You can evaluate both model types in the same way or even in a single setting.
```bash
export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
# CLI
opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen
# Python scripts
opencompass examples/eval_api_demo.py
# You can use o1_mini_2024_09_12/o1_preview_2024_09_12 for o1 models, we set max_completion_tokens=8192 as default.
```
### Accelerated Evaluation
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
```bash
# CLI
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
# Python scripts
opencompass examples/eval_lmdeploy_demo.py
```
### Supported Models and Datasets
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
```bash
# List all configurations
python tools/list_configs.py
# List all configurations related to llama and mmlu
python tools/list_configs.py llama mmlu
```
#### Supported Models
If the model is not on the list but supported by Huggingface AutoModel class or encapsulation of inference engine based on OpenAI interface (see [docs](https://opencompass.readthedocs.io/en/latest/advanced_guides/new_model.html) for details), you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.
```bash
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
```
#### Supported Datasets
Currently, OpenCompass provides standard recommended configurations for datasets. Generally, config files ending with `_gen.py` or `_llmjudge_gen.py` point to the recommended config we provide for this dataset. You can refer to [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for more details.
```bash
# Recommended Evaluation Config based on Rules
opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
# Recommended Evaluation Config based on LLM Judge
opencompass --datasets aime2024_llmjudge_gen --models hf_internlm2_5_1_8b_chat
```
If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
```bash
CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
```
> \[!TIP\]
>
> `--hf-num-gpus` is used for model parallel(huggingface format), `--max-num-worker` is used for data parallel.
> \[!TIP\]
>
> Configurations with `_ppl` are typically designed for base models.
> Configurations with `_gen` can be used for both base models and chat models.
Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diversified evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/en/latest/get_started/quick_start.html) to learn how to run an evaluation task.
<p align="right"><a href="#top">🔝Back to top</a></p>
## 📣 OpenCompass 2.0
We are thrilled to introduce OpenCompass 2.0, an advanced suite featuring three key components: [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home).

**CompassRank** has been significantly enhanced into a leaderboard that now incorporates both open-source benchmarks and proprietary benchmarks. This upgrade allows for a more comprehensive evaluation of models across the industry.
**CompassHub** presents a pioneering benchmark browser interface, designed to simplify and expedite the exploration and utilization of an extensive array of benchmarks for researchers and practitioners alike. To enhance the visibility of your own benchmark within the community, we warmly invite you to contribute it to CompassHub. You may initiate the submission process by clicking [here](https://hub.opencompass.org.cn/dataset-submit).
**CompassKit** is a powerful collection of evaluation toolkits specifically tailored for Large Language Models and Large Vision-language Models. It provides an extensive set of tools to assess and measure the performance of these complex models effectively. We welcome you to try our toolkits in your research and products.
## ✨ Introduction

OpenCompass is a one-stop platform for large model evaluation, aiming to provide a fair, open, and reproducible benchmark for large model evaluation. Its main features include:
- **Comprehensive support for models and datasets**: Pre-support for 20+ HuggingFace and API models, a model evaluation scheme of 70+ datasets with about 400,000 questions, comprehensively evaluating the capabilities of the models in five dimensions.
- **Efficient distributed evaluation**: One line command to implement task division and distributed evaluation, completing the full evaluation of billion-scale models in just a few hours.
- **Diversified evaluation paradigms**: Support for zero-shot, few-shot, and chain-of-thought evaluations, combined with standard or dialogue-type prompt templates, to easily stimulate the maximum performance of various models.
- **Modular design with high extensibility**: Want to add new models or datasets, customize an advanced task division strategy, or even support a new cluster management system? Everything about OpenCompass can be easily expanded!
- **Experiment management and reporting mechanism**: Use config files to fully record each experiment, and support real-time reporting of results.
## 📖 Dataset Support
We provide a statistical list of all datasets that can be used on this platform in the documentation on the OpenCompass website.
You can quickly find the dataset you need from the list through sorting, filtering, and searching functions.
In addition, we provide a recommended configuration for each dataset, and some datasets also support LLM Judge-based configurations.
Please refer to the dataset statistics chapter of [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for details.
<p align="right"><a href="#top">🔝Back to top</a></p>
## 📖 Model Support
<table align="center">
<tbody>
<tr align="center" valign="bottom">
<td>
<b>Open-source Models</b>
</td>
<td>
<b>API Models</b>
</td>
<!-- <td>
<b>Custom Models</b>
</td> -->
</tr>
<tr valign="top">
<td>
- [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
- [Baichuan](https://github.com/baichuan-inc)
- [BlueLM](https://github.com/vivo-ai-lab/BlueLM)
- [ChatGLM2](https://github.com/THUDM/ChatGLM2-6B)
- [ChatGLM3](https://github.com/THUDM/ChatGLM3-6B)
- [Gemma](https://huggingface.co/google/gemma-7b)
- [InternLM](https://github.com/InternLM/InternLM)
- [LLaMA](https://github.com/facebookresearch/llama)
- [LLaMA3](https://github.com/meta-llama/llama3)
- [Qwen](https://github.com/QwenLM/Qwen)
- [TigerBot](https://github.com/TigerResearch/TigerBot)
- [Vicuna](https://github.com/lm-sys/FastChat)
- [WizardLM](https://github.com/nlpxucan/WizardLM)
- [Yi](https://github.com/01-ai/Yi)
- ……
</td>
<td>
- OpenAI
- Gemini
- Claude
- ZhipuAI(ChatGLM)
- Baichuan
- ByteDance(YunQue)
- Huawei(PanGu)
- 360
- Baidu(ERNIEBot)
- MiniMax(ABAB-Chat)
- SenseTime(nova)
- Xunfei(Spark)
- ……
</td>
</tr>
</tbody>
</table>
<p align="right"><a href="#top">🔝Back to top</a></p>
## 🔜 Roadmap
- [x] Subjective Evaluation
- [x] Release CompassArena.
- [x] Subjective evaluation.
- [x] Long-context
- [x] Long-context evaluation with extensive datasets.
- [ ] Long-context leaderboard.
- [x] Coding
- [ ] Coding evaluation leaderboard.
- [x] Non-python language evaluation service.
- [x] Agent
- [ ] Support various agent frameworks.
- [x] Evaluation of tool use of the LLMs.
- [x] Robustness
- [x] Support various attack methods.
## 👷♂️ Contributing
We appreciate all contributions to improving OpenCompass. Please refer to the [contributing guideline](https://opencompass.readthedocs.io/en/latest/notes/contribution_guide.html) for the best practice.
<!-- Copy-paste in your Readme.md file -->
<!-- Made with [OSS Insight](https://ossinsight.io/) -->
<a href="https://github.com/open-compass/opencompass/graphs/contributors" target="_blank">
<table>
<tr>
<th colspan="2">
<br><img src="https://contrib.rocks/image?repo=open-compass/opencompass"><br><br>
</th>
</tr>
</table>
</a>
## 🤝 Acknowledgements
Some code in this project is cited and modified from [OpenICL](https://github.com/Shark-NLP/OpenICL).
Some datasets and prompt implementations are modified from [chain-of-thought-hub](https://github.com/FranxYao/chain-of-thought-hub) and [instruct-eval](https://github.com/declare-lab/instruct-eval).
## 🖊️ Citation
```bibtex
@misc{2023opencompass,
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
author={OpenCompass Contributors},
howpublished = {\url{https://github.com/open-compass/opencompass}},
year={2023}
}
```
<p align="right"><a href="#top">🔝Back to top</a></p>
[github-contributors-link]: https://github.com/open-compass/opencompass/graphs/contributors
[github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/opencompass?color=c4f042&labelColor=black&style=flat-square
[github-forks-link]: https://github.com/open-compass/opencompass/network/members
[github-forks-shield]: https://img.shields.io/github/forks/open-compass/opencompass?color=8ae8ff&labelColor=black&style=flat-square
[github-issues-link]: https://github.com/open-compass/opencompass/issues
[github-issues-shield]: https://img.shields.io/github/issues/open-compass/opencompass?color=ff80eb&labelColor=black&style=flat-square
[github-license-link]: https://github.com/open-compass/opencompass/blob/main/LICENSE
[github-license-shield]: https://img.shields.io/github/license/open-compass/opencompass?color=white&labelColor=black&style=flat-square
[github-release-link]: https://github.com/open-compass/opencompass/releases
[github-release-shield]: https://img.shields.io/github/v/release/open-compass/opencompass?color=369eff&labelColor=black&logo=github&style=flat-square
[github-releasedate-link]: https://github.com/open-compass/opencompass/releases
[github-releasedate-shield]: https://img.shields.io/github/release-date/open-compass/opencompass?labelColor=black&style=flat-square
[github-stars-link]: https://github.com/open-compass/opencompass/stargazers
[github-stars-shield]: https://img.shields.io/github/stars/open-compass/opencompass?color=ffcb47&labelColor=black&style=flat-square
[github-trending-shield]: https://trendshift.io/api/badge/repositories/6630
[github-trending-url]: https://trendshift.io/repositories/6630
================================================
FILE: README_zh-CN.md
================================================
<div align="center">
<img src="docs/zh_cn/_static/image/logo.svg" width="500px"/>
<br />
<br />
[![][github-release-shield]][github-release-link]
[![][github-releasedate-shield]][github-releasedate-link]
[![][github-contributors-shield]][github-contributors-link]<br>
[![][github-forks-shield]][github-forks-link]
[![][github-stars-shield]][github-stars-link]
[![][github-issues-shield]][github-issues-link]
[![][github-license-shield]][github-license-link]
<!-- [](https://pypi.org/project/opencompass/) -->
[🌐官方网站](https://opencompass.org.cn/) |
[📖数据集社区](https://hub.opencompass.org.cn/home) |
[📊性能榜单](https://rank.opencompass.org.cn/home) |
[📘文档教程](https://opencompass.readthedocs.io/zh_CN/latest/index.html) |
[🛠️安装](https://opencompass.readthedocs.io/zh_CN/latest/get_started/installation.html) |
[🤔报告问题](https://github.com/open-compass/opencompass/issues/new/choose)
[English](/README.md) | 简体中文
[![][github-trending-shield]][github-trending-url]
</div>
<p align="center">
👋 加入我们的 <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> 和 <a href="https://r.vansin.top/?r=opencompass" target="_blank">微信社区</a>
</p>
> \[!IMPORTANT\]
>
> **收藏项目**,你将能第一时间获取 OpenCompass 的最新动态~⭐️
<details>
<summary><kbd>Star History</kbd></summary>
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&theme=dark&type=Date">
<img width="100%" src="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&type=Date">
</picture>
</details>
## 🧭 欢迎
来到**OpenCompass**!
就像指南针在我们的旅程中为我们导航一样,我们希望OpenCompass能够帮助你穿越评估大型语言模型的重重迷雾。OpenCompass提供丰富的算法和功能支持,期待OpenCompass能够帮助社区更便捷地对NLP模型的性能进行公平全面的评估。
🚩🚩🚩 欢迎加入 OpenCompass!我们目前**招聘全职研究人员/工程师和实习生**。如果您对 LLM 和 OpenCompass 充满热情,请随时通过[电子邮件](mailto:zhangsongyang@pjlab.org.cn)与我们联系。我们非常期待与您交流!
🔥🔥🔥 祝贺 **OpenCompass 作为大模型标准测试工具被Meta AI官方推荐**, 点击 Llama 的 [入门文档](https://ai.meta.com/llama/get-started/#validation) 获取更多信息。
> **注意**<br />
> 重要通知:从 v0.4.0 版本开始,所有位于 ./configs/datasets、./configs/models 和 ./configs/summarizers 目录下的 AMOTIC 配置文件将迁移至 opencompass 包中。请及时更新您的配置文件路径。
## 🚀 最新进展 <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2026.02.05\]** OpenCompass 现已支持Intern-S1-Pro相关的通用及科学评测基准,请参阅[Intern-S1-Pro评测示例](examples/eval_intern_s1_pro.py)和[模型信息](https://huggingface.co/internlm/Intern-S1-Pro)了解详情!🔥🔥🔥
- **\[2025.12.08\]** OpenCompass 现已支持SciReasoner评测,请参阅[SciReasoner评测示例](examples/eval_scireasoner.py)和[原项目地址](https://github.com/InternScience/SciReason)了解详情!🔥🔥🔥
- **\[2025.07.26\]** OpenCompass 现已支持Intern-S1相关的通用及科学评测基准,请参阅[Intern-S1评测教程](https://opencompass.readthedocs.io/zh-cn/latest/user_guides/interns1.html)了解详情!🔥🔥🔥
- **\[2025.04.01\]** OpenCompass 现已支持 `CascadeEvaluator`,允许多个评估器按顺序工作,可以为更复杂的评估场景创建自定义评估流程,查看[文档](docs/zh_cn/advanced_guides/llm_judge.md)了解具体用法!🔥🔥🔥
- **\[2025.03.11\]** 现已支持 `SuperGPQA` 覆盖285 个研究生学科的知识能力评测,欢迎尝试!🔥🔥🔥
- **\[2025.02.28\]** 我们为 `DeepSeek-R1` 系列模型添加了教程,请查看 [评估推理模型](docs/zh_cn/user_guides/deepseek_r1.md) 了解更多详情!🔥🔥🔥
- **\[2025.02.15\]** 我们新增了两个实用的评测工具:用于LLM作为评判器的`GenericLLMEvaluator`和用于数学推理评估的`MATHVerifyEvaluator`。查看[LLM评判器](docs/zh_cn/advanced_guides/llm_judge.md)和[数学能力评测](docs/zh_cn/advanced_guides/general_math.md)文档了解更多详情!🔥🔥🔥
- **\[2025.01.16\]** 我们现已支持 [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) 模型,该模型在推理、知识类任务上取得同量级最优性能,欢迎尝试。
- **\[2024.12.17\]** 我们提供了12月CompassAcademic学术榜单评估脚本 [CompassAcademic](configs/eval_academic_leaderboard_202412.py),你可以通过简单地配置复现官方评测结果。
- **\[2024.10.14\]** 现已支持OpenAI多语言问答数据集[MMMLU](https://huggingface.co/datasets/openai/MMMLU),欢迎尝试! 🔥🔥🔥
- **\[2024.09.19\]** 现已支持[Qwen2.5](https://huggingface.co/Qwen)(0.5B to 72B) ,可以使用多种推理后端(huggingface/vllm/lmdeploy), 欢迎尝试! 🔥🔥🔥
- **\[2024.09.05\]** 现已支持OpenAI o1 模型(`o1-mini-2024-09-12` and `o1-preview-2024-09-12`), 欢迎尝试! 🔥🔥🔥
- **\[2024.09.05\]** OpenCompass 现在支持通过模型后处理来进行答案提取,以更准确地展示模型的能力。作为此次更新的一部分,我们集成了 [XFinder](https://github.com/IAAR-Shanghai/xFinder) 作为首个后处理模型。具体信息请参阅 [文档](opencompass/utils/postprocessors/xfinder/README.md),欢迎尝试! 🔥🔥🔥
- **\[2024.08.20\]** OpenCompass 现已支持 [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists。 🔥🔥🔥
- **\[2024.08.16\]** OpenCompass 现已支持全新的长上下文语言模型评估基准——[RULER](https://arxiv.org/pdf/2404.06654)。RULER 通过灵活的配置,提供了对长上下文包括检索、多跳追踪、聚合和问答等多种任务类型的评测,欢迎访问[RULER](configs/datasets/ruler/README.md)。🔥🔥🔥
- **\[2024.07.23\]** 我们支持了[Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)模型,欢迎试用!🔥🔥🔥
- **\[2024.07.23\]** 我们支持了[ModelScope](https://www.modelscope.cn)数据集,您可以按需加载,无需事先下载全部数据到本地,欢迎试用!🔥🔥🔥
- **\[2024.07.17\]** 我们发布了CompassBench-202407榜单的示例数据和评测规则,敬请访问 [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) 获取更多信息。 🔥🔥🔥
- **\[2024.07.17\]** 我们正式发布 NeedleBench 的[技术报告](http://arxiv.org/abs/2407.11963)。诚邀您访问我们的[帮助文档](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/needleinahaystack_eval.html)进行评估。🔥🔥🔥
- **\[2024.07.04\]** OpenCompass 现已支持 InternLM2.5, 它拥有卓越的推理性能、有效支持百万字超长上下文以及工具调用能力整体升级,欢迎访问[OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) 和 [InternLM](https://github.com/InternLM/InternLM) .🔥🔥🔥.
- **\[2024.06.20\]** OpenCompass 现已支持一键切换推理加速后端,助力评测过程更加高效。除了默认的HuggingFace推理后端外,还支持了常用的 [LMDeploy](https://github.com/InternLM/lmdeploy) 和 [vLLM](https://github.com/vllm-project/vllm) ,支持命令行一键切换和部署 API 加速服务两种方式,详细使用方法见[文档](docs/zh_cn/advanced_guides/accelerator_intro.md)。欢迎试用!🔥🔥🔥.
> [更多](docs/zh_cn/notes/news.md)
## 📊 性能榜单
我们将陆续提供开源模型和 API 模型的具体性能榜单,请见 [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) 。如需加入评测,请提供模型仓库地址或标准的 API 接口至邮箱 `opencompass@pjlab.org.cn`.
你也可以参考[学术榜单精度复现教程](https://opencompass.readthedocs.io/zh-cn/latest/academic.html),快速地复现榜单的结果。
<p align="right"><a href="#top">🔝返回顶部</a></p>
## 🛠️ 安装指南
下面提供了快速安装和数据集准备的步骤。
### 💻 环境搭建
我们强烈建议使用 `conda` 来管理您的 Python 环境。
- #### 创建虚拟环境
```bash
conda create --name opencompass python=3.10 -y
conda activate opencompass
```
- #### 通过pip安装OpenCompass
```bash
# 支持绝大多数数据集及模型
pip install -U opencompass
# 完整安装(支持更多数据集)
# pip install "opencompass[full]"
# 模型推理后端,由于这些推理后端通常存在依赖冲突,建议使用不同的虚拟环境来管理它们。
# pip install "opencompass[lmdeploy]"
# pip install "opencompass[vllm]"
# API 测试(例如 OpenAI、Qwen)
# pip install "opencompass[api]"
```
- #### 基于源码安装OpenCompass
如果希望使用 OpenCompass 的最新功能,也可以从源代码构建它:
```bash
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
# pip install -e ".[full]"
# pip install -e ".[vllm]"
```
### 📂 数据准备
#### 提前离线下载
OpenCompass支持使用本地数据集进行评测,数据集的下载和解压可以通过以下命令完成:
```bash
# 下载数据集到 data/ 处
wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
unzip OpenCompassData-core-20240207.zip
```
#### 从 OpenCompass 自动下载
我们已经支持从OpenCompass存储服务器自动下载数据集。您可以通过额外的 `--dry-run` 参数来运行评估以下载这些数据集。
目前支持的数据集列表在[这里](https://github.com/open-compass/opencompass/blob/main/opencompass/utils/datasets_info.py#L259)。更多数据集将会很快上传。
#### (可选) 使用 ModelScope 自动下载
另外,您还可以使用[ModelScope](https://www.modelscope.cn)来加载数据集:
环境准备:
```bash
pip install modelscope
export DATASET_SOURCE=ModelScope
```
配置好环境后,无需下载全部数据,直接提交评测任务即可。目前支持的数据集有:
```bash
humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ceval, math, LCSTS, Xsum, winogrande, openbookqa, AGIEval, gsm8k, nq, race, siqa, mbpp, mmlu, hellaswag, ARC, BBH, xstory_cloze, summedits, GAOKAO-BENCH, OCNLI, cmnli
```
有部分第三方功能,如 Humaneval 以及 Llama,可能需要额外步骤才能正常运行,详细步骤请参考[安装指南](https://opencompass.readthedocs.io/zh_CN/latest/get_started/installation.html)。
<p align="right"><a href="#top">🔝返回顶部</a></p>
## 🏗️ ️评测
在确保按照上述步骤正确安装了 OpenCompass 并准备好了数据集之后,现在您可以开始使用 OpenCompass 进行首次评估!
- ### 首次评测
OpenCompass 支持通过命令行界面 (CLI) 或 Python 脚本来设置配置。对于简单的评估设置,我们推荐使用 CLI;而对于更复杂的评估,则建议使用脚本方式。你可以在examples文件夹下找到更多脚本示例。
```bash
# 命令行界面 (CLI)
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen
# Python 脚本
opencompass examples/eval_chat_demo.py
```
你可以在[examples](./examples) 文件夹下找到更多的脚本示例。
- ### API评测
OpenCompass 在设计上并不区分开源模型与 API 模型。您可以以相同的方式或甚至在同一设置中评估这两种类型的模型。
```bash
export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
# 命令行界面 (CLI)
opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen
# Python 脚本
opencompass examples/eval_api_demo.py
# 现已支持 o1_mini_2024_09_12/o1_preview_2024_09_12 模型, 默认情况下 max_completion_tokens=8192.
```
- ### 推理后端
另外,如果您想使用除 HuggingFace 之外的推理后端来进行加速评估,比如 LMDeploy 或 vLLM,可以通过以下命令进行。请确保您已经为所选的后端安装了必要的软件包,并且您的模型支持该后端的加速推理。更多信息,请参阅关于推理加速后端的文档 [这里](docs/zh_cn/advanced_guides/accelerator_intro.md)。以下是使用 LMDeploy 的示例:
```bash
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
```
- ### 支持的模型与数据集
OpenCompass 预定义了许多模型和数据集的配置,你可以通过 [工具](./docs/zh_cn/tools.md#ListConfigs) 列出所有可用的模型和数据集配置。
```bash
# 列出所有配置
python tools/list_configs.py
# 列出所有跟 llama 及 mmlu 相关的配置
python tools/list_configs.py llama mmlu
```
#### 支持的模型
如果模型不在列表中,但支持 Huggingface AutoModel 类或支持针对 OpenAI 接口的推理引擎封装(详见[官方文档](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/new_model.html)),您仍然可以使用 OpenCompass 对其进行评估。欢迎您贡献维护 OpenCompass 支持的模型和数据集列表。
```bash
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
```
#### 支持的数据集
目前,OpenCompass针对数据集给出了标准的推荐配置。通常,以 `_gen.py` 或 `_llmjudge_gen.py` 结尾的配置文件将指向我们为该数据集提供的推荐配置。您可以参阅[官方文档](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) 的数据集统计章节来获取详细信息。
```bash
# 基于规则的推荐配置
opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
# 基于LLM Judge的推荐配置
opencompass --datasets aime2024_llmjudge_gen --models hf_internlm2_5_1_8b_chat
```
此外,如果你想在多块 GPU 上使用模型进行推理,您可以使用 `--max-num-worker` 参数。
```bash
CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
```
> \[!TIP\]
>
> `--hf-num-gpus` 用于 模型并行(huggingface 格式),`--max-num-worker` 用于数据并行。
> \[!TIP\]
>
> 带 `_ppl` 的配置通常是为基础模型设计的。
> 带 `_gen` 的配置可以同时用于基础模型和对话模型。
通过命令行或配置文件,OpenCompass 还支持评测 API 或自定义模型,以及更多样化的评测策略。请阅读[快速开始](https://opencompass.readthedocs.io/zh_CN/latest/get_started/quick_start.html)了解如何运行一个评测任务。
更多教程请查看我们的[文档](https://opencompass.readthedocs.io/zh_CN/latest/index.html)。
<p align="right"><a href="#top">🔝返回顶部</a></p>
## 📣 OpenCompass 2.0
我们很高兴发布 OpenCompass 司南 2.0 大模型评测体系,它主要由三大核心模块构建而成:[CompassKit](https://github.com/open-compass)、[CompassHub](https://hub.opencompass.org.cn/home)以及[CompassRank](https://rank.opencompass.org.cn/home)。
**CompassRank** 系统进行了重大革新与提升,现已成为一个兼容并蓄的排行榜体系,不仅囊括了开源基准测试项目,还包含了私有基准测试。此番升级极大地拓宽了对行业内各类模型进行全面而深入测评的可能性。
**CompassHub** 创新性地推出了一个基准测试资源导航平台,其设计初衷旨在简化和加快研究人员及行业从业者在多样化的基准测试库中进行搜索与利用的过程。为了让更多独具特色的基准测试成果得以在业内广泛传播和应用,我们热忱欢迎各位将自定义的基准数据贡献至CompassHub平台。只需轻点鼠标,通过访问[这里](https://hub.opencompass.org.cn/dataset-submit),即可启动提交流程。
**CompassKit** 是一系列专为大型语言模型和大型视觉-语言模型打造的强大评估工具合集,它所提供的全面评测工具集能够有效地对这些复杂模型的功能性能进行精准测量和科学评估。在此,我们诚挚邀请您在学术研究或产品研发过程中积极尝试运用我们的工具包,以助您取得更加丰硕的研究成果和产品优化效果。
## ✨ 介绍

OpenCompass 是面向大模型评测的一站式平台。其主要特点如下:
- **开源可复现**:提供公平、公开、可复现的大模型评测方案
- **全面的能力维度**:五大维度设计,提供 70+ 个数据集约 40 万题的模型评测方案,全面评估模型能力
- **丰富的模型支持**:已支持 20+ HuggingFace 及 API 模型
- **分布式高效评测**:一行命令实现任务分割和分布式评测,数小时即可完成千亿模型全量评测
- **多样化评测范式**:支持零样本、小样本及思维链评测,结合标准型或对话型提示词模板,轻松激发各种模型最大性能
- **灵活化拓展**:想增加新模型或数据集?想要自定义更高级的任务分割策略,甚至接入新的集群管理系统?OpenCompass 的一切均可轻松扩展!
## 📖 数据集支持
我们已经在OpenCompass官网的文档中支持了所有可在本平台上使用的数据集的统计列表。
您可以通过排序、筛选和搜索等功能从列表中快速找到您需要的数据集。
详情请参阅 [官方文档](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) 的数据集统计章节。
<p align="right"><a href="#top">🔝返回顶部</a></p>
## 📖 模型支持
<table align="center">
<tbody>
<tr align="center" valign="bottom">
<td>
<b>开源模型</b>
</td>
<td>
<b>API 模型</b>
</td>
<!-- <td>
<b>自定义模型</b>
</td> -->
</tr>
<tr valign="top">
<td>
- [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
- [Baichuan](https://github.com/baichuan-inc)
- [BlueLM](https://github.com/vivo-ai-lab/BlueLM)
- [ChatGLM2](https://github.com/THUDM/ChatGLM2-6B)
- [ChatGLM3](https://github.com/THUDM/ChatGLM3-6B)
- [Gemma](https://huggingface.co/google/gemma-7b)
- [InternLM](https://github.com/InternLM/InternLM)
- [LLaMA](https://github.com/facebookresearch/llama)
- [LLaMA3](https://github.com/meta-llama/llama3)
- [Qwen](https://github.com/QwenLM/Qwen)
- [TigerBot](https://github.com/TigerResearch/TigerBot)
- [Vicuna](https://github.com/lm-sys/FastChat)
- [WizardLM](https://github.com/nlpxucan/WizardLM)
- [Yi](https://github.com/01-ai/Yi)
- ……
</td>
<td>
- OpenAI
- Gemini
- Claude
- ZhipuAI(ChatGLM)
- Baichuan
- ByteDance(YunQue)
- Huawei(PanGu)
- 360
- Baidu(ERNIEBot)
- MiniMax(ABAB-Chat)
- SenseTime(nova)
- Xunfei(Spark)
- ……
</td>
</tr>
</tbody>
</table>
<p align="right"><a href="#top">🔝返回顶部</a></p>
## 🔜 路线图
- [x] 主观评测
- [x] 发布主观评测榜单
- [x] 发布主观评测数据集
- [x] 长文本
- [x] 支持广泛的长文本评测集
- [ ] 发布长文本评测榜单
- [x] 代码能力
- [ ] 发布代码能力评测榜单
- [x] 提供非Python语言的评测服务
- [x] 智能体
- [ ] 支持丰富的智能体方案
- [x] 提供智能体评测榜单
- [x] 鲁棒性
- [x] 支持各类攻击方法
## 👷♂️ 贡献
我们感谢所有的贡献者为改进和提升 OpenCompass 所作出的努力。请参考[贡献指南](https://opencompass.readthedocs.io/zh_CN/latest/notes/contribution_guide.html)来了解参与项目贡献的相关指引。
<a href="https://github.com/open-compass/opencompass/graphs/contributors" target="_blank">
<table>
<tr>
<th colspan="2">
<br><img src="https://contrib.rocks/image?repo=open-compass/opencompass"><br><br>
</th>
</tr>
</table>
</a>
## 🤝 致谢
该项目部分的代码引用并修改自 [OpenICL](https://github.com/Shark-NLP/OpenICL)。
该项目部分的数据集和提示词实现修改自 [chain-of-thought-hub](https://github.com/FranxYao/chain-of-thought-hub), [instruct-eval](https://github.com/declare-lab/instruct-eval)
## 🖊️ 引用
```bibtex
@misc{2023opencompass,
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
author={OpenCompass Contributors},
howpublished = {\url{https://github.com/open-compass/opencompass}},
year={2023}
}
```
<p align="right"><a href="#top">🔝返回顶部</a></p>
[github-contributors-link]: https://github.com/open-compass/opencompass/graphs/contributors
[github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/opencompass?color=c4f042&labelColor=black&style=flat-square
[github-forks-link]: https://github.com/open-compass/opencompass/network/members
[github-forks-shield]: https://img.shields.io/github/forks/open-compass/opencompass?color=8ae8ff&labelColor=black&style=flat-square
[github-issues-link]: https://github.com/open-compass/opencompass/issues
[github-issues-shield]: https://img.shields.io/github/issues/open-compass/opencompass?color=ff80eb&labelColor=black&style=flat-square
[github-license-link]: https://github.com/open-compass/opencompass/blob/main/LICENSE
[github-license-shield]: https://img.shields.io/github/license/open-compass/opencompass?color=white&labelColor=black&style=flat-square
[github-release-link]: https://github.com/open-compass/opencompass/releases
[github-release-shield]: https://img.shields.io/github/v/release/open-compass/opencompass?color=369eff&labelColor=black&logo=github&style=flat-square
[github-releasedate-link]: https://github.com/open-compass/opencompass/releases
[github-releasedate-shield]: https://img.shields.io/github/release-date/open-compass/opencompass?labelColor=black&style=flat-square
[github-stars-link]: https://github.com/open-compass/opencompass/stargazers
[github-stars-shield]: https://img.shields.io/github/stars/open-compass/opencompass?color=ffcb47&labelColor=black&style=flat-square
[github-trending-shield]: https://trendshift.io/api/badge/repositories/6630
[github-trending-url]: https://trendshift.io/repositories/6630
================================================
FILE: autotest/__init__.py
================================================
"""OpenCompass automated test package."""
# Empty export list: `from autotest import *` binds no names.
__all__ = []
================================================
FILE: autotest/cluster/__init__.py
================================================
"""OpenCompass inference test configurations."""
# Empty export list: `from autotest.cluster import *` binds no names.
__all__ = []
================================================
FILE: autotest/cluster/chat_models.py
================================================
from mmengine.config import read_base
from opencompass.models import (HuggingFacewithChatTemplate,
TurboMindModelwithChatTemplate,
VLLMwithChatTemplate)
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
# Names imported under mmengine's `read_base()` context.
with read_base():
    # Datasets evaluated by this config.
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import (  # noqa: F401, E501
        gsm8k_datasets,
    )
    from opencompass.configs.datasets.race.race_gen import (  # noqa: F401, E501
        race_datasets,
    )

    # `eval` and `infer` are imported relatively from `...rjob`
    # (presumably the settings for each run stage; confirm in rjob).
    from ...rjob import eval, infer  # noqa: F401, E501
# Model entry for `Qwen/Qwen3-0.6B-FP8` using the
# `HuggingFacewithChatTemplate` class.
Qwen3_0_6B_FP8_hf = dict(
    type=HuggingFacewithChatTemplate,
    abbr='qwen3_0_6b_fp8-hf',
    path='Qwen/Qwen3-0.6B-FP8',
    max_out_len=16384,
    batch_size=8,
    run_cfg=dict(num_gpus=1),
    # Predictions are post-processed with `extract_non_reasoning_content`.
    pred_postprocessor=dict(type=extract_non_reasoning_content))
# Model entry for `Qwen/Qwen3-0.6B-FP8` using the
# `TurboMindModelwithChatTemplate` class.
Qwen3_0_6B_FP8_turbomind = dict(
    type=TurboMindModelwithChatTemplate,
    abbr='qwen3-0_6b-fp8-turbomind',
    path='Qwen/Qwen3-0.6B-FP8',
    engine_config=dict(session_len=32768, max_batch_size=1),
    # top_k=1 presumably makes decoding greedy; confirm against the backend.
    gen_config=dict(top_k=1, max_new_tokens=16384),
    max_seq_len=32768,
    max_out_len=16384,
    batch_size=1,
    run_cfg=dict(num_gpus=1),
    # Predictions are post-processed with `extract_non_reasoning_content`.
    pred_postprocessor=dict(type=extract_non_reasoning_content))
# Model entry for `Qwen/Qwen3-0.6B-FP8` using the `VLLMwithChatTemplate`
# class.
# NOTE(review): no `pred_postprocessor` is set here, although the hf and
# turbomind entries in this file both use `extract_non_reasoning_content`;
# confirm the omission is intentional.
Qwen3_0_6B_FP8_vllm = dict(
    type=VLLMwithChatTemplate,
    abbr='qwen3-0_6b-fp8-vllm',
    path='Qwen/Qwen3-0.6B-FP8',
    model_kwargs=dict(tensor_parallel_size=1),
    generation_kwargs=dict(temperature=0),  # greedy
    max_seq_len=32768,
    max_out_len=16384,
    batch_size=1,
    run_cfg=dict(num_gpus=1),
)
# Keep only the second entry of the imported RACE dataset list
# (presumably one RACE subset; confirm which one in race_gen).
race_datasets = [race_datasets[1]]

# Concatenate every `*_datasets` list currently defined in this namespace.
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

# Set each dataset's reader `test_range` to '[0:4]' (presumably the first
# four test samples, to keep the run short; confirm in the reader).
for d in datasets:
    d['reader_cfg']['test_range'] = '[0:4]'

# Models under test are listed explicitly. The earlier `sum(...)` over names
# ending in `_model` was dead code: its result was overwritten right away.
models = [Qwen3_0_6B_FP8_hf, Qwen3_0_6B_FP8_turbomind, Qwen3_0_6B_FP8_vllm]
# Summary settings: the dataset abbreviations to report, plus the
# concatenation of every `*_summary_groups` list defined in this namespace.
summarizer = dict(
    dataset_abbrs=['gsm8k', 'race-middle', 'race-high'],
    summary_groups=sum(
        (groups for name, groups in locals().items()
         if name.endswith('_summary_groups')),
        [],
    ),
)
================================================
FILE: autotest/eval/__init__.py
================================================
"""OpenCompass inference test configurations."""
# Empty export list: `from autotest.eval import *` binds no names.
# NOTE(review): the module docstring above says "inference test
# configurations", the same wording as autotest/cluster/__init__.py. For the
# eval package this looks copy-pasted; confirm and reword if so.
__all__ = []
================================================
FILE: autotest/eval/eval_base_fullbench.py
================================================
from mmengine.config import read_base
with read_base():
from autotest.eval.models import base_models
from opencompass.configs.datasets.ARC_c.ARC_c_few_shot_ppl import \
ARC_c_datasets # noqa: F401, E501
from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import \
bbh_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.drop.drop_gen_a2697c import \
drop_datasets # noqa: F401, E501
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \
GaokaoBench_datasets # noqa: F401, E501
from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import \
gpqa_datasets # noqa: F401, E501
# Corebench v1.7
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \
hellaswag_datasets # noqa: F401, E501
# from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import \ # noqa: F401, E501
# humaneval_datasets as humaneval_v2_datasets # noqa: F401, E501
# from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import \ # noqa: F401, E501
# humaneval_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \
math_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
mathbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \
sanitized_mbpp_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import \
mmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.nq.nq_open_1shot_gen_20a989 import \
nq_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_few_shot_ppl import \
race_datasets # noqa: F401, E501
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import \
BoolQ_datasets # noqa: F401, E501
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
TheoremQA_datasets # noqa: F401, E501
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \
triviaqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import \
wikibench_datasets # noqa: F401, E501
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501
from opencompass.configs.summarizers.groups.bbh import \
bbh_summary_groups # noqa: F401, E501
# Summary Groups
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.GaokaoBench import \
GaokaoBench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu import \
mmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups # noqa: F401, E501
# Models under test come from the shared autotest model list.
models = base_models

# Only take RACE-High
race_datasets = [race_datasets[1]]

# BBH: keep two sub-tasks, matched by substring on the dataset abbr.
bbh_datasets = [
    ds for ds in bbh_datasets
    if ('logical_deduction_seven_objects' in ds['abbr']
        or 'multistep_arithmetic_two' in ds['abbr'])
]

# CMMLU: keep the subjects listed below (abbr with the 'cmmlu-' text removed).
cmmlu_datasets = [
    ds for ds in cmmlu_datasets
    if ds['abbr'].replace('cmmlu-', '') in (
        'ancient_chinese',
        'chinese_civil_service_exam',
        'chinese_driving_rule',
        'chinese_food_culture',
        'chinese_foreign_policy',
        'chinese_history',
        'chinese_literature',
        'chinese_teacher_qualification',
        'construction_project_management',
        'elementary_chinese',
        'elementary_commonsense',
        'ethnology',
        'high_school_politics',
        'modern_chinese',
        'traditional_chinese_medicine',
    )
]

# MMLU: keep the subjects listed below (abbr with 'lukaemon_mmlu_' removed).
mmlu_datasets = [
    ds for ds in mmlu_datasets
    if ds['abbr'].replace('lukaemon_mmlu_', '') in (
        'business_ethics',
        'clinical_knowledge',
        'college_medicine',
        'global_facts',
        'human_aging',
        'management',
        'marketing',
        'medical_genetics',
        'miscellaneous',
        'nutrition',
        'professional_accounting',
        'professional_medicine',
        'virology',
    )
]

# MMLU-Pro: first entry only.
mmlu_pro_datasets = [mmlu_pro_datasets[0]]

# MathBench: entries whose abbr mentions 'college'.
mathbench_datasets = [
    ds for ds in mathbench_datasets if 'college' in ds['abbr']
]

# GaokaoBench: the two Math II subsets.
GaokaoBench_datasets = [
    ds for ds in GaokaoBench_datasets
    if ('2010-2022_Math_II_MCQs' in ds['abbr']
        or '2010-2022_Math_II_Fill-in-the-Blank' in ds['abbr'])
]

# Concatenate every ``*_datasets`` list defined so far, except names that
# contain 'dingo' (case-insensitive).
datasets = sum(
    (value for name, value in locals().items()
     if name.endswith('_datasets') and not ('dingo' in name.lower())),
    [],
)

# Concatenate every ``*_summary_groups`` list, then add the combined
# Mathbench group on top.
summary_groups = sum(
    [value for name, value in locals().items()
     if name.endswith('_summary_groups')],
    [],
)
summary_groups += [{
    'name': 'Mathbench',
    'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
}]
# Layout of the result table. Each entry is either a bare name or a
# [name, metric] pair; empty strings separate the sections.
summarizer = {
    'dataset_abbrs': [
        # --- Language ---
        'Language',
        ['race-high', 'accuracy'],
        ['ARC-c', 'accuracy'],
        ['BoolQ', 'accuracy'],
        ['triviaqa_wiki_1shot', 'score'],
        ['nq_open_1shot', 'score'],
        '',
        # --- General Reasoning ---
        'General Reasoning',
        ['drop', 'accuracy'],
        ['bbh', 'naive_average'],
        ['GPQA_diamond', 'accuracy'],
        ['hellaswag', 'accuracy'],
        ['TheoremQA', 'score'],
        ['winogrande', 'accuracy'],
        '',
        # --- Math Calculation ---
        'Math Calculation',
        ['gsm8k', 'accuracy'],
        ['GaokaoBench', 'weighted_average'],
        'GaokaoBench_2010-2022_Math_II_MCQs',
        'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank',
        ['math', 'accuracy'],
        ['Mathbench', 'naive_average'],
        '',
        # --- Knowledge ---
        'Knowledge',
        ['wikibench-wiki-single_choice_cncircular', 'perf_4'],
        ['cmmlu', 'naive_average'],
        ['mmlu', 'naive_average'],
        ['mmlu_pro', 'naive_average'],
        '',
        # --- Code ---
        'Code',
        ['openai_humaneval', 'humaneval_pass@1'],
        ['openai_humaneval_v2', 'humaneval_pass@1'],
        ['sanitized_mbpp', 'score'],
        '',
        ['dingo_en_192', 'score'],
        ['dingo_zh_170', 'score'],
        '',
        # --- MMLU breakdown ---
        'mmlu',
        'mmlu-stem',
        'mmlu-social-science',
        'mmlu-humanities',
        ['mmlu-other', 'accuracy'],
        '',
        # --- CMMLU breakdown ---
        'cmmlu',
        'cmmlu-stem',
        'cmmlu-social-science',
        'cmmlu-humanities',
        'cmmlu-other',
        ['cmmlu-china-specific', 'accuracy'],
        '',
        # --- MMLU-Pro breakdown ---
        'mmlu_pro',
        'mmlu_pro_biology',
        'mmlu_pro_business',
        'mmlu_pro_chemistry',
        'mmlu_pro_computer_science',
        'mmlu_pro_economics',
        'mmlu_pro_engineering',
        'mmlu_pro_health',
        'mmlu_pro_history',
        'mmlu_pro_law',
        'mmlu_pro_math',
        'mmlu_pro_philosophy',
        'mmlu_pro_physics',
        'mmlu_pro_psychology',
        'mmlu_pro_other',
        '',
        # --- BBH sub-tasks ---
        'bbh-logical_deduction_seven_objects',
        'bbh-multistep_arithmetic_two',
        # --- MathBench ---
        '###### MathBench-A: Application Part ######',
        'college',
        'high',
        'middle',
        'primary',
        'arithmetic',
        'mathbench-a (average)',
        '###### MathBench-T: Theory Part ######',
        'college_knowledge',
        'high_knowledge',
        'middle_knowledge',
        'primary_knowledge',
        'mathbench-t (average)',
    ],
    'summary_groups': summary_groups,
}
# ``datasets`` was already assembled earlier in this file with the 'dingo'
# exclusion applied. Rebuilding it here from every ``*_datasets`` name would
# silently drop that exclusion, so the existing list is reused as-is.
for d in datasets:
    # Restrict each dataset to the '[0:4]' slice so the run stays short.
    d['reader_cfg']['test_range'] = '[0:4]'
================================================
FILE: autotest/eval/eval_base_longtext_fullbench.py
================================================
from mmengine.config import read_base
with read_base():
from autotest.eval.models import base_models
from opencompass.configs.datasets.longbench.longbench import \
longbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.needlebench.needlebench_base.needlebench_base_gen import \
needlebench_datasets # noqa: F401, E501
# summarizer
from opencompass.configs.summarizers.groups.longbench import \
longbench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.needlebench import \
needlebench_internal_200k_summarizer # noqa: F401, E501
from opencompass.configs.summarizers.needlebench import (
needlebench_internal_32k_summarizer,
needlebench_internal_100k_summarizer)
# Models under test come from the shared autotest model list.
models = base_models

# Expose each needlebench summarizer's groups under a ``*_summary_groups``
# name.
needlebench_internal_32k_summary_groups = (
    needlebench_internal_32k_summarizer['summary_groups'])
needlebench_internal_100k_summary_groups = (
    needlebench_internal_100k_summarizer['summary_groups'])
needlebench_internal_200k_summary_groups = (
    needlebench_internal_200k_summarizer['summary_groups'])

# Take only the first entry of every non-empty ``*_datasets`` list.
datasets = [
    group[0] for name, group in locals().items()
    if name.endswith('_datasets') and isinstance(group, list) and group
]

for d in datasets:
    # Restrict each dataset to the '[0:4]' slice so the run stays short.
    d['reader_cfg']['test_range'] = '[0:4]'
================================================
FILE: autotest/eval/eval_chat_longtext_fullbench.py
================================================
from mmengine.config import read_base
with read_base():
from autotest.eval.models import models
from opencompass.configs.datasets.babilong.babilong_256k_gen import \
babiLong_256k_datasets # noqa: F401, E501
from opencompass.configs.datasets.longbench.longbench import \
longbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_128k import \
needlebench_datasets as needlebench_128k_datasets # noqa: F401, E501
from opencompass.configs.datasets.ruler.ruler_128k_gen import \
ruler_datasets as ruler_128k_datasets # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
models as lmdeploy_internlm2_5_7b_chat_1m_model # noqa: F401, E501
# Summary Groups
from opencompass.configs.summarizers.groups.babilong import \
babilong_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.longbench import \
longbench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.ruler import \
ruler_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.needlebench import \
needlebench_128k_summarizer # noqa: F401, E501
# Re-bind the imported model list under this config's own ``models`` name.
models = models

# Take only the first entry of every non-empty ``*_datasets`` list.
datasets = [
    group[0] for name, group in locals().items()
    if name.endswith('_datasets') and isinstance(group, list) and group
]

for d in datasets:
    # Restrict each dataset to the '[0:4]' slice so the run stays short.
    d['reader_cfg']['test_range'] = '[0:4]'
================================================
FILE: autotest/eval/eval_chat_obj_fullbench_other.py
================================================
from mmengine.config import read_base
with read_base():
# Datasets
from autotest.eval.models import judge_models, models
from opencompass.configs.chatml_datasets.C_MHChem.C_MHChem_gen import \
datasets as C_MHChem_chatml_datasets # noqa: F401, E501
from opencompass.configs.chatml_datasets.CPsyExam.CPsyExam_gen import \
datasets as CPsyExam_chatml_datasets # noqa: F401, E501
from opencompass.configs.chatml_datasets.MaScQA.MaScQA_gen import \
datasets as MaScQA_chatml_datasets # noqa: F401, E501
from opencompass.configs.chatml_datasets.UGPhysics.UGPhysics_gen import \
datasets as UGPhysics_chatml_datasets # noqa: F401, E501
from opencompass.configs.datasets.eese.eese_llm_judge_gen import \
eese_datasets # noqa: F401, E501
# Re-bind the imported model list under this config's own ``models`` name.
models = models

# First entry of every non-empty ``*_chatml_datasets`` list.
chatml_datasets = [
    v[0] for k, v in locals().items()
    if k.endswith('_chatml_datasets') and isinstance(v, list) and len(v) > 0
]
datasets = [eese_datasets[0]]

# --- Restrict every dataset to the '[0:4]' slice -------------------------
for d in chatml_datasets:
    d['test_range'] = '[0:4]'

for d in datasets:
    if 'reader_cfg' in d:
        d['reader_cfg']['test_range'] = '[0:4]'
    else:
        d['test_range'] = '[0:4]'

    # The evaluator may carry its own dataset config, either directly or
    # nested under ``llm_evaluator``. Each level is checked before it is
    # indexed, so a dataset without that level is skipped instead of raising
    # KeyError. This matches the 'evaluator' check used for judge_cfg below.
    if 'eval_cfg' in d and 'evaluator' in d['eval_cfg']:
        if ('dataset_cfg' in d['eval_cfg']['evaluator'] and 'reader_cfg'
                in d['eval_cfg']['evaluator']['dataset_cfg']):
            d['eval_cfg']['evaluator']['dataset_cfg']['reader_cfg'][
                'test_range'] = '[0:4]'
        if ('llm_evaluator' in d['eval_cfg']['evaluator'] and 'dataset_cfg'
                in d['eval_cfg']['evaluator']['llm_evaluator']
                and 'reader_cfg' in d['eval_cfg']['evaluator']
                ['llm_evaluator']['dataset_cfg']):
            d['eval_cfg']['evaluator']['llm_evaluator']['dataset_cfg'][
                'reader_cfg']['test_range'] = '[0:4]'

# --- Point every existing judge_cfg at the first judge model -------------
obj_judge_model = judge_models[0]

for d in datasets:
    if 'eval_cfg' in d and 'evaluator' in d['eval_cfg']:
        if 'judge_cfg' in d['eval_cfg']['evaluator']:
            d['eval_cfg']['evaluator']['judge_cfg'] = obj_judge_model
        if 'llm_evaluator' in d['eval_cfg']['evaluator'] and 'judge_cfg' in d[
                'eval_cfg']['evaluator']['llm_evaluator']:
            d['eval_cfg']['evaluator']['llm_evaluator'][
                'judge_cfg'] = obj_judge_model

for d in chatml_datasets:
    # ChatML datasets keep ``evaluator`` at the top level of the dataset dict.
    if 'judge_cfg' in d['evaluator']:
        d['evaluator']['judge_cfg'] = obj_judge_model
    if 'llm_evaluator' in d['evaluator'] and 'judge_cfg' in d['evaluator'][
            'llm_evaluator']:
        d['evaluator']['llm_evaluator']['judge_cfg'] = obj_judge_model
================================================
FILE: autotest/eval/eval_chat_obj_fullbench_v5.py
================================================
from mmengine.config import read_base
with read_base():
# read hf models - chat models
# Dataset
from autotest.eval.models import models
from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import \
aime2024_datasets # noqa: F401, E501
from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \
ARC_c_datasets # noqa: F401, E501
# remove because of oom
# from opencompass.configs.datasets.ARC_Prize_Public_Evaluation.arc_prize_public_evaluation_gen_872059 import arc_prize_public_evaluation_datasets # noqa: F401, E501
from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
bbh_datasets # noqa: F401, E501
# from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_complete_gen_faf748 import \ # noqa: F401, E501
# bigcodebench_hard_complete_datasets # noqa: F401, E501
# from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_8815eb import \ # noqa: F401, E501
# bigcodebench_hard_instruct_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import \
cmo_fib_datasets # noqa: F401, E501
from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
drop_datasets # noqa: F401, E501
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
GaokaoBench_datasets # noqa: F401, E501
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
gpqa_datasets # noqa: F401, E501
# new datasets in Fullbench v1.1
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_6e39a4 import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
hellaswag_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \
humaneval_datasets # noqa: F401, E501
from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
ifeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import \
korbench_0shot_single_datasets # noqa: F401, E501
from opencompass.configs.datasets.livecodebench.livecodebench_gen_b2b0fd import \
LCB_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_0shot_gen_11c4b5 import \
math_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
mathbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \
sanitized_mbpp_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \
mmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmmlu_lite.mmmlu_lite_gen_c51a84 import \
mmmlu_lite_datasets # noqa: F401, E501
from opencompass.configs.datasets.musr.musr_gen_3622bb import \
musr_datasets # noqa: F401, E501
from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
nq_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_cot_gen_d95929 import \
race_datasets # noqa: F401, E501
from opencompass.configs.datasets.scicode.scicode_gen_085b98 import \
SciCode_datasets # noqa: F401, E501
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_cot_gen_1d56df import \
BoolQ_datasets # noqa: F401, E501
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
TheoremQA_datasets # noqa: F401, E501
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \
triviaqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.wikibench.wikibench_gen_0978ad import \
wikibench_datasets # noqa: F401, E501
# Summary Groups
from opencompass.configs.summarizers.groups.bbh import \
bbh_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.GaokaoBench import \
GaokaoBench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.korbench import \
korbench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu import \
mmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.musr_average import \
summarizer as musr_summarizer # noqa: F401, E501
from opencompass.configs.summarizers.groups.scicode import \
scicode_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.mmmlu_lite import \
mmmlu_summary_groups # noqa: F401, E501
# Re-bind the imported model list under this config's own ``models`` name.
models = models

# RACE: keep only the second entry of the list.
race_datasets = [race_datasets[1]]

# BBH: keep two sub-tasks, matched by substring on the dataset abbr.
bbh_datasets = [
    ds for ds in bbh_datasets
    if ('logical_deduction_seven_objects' in ds['abbr']
        or 'multistep_arithmetic_two' in ds['abbr'])
]

# CMMLU: keep the subjects listed below (abbr with the 'cmmlu-' text removed).
cmmlu_datasets = [
    ds for ds in cmmlu_datasets
    if ds['abbr'].replace('cmmlu-', '') in (
        'ancient_chinese',
        'chinese_civil_service_exam',
        'chinese_driving_rule',
        'chinese_food_culture',
        'chinese_foreign_policy',
        'chinese_history',
        'chinese_literature',
        'chinese_teacher_qualification',
        'construction_project_management',
        'elementary_chinese',
        'elementary_commonsense',
        'ethnology',
        'high_school_politics',
        'modern_chinese',
        'traditional_chinese_medicine',
    )
]

# MMLU: keep the subjects listed below (abbr with 'lukaemon_mmlu_' removed).
mmlu_datasets = [
    ds for ds in mmlu_datasets
    if ds['abbr'].replace('lukaemon_mmlu_', '') in (
        'business_ethics',
        'clinical_knowledge',
        'college_medicine',
        'global_facts',
        'human_aging',
        'management',
        'marketing',
        'medical_genetics',
        'miscellaneous',
        'nutrition',
        'professional_accounting',
        'professional_medicine',
        'virology',
    )
]

# MMLU-Pro: first entry only.
mmlu_pro_datasets = [mmlu_pro_datasets[0]]

# MMMLU-lite: entries whose abbr contains 'mmlu_lite_AR-XY'.
mmmlu_lite_datasets = [
    ds for ds in mmmlu_lite_datasets if 'mmlu_lite_AR-XY' in ds['abbr']
]

# MathBench: entries whose abbr mentions 'college'.
mathbench_datasets = [
    ds for ds in mathbench_datasets if 'college' in ds['abbr']
]

# GaokaoBench: the two Math II subsets.
GaokaoBench_datasets = [
    ds for ds in GaokaoBench_datasets
    if ('2010-2022_Math_II_MCQs' in ds['abbr']
        or '2010-2022_Math_II_Fill-in-the-Blank' in ds['abbr'])
]

# Concatenate every ``*_datasets`` list except names matching 'scicode'
# (case-insensitive), 'teval' or 'human' (both case-sensitive). The humaneval
# list is then appended explicitly.
datasets = sum(
    (value for name, value in locals().items()
     if name.endswith('_datasets') and not (
         'scicode' in name.lower() or 'teval' in name or 'human' in name)),
    [],
)
datasets.extend(humaneval_datasets)
# datasets += SciCode_datasets

# Expose the MuSR summarizer's groups under a ``*_summary_groups`` name so the
# scan below picks them up.
musr_summary_groups = musr_summarizer['summary_groups']
summary_groups = sum(
    [value for name, value in locals().items()
     if name.endswith('_summary_groups')],
    [],
)
summary_groups += [{
    'name': 'Mathbench',
    'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
}]
# Summarizer
# Layout of the result table. Each entry is either a bare name or a
# [name, metric] pair; empty strings separate the sections.
summarizer = {
    'dataset_abbrs': [
        # --- Language ---
        'Language',
        ['race-high', 'accuracy'],
        ['ARC-c', 'accuracy'],
        ['BoolQ', 'accuracy'],
        ['triviaqa_wiki_1shot', 'score'],
        ['nq_open_1shot', 'score'],
        ['mmmlu_lite', 'naive_average'],
        '',
        # --- Instruction Following ---
        'Instruction Following',
        ['IFEval', 'Prompt-level-strict-accuracy'],
        '',
        # --- General Reasoning ---
        'General Reasoning',
        ['drop', 'accuracy'],
        ['bbh', 'naive_average'],
        ['GPQA_diamond', 'accuracy'],
        ['hellaswag', 'accuracy'],
        ['TheoremQA', 'score'],
        ['musr_average', 'naive_average'],
        ['korbench_single', 'naive_average'],
        ['ARC_Prize_Public_Evaluation', 'accuracy'],
        '',
        # --- Math Calculation ---
        'Math Calculation',
        ['gsm8k', 'accuracy'],
        ['GaokaoBench', 'weighted_average'],
        ['math', 'accuracy'],
        ['cmo_fib', 'accuracy'],
        ['aime2024', 'accuracy'],
        ['Mathbench', 'naive_average'],
        '',
        # --- Knowledge ---
        'Knowledge',
        ['wikibench-wiki-single_choice_cncircular', 'perf_4'],
        ['cmmlu', 'naive_average'],
        ['mmlu', 'naive_average'],
        ['mmlu_pro', 'naive_average'],
        '',
        # --- Code ---
        'Code',
        ['openai_humaneval', 'humaneval_pass@1'],
        ['sanitized_mbpp', 'score'],
        ['humanevalx', 'naive_average'],
        ['ds1000', 'naive_average'],
        ['lcb_code_generation', 'pass@1'],
        ['lcb_code_execution', 'pass@1'],
        ['lcb_test_output', 'pass@1'],
        ['bigcodebench_hard_instruct', 'pass@1'],
        ['bigcodebench_hard_complete', 'pass@1'],
        '',
        # --- Agent ---
        'Agent',
        ['teval', 'naive_average'],
        ['SciCode', 'accuracy'],
        ['SciCode', 'sub_accuracy'],
        '',
        # --- BBH sub-tasks ---
        'bbh-logical_deduction_seven_objects',
        'bbh-multistep_arithmetic_two',
        '',
        # --- MMLU breakdown ---
        'mmlu',
        'mmlu-stem',
        'mmlu-social-science',
        'mmlu-humanities',
        'mmlu-other',
        '',
        # --- CMMLU breakdown ---
        'cmmlu',
        'cmmlu-stem',
        'cmmlu-social-science',
        'cmmlu-humanities',
        'cmmlu-other',
        'cmmlu-china-specific',
        '',
        # --- MMLU-Pro breakdown ---
        'mmlu_pro',
        'mmlu_pro_biology',
        'mmlu_pro_business',
        'mmlu_pro_chemistry',
        'mmlu_pro_computer_science',
        'mmlu_pro_economics',
        'mmlu_pro_engineering',
        'mmlu_pro_health',
        'mmlu_pro_history',
        'mmlu_pro_law',
        'mmlu_pro_math',
        'mmlu_pro_philosophy',
        'mmlu_pro_physics',
        'mmlu_pro_psychology',
        'mmlu_pro_other',
        '',
        # --- DS-1000 breakdown ---
        'ds1000_Pandas',
        'ds1000_Numpy',
        'ds1000_Tensorflow',
        'ds1000_Scipy',
        'ds1000_Sklearn',
        'ds1000_Pytorch',
        'ds1000_Matplotlib',
        '',
        # --- MMMLU-lite breakdown ---
        'mmmlu_lite',
        'openai_mmmlu_lite_AR-XY',
        'openai_mmmlu_lite_BN-BD',
        'openai_mmmlu_lite_DE-DE',
        'openai_mmmlu_lite_ES-LA',
        'openai_mmmlu_lite_FR-FR',
        'openai_mmmlu_lite_HI-IN',
        'openai_mmmlu_lite_ID-ID',
        'openai_mmmlu_lite_IT-IT',
        'openai_mmmlu_lite_JA-JP',
        'openai_mmmlu_lite_KO-KR',
        'openai_mmmlu_lite_PT-BR',
        'openai_mmmlu_lite_SW-KE',
        'openai_mmmlu_lite_YO-NG',
        'openai_mmmlu_lite_ZH-CN',
        '',
        # --- MathBench ---
        '###### MathBench-A: Application Part ######',
        'college',
        'high',
        'middle',
        'primary',
        'arithmetic',
        'mathbench-a (average)',
        '###### MathBench-T: Theory Part ######',
        'college_knowledge',
        'high_knowledge',
        'middle_knowledge',
        'primary_knowledge',
        'mathbench-t (average)',
    ],
    'summary_groups': summary_groups,
}

for d in datasets:
    # Restrict each dataset to the '[0:4]' slice so the run stays short.
    d['reader_cfg']['test_range'] = '[0:4]'
================================================
FILE: autotest/eval/eval_chat_obj_fullbench_v6.py
================================================
from mmengine.config import read_base
with read_base():
from autotest.eval.models import judge_models, models
from opencompass.configs.datasets.aime2024.aime2024_llmjudge_gen_5e9f4f import \
aime2024_datasets # noqa: F401, E501
from opencompass.configs.datasets.aime2025.aime2025_llmjudge_gen_5e9f4f import \
aime2025_datasets # noqa: F401, E501
from opencompass.configs.datasets.ARC_Prize_Public_Evaluation.arc_prize_public_evaluation_gen_fedd04 import \
arc_prize_public_evaluation_datasets # noqa: F401, E501
from opencompass.configs.datasets.bbh.bbh_llmjudge_gen_b5bdf1 import \
bbh_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_2783e5 import \
cmo_fib_datasets # noqa: F401, E501
# General Reasoning
from opencompass.configs.datasets.drop.drop_llmjudge_gen_3857b0 import \
drop_datasets # noqa: F401, E501
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d16acb import \
GaokaoBench_datasets # noqa: F401, E501
from opencompass.configs.datasets.gpqa.gpqa_0shot_nocot_genericllmeval_gen_772ea0 import \
gpqa_datasets # noqa: F401, E501
# Math Calculation
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_17d799 import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_llmjudge_gen_809ef1 import \
hellaswag_datasets # noqa: F401, E501
from opencompass.configs.datasets.korbench.korbench_llmjudge_gen_56cf43 import \
korbench_0shot_single_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_500_llmjudge_gen_6ff468 import \
math_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_4b8f28 import \
mathbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.musr.musr_llmjudge_gen_b47fd3 import \
musr_datasets # noqa: F401, E501
from opencompass.configs.datasets.supergpqa.supergpqa_llmjudge_gen_12b8bc import \
supergpqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \
teval_datasets as teval_en_datasets # noqa: F401, E501
from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \
teval_datasets as teval_zh_datasets # noqa: F401, E501
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_c87d61 import \
triviaqa_datasets # noqa: F401, E501
# Summary Groups
from opencompass.configs.summarizers.groups.bbeh import \
bbeh_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.bbh import \
bbh_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.GaokaoBench import \
GaokaoBench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.korbench import \
korbench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu import \
mmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.musr_average import \
summarizer as musr_summarizer
from opencompass.configs.summarizers.groups.teval import \
teval_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.mmmlu_lite import \
mmmlu_summary_groups # noqa: F401, E501
# Re-bind the imported model list under this config's own ``models`` name.
models = models

# First entry of every non-empty ``*_datasets`` list, skipping names that
# contain 'scicode', 'teval' or 'arc_prize' (case-insensitive). The ARC-Prize
# and T-Eval lists are then appended in full.
datasets = [
    group[0] for name, group in locals().items()
    if name.endswith('_datasets') and not (
        'scicode' in name.lower() or 'teval' in name.lower()
        or 'arc_prize' in name.lower()) and isinstance(group, list) and group
]
datasets.extend(arc_prize_public_evaluation_datasets)
datasets.extend(teval_en_datasets)
datasets.extend(teval_zh_datasets)

# Expose the MuSR summarizer's groups under a ``*_summary_groups`` name so the
# scan below picks them up.
musr_summary_groups = musr_summarizer['summary_groups']
summary_groups = sum(
    [value for name, value in locals().items()
     if name.endswith('_summary_groups')],
    [],
)
summary_groups += [{
    'name': 'Mathbench',
    'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
}]

# Restrict every dataset, and any dataset config nested inside its evaluator,
# to the '[0:4]' slice.
for d in datasets:
    d['reader_cfg']['test_range'] = '[0:4]'
    if 'dataset_cfg' in d['eval_cfg']['evaluator']:
        if 'reader_cfg' in d['eval_cfg']['evaluator']['dataset_cfg']:
            d['eval_cfg']['evaluator']['dataset_cfg']['reader_cfg'][
                'test_range'] = '[0:4]'
    if 'llm_evaluator' in d['eval_cfg']['evaluator']:
        if 'dataset_cfg' in d['eval_cfg']['evaluator']['llm_evaluator']:
            d['eval_cfg']['evaluator']['llm_evaluator']['dataset_cfg'][
                'reader_cfg']['test_range'] = '[0:4]'

# Point every existing judge_cfg at the first judge model.
obj_judge_model = judge_models[0]
for d in datasets:
    if 'judge_cfg' in d['eval_cfg']['evaluator']:
        d['eval_cfg']['evaluator']['judge_cfg'] = obj_judge_model
    if 'llm_evaluator' in d['eval_cfg']['evaluator']:
        if 'judge_cfg' in d['eval_cfg']['evaluator']['llm_evaluator']:
            d['eval_cfg']['evaluator']['llm_evaluator'][
                'judge_cfg'] = obj_judge_model
================================================
FILE: autotest/eval/eval_chat_obj_fullbench_v7.py
================================================
from mmengine.config import read_base
with read_base():
# Datasets
# Instruct Following
# # # # Math Calculation
from autotest.eval.models import judge_models, models
from opencompass.configs.datasets.aime2024.aime2024_cascade_eval_gen_5e9f4f import \
aime2024_datasets # noqa: F401, E501
from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import \
aime2025_datasets # noqa: F401, E501
# # # General Reasoning
from opencompass.configs.datasets.bbeh.bbeh_llmjudge_gen_86c3a0 import \
bbeh_datasets # noqa: F401, E501
from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_complete_gen_2888d3 import \
bigcodebench_hard_complete_datasets # noqa: F401, E501
from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_c3d5ad import \
bigcodebench_hard_instruct_datasets # noqa: F401, E501
from opencompass.configs.datasets.chem_exam.competition_gen import \
chem_competition_instruct_datasets # noqa: F401, E501
from opencompass.configs.datasets.chem_exam.gaokao_gen import \
chem_gaokao_instruct_datasets # noqa: F401, E501
from opencompass.configs.datasets.ChemBench.ChemBench_llmjudge_gen_c584cf import \
chembench_datasets # noqa: F401, E501
from opencompass.configs.datasets.ClimaQA.ClimaQA_Gold_llm_judge_gen_f15343 import \
climaqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_llmjudge_gen_e1cd9a import \
cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.Earth_Silver.Earth_Silver_llmjudge_gen import \
earth_silver_mcq_datasets # noqa: F401, E501
from opencompass.configs.datasets.gpqa.gpqa_cascade_eval_gen_772ea0 import \
gpqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.HLE.hle_llmverify_gen_6ff468 import \
hle_datasets # noqa: F401, E501
# # Coding
from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
ifeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.kcle.kcle_llm_judge_gen import \
kcle_datasets # noqa: F401, E501
from opencompass.configs.datasets.korbench.korbench_single_0shot_cascade_eval_gen_56cf43 import \
korbench_0shot_single_datasets # noqa: F401, E501
from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import \
LCBCodeGeneration_dataset # noqa: F401, E501
from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_cascade_eval_gen_4bce59 import \
livemathbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.matbench.matbench_llm_judge_gen_0e9276 import \
matbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_500_cascade_eval_gen_6ff468 import \
math_datasets # noqa: F401, E501
from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \
sanitized_mbpp_datasets # noqa: F401, E501
from opencompass.configs.datasets.MedXpertQA.MedXpertQA_llmjudge_gen import \
medxpertqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu.mmlu_llmjudge_gen_f4336b import \
mmlu_datasets # noqa: F401, E501
# # # Knowledge
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de import \
mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.OlymMATH.olymmath_llmverify_gen_97b203 import \
olymmath_datasets # noqa: F401, E501
from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import \
olympiadbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.PHYBench.phybench_gen import \
phybench_datasets # noqa: F401, E501
from opencompass.configs.datasets.PHYSICS.PHYSICS_llm_judge_gen_a133a2 import \
physics_datasets # noqa: F401, E501
from opencompass.configs.datasets.ProteinLMBench.ProteinLMBench_llmjudge_gen_a67965 import \
proteinlmbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.R_Bench.rbench_llmjudge_gen_c89350 import \
RBench_datasets # noqa: F401, E501
# # Academic
from opencompass.configs.datasets.SmolInstruct.smolinstruct_0shot_instruct_gen import \
smolinstruct_datasets_0shot_instruct as \
smolinstruct_datasets # noqa: F401, E501
from opencompass.configs.datasets.srbench.srbench_gen import \
srbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.supergpqa.supergpqa_cascade_gen_1545c1 import \
supergpqa_datasets # noqa: F401, E501
# Summary Groups
from opencompass.configs.summarizers.groups.bbeh import \
bbeh_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.korbench import \
korbench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu import \
mmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.OlympiadBench import \
OlympiadBenchPhysics_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.OlympiadBench import ( # noqa: F401, E501
OlympiadBench_summary_groups, OlympiadBenchMath_summary_groups)
from opencompass.configs.summarizers.groups.PHYSICS import \
physics_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.supergpqa import \
supergpqa_summary_groups # noqa: F401, E501
models = models

# Add the latest LCB version: the imported LiveCodeBench config is retagged
# as release v6 (dataset and evaluator) and wrapped in a one-element list.
LCBCodeGeneration_v6_datasets = LCBCodeGeneration_dataset
LCBCodeGeneration_v6_datasets.update(
    abbr='lcb_code_generation_v6',
    release_version='v6',
)
LCBCodeGeneration_v6_datasets['eval_cfg']['evaluator'].update(
    release_version='v6')
LCBCodeGeneration_v6_datasets = [LCBCodeGeneration_v6_datasets]
# Dataset groups whose repeat settings are pinned; the paired number is
# written to both `n` and `k` of every dataset in the group.
repeated_info = [(group, 1) for group in (
    math_datasets,
    gpqa_datasets,
    aime2024_datasets,
    aime2025_datasets,
    olympiadbench_datasets,
    livemathbench_datasets,
    olymmath_datasets,
    korbench_0shot_single_datasets,
)]
for datasets_, num in repeated_info:
    for dataset_ in datasets_:
        dataset_['n'] = dataset_['k'] = num
# Take the first entry of every non-empty `*_datasets` list in this config,
# leaving out names that mention bigcode or humaneval; the two bigcodebench
# hard lists are then appended in full.
datasets = [
    group[0] for name, group in locals().items()
    if name.endswith('_datasets')
    and not ('bigcode' in name.lower() or 'humaneval' in name.lower())
    and isinstance(group, list) and group
]
datasets.extend(bigcodebench_hard_instruct_datasets)
datasets.extend(bigcodebench_hard_complete_datasets)
# Restrict every dataset, and any dataset config nested inside its
# evaluator, to the '[0:4]' test range.
for d in datasets:
    d['reader_cfg']['test_range'] = '[0:4]'
    if 'reader_cfg' in d['eval_cfg']['evaluator'].get('dataset_cfg', {}):
        d['eval_cfg']['evaluator']['dataset_cfg']['reader_cfg'][
            'test_range'] = '[0:4]'
    if 'dataset_cfg' in d['eval_cfg']['evaluator'].get('llm_evaluator', {}):
        d['eval_cfg']['evaluator']['llm_evaluator']['dataset_cfg'][
            'reader_cfg']['test_range'] = '[0:4]'
# Point every existing `judge_cfg` slot (on the evaluator itself or on its
# nested `llm_evaluator`) at the first judge model.
obj_judge_model = judge_models[0]
for d in datasets:
    if 'judge_cfg' in d['eval_cfg']['evaluator']:
        d['eval_cfg']['evaluator']['judge_cfg'] = obj_judge_model
    if 'judge_cfg' in d['eval_cfg']['evaluator'].get('llm_evaluator', {}):
        d['eval_cfg']['evaluator']['llm_evaluator'][
            'judge_cfg'] = obj_judge_model
================================================
FILE: autotest/eval/eval_chat_obj_fullbench_v8.py
================================================
from mmengine.config import read_base
with read_base():
# Datasets
from autotest.eval.models import judge_models, models
from opencompass.configs.datasets.atlas.atlas_val_gen_b2d1b6 import \
atlas_datasets # noqa: F401, E501
from opencompass.configs.datasets.biodata.biodata_task_gen import \
biodata_task_datasets # noqa: F401, E501
from opencompass.configs.datasets.CMPhysBench.cmphysbench_gen import \
cmphysbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.IFBench.IFBench_gen import \
ifbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.livecodebench_pro.livecodebench_pro_gen import \
lcb_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.MolInstructions_chem.mol_instructions_chem_gen import \
mol_gen_selfies_datasets # noqa: F401, E501
from opencompass.configs.datasets.openswi.openswi_gen import \
openswi_datasets # noqa: F401, E501
models = models

# Flatten every `*_datasets` list defined in this config into one list.
datasets = [
    item for name, group in locals().items()
    if name.endswith('_datasets') for item in group
]

for d in datasets:
    # Only datasets that already carry `n` get it reset to 1.
    if 'n' in d:
        d['n'] = 1
    # The test range goes on `reader_cfg` when present, otherwise on the
    # dataset config itself.
    (d['reader_cfg'] if 'reader_cfg' in d else d)['test_range'] = '[0:4]'
    if 'eval_cfg' in d:
        if 'reader_cfg' in d['eval_cfg']['evaluator'].get('dataset_cfg', {}):
            d['eval_cfg']['evaluator']['dataset_cfg']['reader_cfg'][
                'test_range'] = '[0:4]'
        if 'dataset_cfg' in d['eval_cfg']['evaluator'].get(
                'llm_evaluator', {}):
            d['eval_cfg']['evaluator']['llm_evaluator']['dataset_cfg'][
                'reader_cfg']['test_range'] = '[0:4]'
# Route every judge slot to the first judge model. Datasets whose abbr
# contains 'atlas' receive it wrapped as `judgers=[...]`.
obj_judge_model = judge_models[0]
for d in datasets:
    if 'eval_cfg' not in d or 'evaluator' not in d['eval_cfg']:
        continue
    if 'atlas' in d['abbr'] and 'judge_cfg' in d['eval_cfg']['evaluator']:
        d['eval_cfg']['evaluator']['judge_cfg'] = {
            'judgers': [obj_judge_model]
        }
    elif 'judge_cfg' in d['eval_cfg']['evaluator']:
        d['eval_cfg']['evaluator']['judge_cfg'] = obj_judge_model
    elif 'judge_cfg' in d['eval_cfg']['evaluator'].get('llm_evaluator', {}):
        d['eval_cfg']['evaluator']['llm_evaluator'][
            'judge_cfg'] = obj_judge_model
================================================
FILE: autotest/eval/eval_chat_obj_v8.py
================================================
from mmengine.config import read_base
with read_base():
# Datasets
from autotest.eval.models import judge_models, test_models
from opencompass.configs.datasets.aime2026.aime2026_cascade_eval_gen_6ff468 import \
aime2026_datasets # noqa: F401, E501
from opencompass.configs.datasets.biodata.biodata_task_gen import \
biodata_task_datasets # noqa: F401, E501
from opencompass.configs.datasets.hmmt2026.hmmt2026_cascade_eval_gen_6ff468 import \
hmmt2026_datasets # noqa: F401, E501
from opencompass.configs.datasets.MolInstructions_chem.mol_instructions_chem_gen import \
mol_gen_selfies_datasets # noqa: F401, E501
from opencompass.configs.datasets.SciReasoner.scireasoner_gen import ( # noqa: F401, E501
mini_bio_instruction_datasets, mini_composition_material_datasets,
mini_GUE_datasets, mini_LLM4Mat_datasets,
mini_modulus_material_datasets, mini_mol_biotext_datasets,
mini_mol_mol_datasets, mini_mol_protein_datasets, mini_opi_datasets,
mini_PEER_datasets, mini_Retrosynthesis_uspto50k_datasets,
mini_smol_datasets, mini_UMG_Datasets, mini_uncond_material_datasets,
mini_uncond_protein_datasets, mini_uncond_RNA_datasets)
models = test_models
# Collect every imported dataset list into one flat list. The suffix is
# matched case-insensitively: a case-sensitive `endswith('_datasets')` would
# silently skip `mini_UMG_Datasets` (capital "D"), which is imported above
# together with the other SciReasoner lists.
datasets = sum(
    (v for k, v in locals().items() if k.lower().endswith('_datasets')), [])
# Route every judge slot to the first judge model. Datasets whose abbr
# contains 'atlas' receive it wrapped as `judgers=[...]`.
obj_judge_model = judge_models[0]
for d in datasets:
    if 'eval_cfg' not in d or 'evaluator' not in d['eval_cfg']:
        continue
    if 'atlas' in d['abbr'] and 'judge_cfg' in d['eval_cfg']['evaluator']:
        d['eval_cfg']['evaluator']['judge_cfg'] = {
            'judgers': [obj_judge_model]
        }
    elif 'judge_cfg' in d['eval_cfg']['evaluator']:
        d['eval_cfg']['evaluator']['judge_cfg'] = obj_judge_model
    elif 'judge_cfg' in d['eval_cfg']['evaluator'].get('llm_evaluator', {}):
        d['eval_cfg']['evaluator']['llm_evaluator'][
            'judge_cfg'] = obj_judge_model
================================================
FILE: autotest/eval/eval_chat_sub_fullbench.py
================================================
from mmengine.config import read_base
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
with read_base():
# read hf models - chat models
# Dataset
from autotest.eval.models import judge_models, models
from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import \
csimpleqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.SimpleQA.simpleqa_gen_0283c3 import \
simpleqa_datasets # noqa: F401, E501; noqa: F401, E501
from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm_new import \
alignbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_new import \
alpacav2_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_new import \
arenahard_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_new import \
compassarena_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.followbench.followbench_llmeval_new import \
followbench_llmeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.multiround.mtbench101_judge_new import \
mtbench101_datasets # noqa: F401, E501
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_new import \
wildbench_datasets # noqa: F401, E501
models = models

# Flatten every `*_datasets` list except the mtbench101 and wildbench ones,
# which are appended afterwards so they end up last.
datasets = [
    item for name, group in locals().items()
    if name.endswith('_datasets')
    and not ('mtbench101' in name or 'wildbench' in name)
    for item in group
]
datasets.extend(mtbench101_datasets)  # noqa: F401, E501
datasets.extend(wildbench_datasets)  # noqa: F401, E501
# Role mapping for API models: one HUMAN/BOT round in which the BOT turn is
# generated, with SYSTEM kept as a reserved role.
api_meta_template = {
    'round': [
        {'role': 'HUMAN', 'api_role': 'HUMAN'},
        {'role': 'BOT', 'api_role': 'BOT', 'generate': True},
    ],
    'reserved_roles': [{'role': 'SYSTEM', 'api_role': 'SYSTEM'}],
}
judge_models = judge_models

# Subjective evaluation stage: the partitioner receives the models and judge
# models, and tasks run on a local runner with up to 16 workers.
eval = {
    'partitioner': {
        'type': SubjectiveNaivePartitioner,
        'models': models,
        'judge_models': judge_models,
    },
    'runner': {
        'type': LocalRunner,
        'max_num_workers': 16,
        'task': {'type': SubjectiveEvalTask},
    },
}
# Summary groups, declared in one literal. Each entry names a group and the
# subsets it is built from; a subset is either a [dataset, metric] pair or
# the name of another group.
summary_groups = [
    {
        'name': 'compassarena_language',
        'subsets': [['compassarena_language', '内容总结']],
    },
    {
        'name': 'compassarena_knowledge',
        'subsets': [['compassarena_knowledge', '生活常识_ZH']],
    },
    {
        'name': 'compassarena_reason_v2',
        'subsets': [['compassarena_reason_v2', 'reasoning']],
    },
    {
        'name': 'compassarena_math_v2',
        'subsets': [['compassarena_math_v2', '高等数学_ZH']],
    },
    {
        'name': 'compassarena_creationv2_zh',
        'subsets': [['compassarena_creationv2_zh', '内容扩写_ZH']],
    },
    {
        'name': 'CompassArena',
        'subsets': [
            'compassarena_language',
            'compassarena_knowledge',
            'compassarena_reason_v2',
            'compassarena_math_v2',
            'compassarena_creationv2_zh',
        ],
    },
    {
        'name': 'FoFo',
        'subsets': [
            ['fofo_test_prompts', 'overall'],
            ['fofo_test_prompts_cn', 'overall'],
        ],
    },
    {
        'name': 'Followbench',
        'subsets': [
            ['followbench_llmeval_en', 'HSR_AVG'],
            ['followbench_llmeval_en', 'SSR_AVG'],
        ],
    },
]
# Summarizer: headline rows first, then an empty-string entry, then the
# per-category rows.
summarizer = {
    'dataset_abbrs': [
        ['alignment_bench_v1_1', '总分'],
        ['alpaca_eval', 'total'],
        ['arenahard', 'score'],
        ['Followbench', 'naive_average'],
        ['CompassArena', 'naive_average'],
        ['FoFo', 'naive_average'],
        ['mtbench101', 'avg'],
        ['wildbench', 'average'],
        ['simpleqa', 'accuracy_given_attempted'],
        ['chinese_simpleqa', 'given_attempted_accuracy'],
        '',
        # alignment bench categories
        ['alignment_bench_v1_1', '专业能力'],
        ['alignment_bench_v1_1', '数学计算'],
        ['alignment_bench_v1_1', '基本任务'],
        ['alignment_bench_v1_1', '逻辑推理'],
        ['alignment_bench_v1_1', '中文理解'],
        ['alignment_bench_v1_1', '文本写作'],
        ['alignment_bench_v1_1', '角色扮演'],
        ['alignment_bench_v1_1', '综合问答'],
        # alpaca eval subsets
        ['alpaca_eval', 'helpful_base'],
        ['alpaca_eval', 'koala'],
        ['alpaca_eval', 'oasst'],
        ['alpaca_eval', 'selfinstruct'],
        ['alpaca_eval', 'vicuna'],
        # compass arena groups
        ['compassarena_language', 'naive_average'],
        ['compassarena_knowledge', 'naive_average'],
        ['compassarena_reason_v2', 'naive_average'],
        ['compassarena_math_v2', 'naive_average'],
        ['compassarena_creationv2_zh', 'naive_average'],
        # fofo
        ['fofo_test_prompts', 'overall'],
        ['fofo_test_prompts_cn', 'overall'],
        # followbench
        ['followbench_llmeval_en', 'HSR_AVG'],
        ['followbench_llmeval_en', 'SSR_AVG'],
        ['followbench_llmeval_en', 'HSR_L1'],
        ['followbench_llmeval_en', 'HSR_L2'],
        ['followbench_llmeval_en', 'HSR_L3'],
        ['followbench_llmeval_en', 'HSR_L4'],
        ['followbench_llmeval_en', 'HSR_L5'],
        ['followbench_llmeval_en', 'SSR_L1'],
        ['followbench_llmeval_en', 'SSR_L2'],
        ['followbench_llmeval_en', 'SSR_L3'],
        ['followbench_llmeval_en', 'SSR_L4'],
        ['followbench_llmeval_en', 'SSR_L5'],
        ['simpleqa', 'f1'],
    ],
    'type': DefaultSubjectiveSummarizer,
    'summary_groups': summary_groups,
}
================================================
FILE: autotest/eval/models.py
================================================
from opencompass.models import TurboMindModel, TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
# Chat model used by the fullbench configs.
models = [{
    'type': TurboMindModelwithChatTemplate,
    'abbr': 'qwen3-8b-fullbench',
    'path': 'Qwen/Qwen3-8B',
    'engine_config': {'session_len': 32768, 'max_batch_size': 1, 'tp': 1},
    'gen_config': {'do_sample': False, 'enable_thinking': True},
    'max_seq_len': 32768,
    'max_out_len': 32768,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
    'pred_postprocessor': {'type': extract_non_reasoning_content},
}]

# 16-GPU model entry (tp=16).
test_models = [{
    'type': TurboMindModelwithChatTemplate,
    'abbr': 'test_model',
    'path': 'intern/Intern-S1-Pro',
    'engine_config': {'session_len': 32768, 'max_batch_size': 1, 'tp': 16},
    'gen_config': {'do_sample': False, 'enable_thinking': True},
    'max_seq_len': 32768,
    'max_out_len': 32768,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 16},
    'pred_postprocessor': {'type': extract_non_reasoning_content},
}]

# Judge model: same checkpoint as `models`, with 46000-token limits.
judge_models = [{
    'type': TurboMindModelwithChatTemplate,
    'abbr': 'qwen3-8b-fullbench',
    'path': 'Qwen/Qwen3-8B',
    'engine_config': {'session_len': 46000, 'max_batch_size': 1, 'tp': 1},
    'gen_config': {'do_sample': False, 'enable_thinking': True},
    'max_seq_len': 46000,
    'max_out_len': 46000,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
    'pred_postprocessor': {'type': extract_non_reasoning_content},
}]

# Base (non-chat) model entry.
base_models = [{
    'type': TurboMindModel,
    'abbr': 'qwen3-8b-base-fullbench',
    'path': 'Qwen/Qwen3-8B-Base',
    'engine_config': {'session_len': 32768, 'max_batch_size': 1, 'tp': 1},
    'gen_config': {
        'top_k': 1,
        'temperature': 1e-6,
        'top_p': 0.9,
        'max_new_tokens': 1024,
    },
    'max_seq_len': 32768,
    'max_out_len': 1024,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
}]
================================================
FILE: autotest/model/__init__.py
================================================
"""OpenCompass inference test configurations."""
# Empty export list: `from autotest.model import *` exposes no names.
__all__ = []
================================================
FILE: autotest/model/base_datasets.py
================================================
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import \
gpqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.infinitebench.infinitebenchretrievepasskey.infinitebench_retrievepasskey_gen import \
InfiniteBench_retrievepasskey_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501
# humaneval_datasets = [humaneval_datasets[0]]
# Only the first mmlu_pro entry is kept.
mmlu_pro_datasets = [mmlu_pro_datasets[0]]

# Flatten every `*_datasets` list and limit each dataset to '[0:4]'.
datasets = [
    item for name, group in locals().items()
    if name.endswith('_datasets') for item in group
]
for d in datasets:
    d['reader_cfg']['test_range'] = '[0:4]'
================================================
FILE: autotest/model/chat_datasets.py
================================================
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import \
aime2025_datasets # noqa: F401, E501
from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
gsm8k_datasets # noqa: F401, E501
# from opencompass.configs.datasets.mmlu_pro.mmlu_pro_gen import \
# mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.HLE.hle_gen import \
hle_datasets # noqa: F401, E501
# from opencompass.configs.datasets.humaneval.humaneval_gen import \
# humaneval_datasets # noqa: F401, E501
from opencompass.configs.datasets.IFEval.IFEval_gen import \
ifeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.infinitebench.infinitebenchretrievepasskey.infinitebench_retrievepasskey_gen import \
InfiniteBench_retrievepasskey_datasets # noqa: F401, E501
# humaneval_datasets = [humaneval_datasets[0]]
# mmlu_pro_datasets = [mmlu_pro_datasets[0]]
# Keep only the first entry of each multi-entry list.
ifeval_datasets = [ifeval_datasets[0]]
hle_datasets = [hle_datasets[0]]
aime2025_datasets = [aime2025_datasets[0]]
aime2025_datasets[0]['n'] = 2

# Flatten every `*_datasets` list, then restrict each dataset (and any
# dataset config nested in its evaluator) to the '[0:4]' test range.
datasets = [
    item for name, group in locals().items()
    if name.endswith('_datasets') for item in group
]
for d in datasets:
    d['reader_cfg']['test_range'] = '[0:4]'
    if 'reader_cfg' in d['eval_cfg']['evaluator'].get('dataset_cfg', {}):
        d['eval_cfg']['evaluator']['dataset_cfg']['reader_cfg'][
            'test_range'] = '[0:4]'
    if 'dataset_cfg' in d['eval_cfg']['evaluator'].get('llm_evaluator', {}):
        d['eval_cfg']['evaluator']['llm_evaluator']['dataset_cfg'][
            'reader_cfg']['test_range'] = '[0:4]'
================================================
FILE: autotest/model/constant.py
================================================
# Meta template with a SYSTEM `begin` prompt followed by one HUMAN/BOT round;
# the HUMAN turn is filled from `{input}` and the BOT turn is generated.
# The prompt text is a runtime string and must be kept exactly as written.
meta_template = dict(
begin=dict(
role='SYSTEM',
api_role='SYSTEM',
prompt='''
Your answers should be full of happy and lovely tone. Answer the question simply and clearly. Don\'t use any abbreviations and don\'t use any punctuation. Don\'t think too much.''', # noqa
),
round=[ # noqa
dict(role='HUMAN', api_role='HUMAN', prompt='{input}'),
dict(role='BOT', api_role='BOT', generate=True),
])
================================================
FILE: autotest/model/infer_api.py
================================================
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAISDK
from opencompass.models.openai_streaming import OpenAISDKStreaming
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
with read_base():
from autotest.model.chat_datasets import datasets
from autotest.model.constant import meta_template as test_meta_template
datasets = datasets
# Role mapping for API models: one HUMAN/BOT round in which the BOT turn is
# generated, with SYSTEM kept as a reserved role.
api_meta_template = {
    'round': [
        {'role': 'HUMAN', 'api_role': 'HUMAN'},
        {'role': 'BOT', 'api_role': 'BOT', 'generate': True},
    ],
    'reserved_roles': [{'role': 'SYSTEM', 'api_role': 'SYSTEM'}],
}

# Endpoint and model identifiers shared by every API model config below.
API_BASE = 'http://localhost:23333/v1'
MODEL_PATH = TOKENIZER_PATH = 'Qwen/Qwen3-8B'
# Shared settings for the non-streaming OpenAI-SDK models.
BASE_API = {
    'type': OpenAISDK,
    'key': 'EMPTY',
    'openai_api_base': API_BASE,
    'path': MODEL_PATH,
    'tokenizer_path': TOKENIZER_PATH,
    'rpm_verbose': True,
    'meta_template': api_meta_template,
    'query_per_second': 128,
    'batch_size': 128,
    'retry': 20,
    'pred_postprocessor': {'type': extract_non_reasoning_content},
}

# Shared settings for the streaming models: same fields as BASE_API, with
# the streaming model type and `stream=True`.
BASE_STREAMING = {
    'type': OpenAISDKStreaming,
    'key': 'EMPTY',
    'openai_api_base': API_BASE,
    'path': MODEL_PATH,
    'tokenizer_path': TOKENIZER_PATH,
    'rpm_verbose': True,
    'meta_template': api_meta_template,
    'query_per_second': 128,
    'batch_size': 128,
    'stream': True,
    'retry': 20,
    'pred_postprocessor': {'type': extract_non_reasoning_content},
}
# Each variant below extends a base config with its own abbr, length limits
# and, where relevant, extra request options.
API_BASIC = {
    **BASE_API,
    'abbr': 'lmdeploy-api-test',
    'max_out_len': 1024,
    'max_seq_len': 4096,
}

API_STREAMING = {
    **BASE_STREAMING,
    'abbr': 'lmdeploy-api-streaming-test',
    'max_out_len': 1024,
    'max_seq_len': 4096,
}

API_STREAMING_CHUNK = {
    **BASE_STREAMING,
    'abbr': 'lmdeploy-api-streaming-test-chunk',
    'max_out_len': 1024,
    'max_seq_len': 4096,
    'stream_chunk_size': 10,
    'verbose': True,
}

API_MAXLEN = {
    **BASE_API,
    'abbr': 'lmdeploy-api-test-maxlen',
    'max_out_len': 4096,
    'max_seq_len': 4096,
}

API_MAXLEN_MID = {
    **BASE_API,
    'abbr': 'lmdeploy-api-test-maxlen-mid',
    'max_out_len': 3896,
    'max_seq_len': 4096,
    'mode': 'mid',
}

API_NOTHINK = {
    **BASE_API,
    'abbr': 'lmdeploy-api-test-nothink',
    'max_out_len': 4096,
    'max_seq_len': 4096,
    'extra_body': {'enable_thinking': False},
}

API_IGNORE_EOS = {
    **BASE_API,
    'abbr': 'lmdeploy-api-test-ignore-eos',
    'max_out_len': 128,
    'max_seq_len': 4096,
    'extra_body': {'ignore_eos': True},
}

# This variant replaces the shared meta template with the test template.
API_CHAT_TEMPLATE = {
    **BASE_API,
    'meta_template': test_meta_template,
    'abbr': 'lmdeploy-api-test-chat-template',
    'max_out_len': 1024,
    'max_seq_len': 1024,
    'extra_body': {'enable_thinking': False},
}
# Variants that pass additional keyword arguments through
# `openai_extra_kwargs`.
API_OPENAI_STOP = {
    **BASE_API,
    'abbr': 'lmdeploy-api-test-openai-stop',
    'max_out_len': 512,
    'max_seq_len': 4096,
    'openai_extra_kwargs': {
        'stop': [' and', '</think>', ' to', '\n\n', 'Question:', 'Answer:'],
    },
}

API_OPENAI_LOGPROBS = {
    **BASE_API,
    'abbr': 'lmdeploy-api-test-openai-logprobs',
    'max_out_len': 256,
    'max_seq_len': 4096,
    'openai_extra_kwargs': {'logprobs': True, 'top_logprobs': 5},
}

API_OPENAI_COMBINE = {
    **BASE_API,
    'abbr': 'lmdeploy-api-test-openai-combine',
    'max_out_len': 512,
    'max_seq_len': 4096,
    'openai_extra_kwargs': {
        'presence_penalty': 0.3,
        'frequency_penalty': 0.2,
        'top_p': 0.85,
        'seed': 42,
        'user': 'opencompass-regression',
    },
}

# Variant with a 131072-token `max_seq_len`.
API_LONG_OUTPUT_128K = {
    **BASE_API,
    'abbr': 'lmdeploy-api-test-long-output-128k',
    'max_out_len': 4096,
    'max_seq_len': 131072,
}
# All API model variants; every one is forced to temperature 0 below.
models = [
    API_BASIC, API_STREAMING, API_STREAMING_CHUNK,
    API_MAXLEN, API_MAXLEN_MID,
    API_NOTHINK, API_IGNORE_EOS, API_CHAT_TEMPLATE,
    API_OPENAI_STOP, API_OPENAI_LOGPROBS, API_OPENAI_COMBINE,
    API_LONG_OUTPUT_128K,
]
for m in models:
    m.update(temperature=0)
================================================
FILE: autotest/model/infer_api_rollout.py
================================================
from mmengine.config import read_base
from opencompass.models import OpenAISDKRollout
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
with read_base():
from autotest.model.chat_datasets import datasets
datasets = datasets
# Role mapping for API models: one HUMAN/BOT round in which the BOT turn is
# generated, with SYSTEM kept as a reserved role.
api_meta_template = {
    'round': [
        {'role': 'HUMAN', 'api_role': 'HUMAN'},
        {'role': 'BOT', 'api_role': 'BOT', 'generate': True},
    ],
    'reserved_roles': [{'role': 'SYSTEM', 'api_role': 'SYSTEM'}],
}

# Endpoint and model identifiers shared by every rollout config below.
API_BASE = 'http://localhost:23333/v1'
MODEL_PATH = TOKENIZER_PATH = 'Qwen/Qwen3-8B'
# Settings shared by every rollout model variant.
BASE_ROLLOUT = {
    'type': OpenAISDKRollout,
    'key': 'EMPTY',
    'openai_api_base': API_BASE,
    'path': MODEL_PATH,
    'tokenizer_path': TOKENIZER_PATH,
    'rpm_verbose': True,
    'meta_template': api_meta_template,
    'query_per_second': 128,
    'batch_size': 128,
    'retry': 20,
    'pred_postprocessor': {'type': extract_non_reasoning_content},
}
# Rollout variants: each extends BASE_ROLLOUT with its own abbr, length
# limits, temperature and logprob settings, plus optional extra options.
API_ROLLOUT_BASIC = {
    **BASE_ROLLOUT,
    'abbr': 'lmdeploy-api-test-rollout',
    'max_out_len': 1024,
    'max_seq_len': 4096,
    'temperature': 0.01,
    'logprobs': True,
    'top_logprobs': 5,
    'extra_body': {'top_k': 20},
    'openai_extra_kwargs': {'top_p': 0.95},
}

API_ROLLOUT_STOP = {
    **BASE_ROLLOUT,
    'abbr': 'lmdeploy-api-test-rollout-stop',
    'max_out_len': 512,
    'max_seq_len': 4096,
    'temperature': 0.2,
    'logprobs': True,
    'top_logprobs': 5,
    'openai_extra_kwargs': {
        'stop': [' and', '</think>', ' to', '\n\n', 'Question:', 'Answer:'],
        'top_p': 0.9,
    },
}

API_ROLLOUT_COMBINE = {
    **BASE_ROLLOUT,
    'abbr': 'lmdeploy-api-test-rollout-combine',
    'max_out_len': 512,
    'max_seq_len': 4096,
    'temperature': 0.2,
    'logprobs': True,
    'top_logprobs': 5,
    'openai_extra_kwargs': {
        'presence_penalty': 0.3,
        'frequency_penalty': 0.2,
        'top_p': 0.85,
        'seed': 42,
        'user': 'opencompass-regression',
    },
}

API_ROLLOUT_IGNORE_EOS = {
    **BASE_ROLLOUT,
    'abbr': 'lmdeploy-api-test-rollout-ignore-eos',
    'max_out_len': 128,
    'max_seq_len': 4096,
    'temperature': 0.2,
    'logprobs': True,
    'top_logprobs': 5,
    'extra_body': {'ignore_eos': True},
}

API_ROLLOUT_NO_THINK = {
    **BASE_ROLLOUT,
    'abbr': 'lmdeploy-api-test-rollout-no-think',
    'max_out_len': 128,
    'max_seq_len': 4096,
    'temperature': 0.2,
    'logprobs': True,
    'top_logprobs': 5,
    'extra_body': {'enable_thinking': False},
}

API_ROLLOUT_LONG_OUTPUT_128K = {
    **BASE_ROLLOUT,
    'abbr': 'lmdeploy-api-test-rollout-long-output-128k',
    'max_out_len': 1024,
    'max_seq_len': 131072,
    'temperature': 0.01,
    'logprobs': True,
    'top_logprobs': 5,
}
models = [
    API_ROLLOUT_BASIC, API_ROLLOUT_STOP, API_ROLLOUT_COMBINE,
    API_ROLLOUT_IGNORE_EOS, API_ROLLOUT_NO_THINK,
    API_ROLLOUT_LONG_OUTPUT_128K,
]

# Every variant gets the same three sampling overrides inside
# `openai_extra_kwargs`; the dict is created when a variant has none.
for m in models:
    m.setdefault('openai_extra_kwargs', {}).update(
        top_k=1,
        temperature=1.0,
        repetition_penalty=1.0,
    )
================================================
FILE: autotest/model/infer_lmdeploy_base.py
================================================
from mmengine.config import read_base
from opencompass.models import TurboMindModel
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
with read_base():
from autotest.model.base_datasets import datasets
from autotest.model.constant import meta_template as test_meta_template
datasets = datasets

# Baseline config with a 128000-token session and reasoning-content
# postprocessing.
Qwen3_0_6B_Base = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'max_batch_size': 1, 'session_len': 128000},
    'gen_config': {'do_sample': False},
    'max_out_len': 32768,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
    'pred_postprocessor': {'type': extract_non_reasoning_content},
}

# `backend='pytorch'` given inside engine_config.
Qwen3_0_6B_Base_PYTORCH = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-pytorch',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {
        'backend': 'pytorch',
        'session_len': 32768,
        'max_batch_size': 1,
    },
    'gen_config': {'do_sample': False},
    'max_seq_len': 32768,
    'max_out_len': 1024,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
}

# `backend='pytorch'` given as a top-level model argument.
Qwen3_0_6B_Base_BACKEND = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-backend',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'session_len': 32768, 'max_batch_size': 1},
    'gen_config': {'do_sample': False},
    'max_seq_len': 32768,
    'max_out_len': 1024,
    'batch_size': 1,
    'backend': 'pytorch',
    'run_cfg': {'num_gpus': 1},
}

Qwen3_0_6B_Base_IGNORE_EOS = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-ignore-eos',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'session_len': 4096, 'max_batch_size': 1},
    'gen_config': {
        'do_sample': False,
        'max_new_tokens': 128,
        'ignore_eos': True,
    },
    'max_seq_len': 4096,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
}

Qwen3_0_6B_Base_TEMP0 = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-temp0',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'session_len': 4096, 'max_batch_size': 1},
    'gen_config': {'temperature': 0.0, 'do_sample': False},
    'max_seq_len': 32768,
    'max_out_len': 1024,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
}

Qwen3_0_6B_Base_BAD_WORDS = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-bad-words',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'session_len': 4096, 'max_batch_size': 1},
    'gen_config': {
        'temperature': 0.0,
        'do_sample': False,
        'bad_words': ['</think>', '<think>', ' to'],
    },
    'max_seq_len': 32768,
    'max_out_len': 1024,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
}

# Engine session length of only 10 tokens.
Qwen3_0_6B_Base_SESSION_LEN = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-session-len',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'session_len': 10, 'max_batch_size': 1},
    'gen_config': {'temperature': 0.0, 'do_sample': False},
    'max_seq_len': 32768,
    'max_out_len': 8192,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
}
# Test case for max_new_tokens and min_new_tokens
# which should generate between 90 and 100 tokens
Qwen3_0_6B_Base_NEW_TOKENS = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-new-tokens',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'session_len': 4096, 'max_batch_size': 1},
    'gen_config': {
        'temperature': 0.0,
        'do_sample': False,
        'min_new_tokens': 90,
        'max_new_tokens': 100,
    },
    'max_seq_len': 32768,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
}

Qwen3_0_6B_Base_MAX_SEQ_LEN = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-max-seq-len',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'max_batch_size': 1},
    'gen_config': {'do_sample': False},
    'max_seq_len': 200,
    'max_out_len': 100,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
}

Qwen3_0_6B_Base_STOP_WORDS = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-stop-words',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'session_len': 4096, 'max_batch_size': 1},
    'gen_config': {
        'temperature': 0.0,
        'do_sample': False,
        'stopping_criteria': [' and', '</think>', ' to'],
    },
    'max_seq_len': 4096,
    'max_out_len': 4096,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
}

# Uses the test meta template imported from autotest.model.constant.
Qwen3_0_6B_Base_TEMPLATE = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-template',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'session_len': 32768, 'max_batch_size': 1},
    'gen_config': {'do_sample': False, 'max_new_tokens': 256},
    'max_seq_len': 32768,
    'batch_size': 1,
    'meta_template': test_meta_template,
    'run_cfg': {'num_gpus': 1},
}

Qwen3_0_6B_Base_DROP_MIDDLE = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-drop-middle',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'session_len': 32768, 'max_batch_size': 1},
    'gen_config': {'do_sample': False},
    'max_seq_len': 2048,
    'max_out_len': 2000,
    'batch_size': 1,
    'drop_middle': True,
    'run_cfg': {'num_gpus': 1},
}

# Test case for combined parameters
Qwen3_0_6B_BASE_COMBINED = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-combined',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'session_len': 4096, 'max_batch_size': 1},
    'gen_config': {
        'temperature': 0.1,
        'top_p': 0.5,
        'do_sample': False,
        'repetition_penalty': 0.000001,
        'random_seed': 42,
        'max_new_tokens': 128,
        'skip_special_tokens': True,
    },
    'max_seq_len': 4096,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
}
# Test case for do_sample=True
Qwen3_0_6B_Base_DO_SAMPLE = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-do-sample',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'session_len': 4096, 'max_batch_size': 1},
    'gen_config': {
        'do_sample': True,
        'temperature': 0.7,
        'top_p': 0.9,
        'max_new_tokens': 1024,
    },
    'max_seq_len': 4096,
    'max_out_len': 1024,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
}

# Test case for stop_token_ids, no </think> should be in the output
Qwen3_0_6B_Base_STOP_TOKEN_IDS = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-stop-token-ids',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'session_len': 4096, 'max_batch_size': 1},
    'gen_config': {
        'temperature': 0.0,
        'do_sample': False,
        'max_new_tokens': 1024,
        'stop_token_ids': [151645, 151668],
    },
    'max_seq_len': 4096,
    'max_out_len': 1024,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
}

# Test case for bad_token_ids, no </think> should be in the output
Qwen3_0_6B_Base_BAD_TOKEN_IDS = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-bad-token-ids',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'session_len': 4096, 'max_batch_size': 1},
    'gen_config': {
        'temperature': 0.0,
        'do_sample': False,
        'max_new_tokens': 1024,
        'bad_token_ids': [151645, 151668],
    },
    'max_seq_len': 4096,
    'max_out_len': 1024,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
}

# Test case for logprobs
Qwen3_0_6B_Base_LOGPROBS = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-logprobs',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'session_len': 4096, 'max_batch_size': 1},
    'gen_config': {
        'temperature': 0.0,
        'do_sample': False,
        'max_new_tokens': 1024,
        'logprobs': 5,
    },
    'max_seq_len': 4096,
    'max_out_len': 1024,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
}

# Sets `end_str='</think>'` on the model.
Qwen3_0_6B_Base_ENDSTR = {
    'type': TurboMindModel,
    'abbr': 'lmdeploy-qwen3-0_6b-base-end-str',
    'path': 'Qwen/Qwen3-0.6B-Base',
    'engine_config': {'session_len': 4096, 'max_batch_size': 1},
    'gen_config': {
        'temperature': 0.0,
        'do_sample': False,
        'max_new_tokens': 1024,
    },
    'max_seq_len': 4096,
    'max_out_len': 1024,
    'batch_size': 1,
    'end_str': '</think>',
    'run_cfg': {'num_gpus': 1},
}
# Every base-model test configuration defined above, one per line.
models = [
    Qwen3_0_6B_Base,
    Qwen3_0_6B_Base_PYTORCH,
    Qwen3_0_6B_Base_BACKEND,
    Qwen3_0_6B_Base_DROP_MIDDLE,
    Qwen3_0_6B_Base_IGNORE_EOS,
    Qwen3_0_6B_Base_TEMP0,
    Qwen3_0_6B_Base_DO_SAMPLE,
    Qwen3_0_6B_Base_BAD_WORDS,
    Qwen3_0_6B_Base_STOP_WORDS,
    Qwen3_0_6B_Base_STOP_TOKEN_IDS,
    Qwen3_0_6B_Base_BAD_TOKEN_IDS,
    Qwen3_0_6B_Base_NEW_TOKENS,
    Qwen3_0_6B_Base_MAX_SEQ_LEN,
    Qwen3_0_6B_Base_SESSION_LEN,
    Qwen3_0_6B_BASE_COMBINED,
    Qwen3_0_6B_Base_LOGPROBS,
    Qwen3_0_6B_Base_TEMPLATE,
    Qwen3_0_6B_Base_ENDSTR,
]
================================================
FILE: autotest/model/infer_lmdeploy_chat.py
================================================
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
with read_base():
from autotest.model.chat_datasets import datasets
from autotest.model.constant import meta_template as test_meta_template
# NOTE(review): self-assignment of the `datasets` name imported under
# read_base(); presumably this keeps it as an explicit top-level variable of
# this config — confirm against mmengine Config semantics.
datasets = datasets
# Base model testcase
Qwen3_0_6B_FP8 = dict(
type=TurboMindModelwithChatTemplate,
abbr='lmdeploy-qwen3-0_6b-fp8-base',
path='Qwen/Qwen3-0.6B-FP8',
engine_config=dict(max_batch_size=1, session_len=128000),
gen_config=dict(do_sample=False),
max_out_len=32768,
batch_size=1,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_c
gitextract_3v91281m/
├── .codespellrc
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── 1_bug-report.yml
│ │ ├── 2_feature-request.yml
│ │ ├── 3_bug-report_zh.yml
│ │ ├── 4_feature-request_zh.yml
│ │ └── config.yml
│ ├── pull_request_template.md
│ └── workflows/
│ ├── daily-ete-test.yml
│ ├── link-check.yml
│ ├── lint.yml
│ ├── pr-run-test.yml
│ ├── pr-stage-check.yml
│ ├── publish-to-pypi.yml
│ └── unit-test.yml
├── .gitignore
├── .owners.yml
├── .pre-commit-config-zh-cn.yaml
├── .pre-commit-config.yaml
├── LICENSE
├── MANIFEST.in
├── README.md
├── README_zh-CN.md
├── autotest/
│ ├── __init__.py
│ ├── cluster/
│ │ ├── __init__.py
│ │ └── chat_models.py
│ ├── eval/
│ │ ├── __init__.py
│ │ ├── eval_base_fullbench.py
│ │ ├── eval_base_longtext_fullbench.py
│ │ ├── eval_chat_longtext_fullbench.py
│ │ ├── eval_chat_obj_fullbench_other.py
│ │ ├── eval_chat_obj_fullbench_v5.py
│ │ ├── eval_chat_obj_fullbench_v6.py
│ │ ├── eval_chat_obj_fullbench_v7.py
│ │ ├── eval_chat_obj_fullbench_v8.py
│ │ ├── eval_chat_obj_v8.py
│ │ ├── eval_chat_sub_fullbench.py
│ │ └── models.py
│ ├── model/
│ │ ├── __init__.py
│ │ ├── base_datasets.py
│ │ ├── chat_datasets.py
│ │ ├── constant.py
│ │ ├── infer_api.py
│ │ ├── infer_api_rollout.py
│ │ ├── infer_lmdeploy_base.py
│ │ ├── infer_lmdeploy_chat.py
│ │ ├── infer_transformers_base.py
│ │ ├── infer_transformers_chat.py
│ │ ├── infer_vllm_base.py
│ │ └── infer_vllm_chat.py
│ ├── oc_score_baseline.yaml
│ └── utils/
│ ├── compare_results.py
│ ├── health_check.py
│ └── oc_score_assert.py
├── dataset-index.yml
├── docs/
│ ├── en/
│ │ ├── .readthedocs.yaml
│ │ ├── Makefile
│ │ ├── _static/
│ │ │ ├── css/
│ │ │ │ └── readthedocs.css
│ │ │ └── js/
│ │ │ └── custom.js
│ │ ├── _templates/
│ │ │ ├── 404.html
│ │ │ ├── autosummary/
│ │ │ │ └── class.rst
│ │ │ └── callable.rst
│ │ ├── advanced_guides/
│ │ │ ├── accelerator_intro.md
│ │ │ ├── circular_eval.md
│ │ │ ├── code_eval.md
│ │ │ ├── code_eval_service.md
│ │ │ ├── contamination_eval.md
│ │ │ ├── custom_dataset.md
│ │ │ ├── evaluation_lightllm.md
│ │ │ ├── evaluation_lmdeploy.md
│ │ │ ├── llm_judge.md
│ │ │ ├── longeval.md
│ │ │ ├── math_verify.md
│ │ │ ├── needleinahaystack_eval.md
│ │ │ ├── new_dataset.md
│ │ │ ├── new_model.md
│ │ │ ├── objective_judgelm_evaluation.md
│ │ │ ├── persistence.md
│ │ │ ├── prompt_attack.md
│ │ │ └── subjective_evaluation.md
│ │ ├── conf.py
│ │ ├── docutils.conf
│ │ ├── get_started/
│ │ │ ├── faq.md
│ │ │ ├── installation.md
│ │ │ └── quick_start.md
│ │ ├── index.rst
│ │ ├── notes/
│ │ │ ├── academic.md
│ │ │ ├── contribution_guide.md
│ │ │ └── news.md
│ │ ├── prompt/
│ │ │ ├── chain_of_thought.md
│ │ │ ├── meta_template.md
│ │ │ ├── overview.md
│ │ │ └── prompt_template.md
│ │ ├── statis.py
│ │ ├── tools.md
│ │ └── user_guides/
│ │ ├── config.md
│ │ ├── corebench.md
│ │ ├── datasets.md
│ │ ├── deepseek_r1.md
│ │ ├── evaluation.md
│ │ ├── experimentation.md
│ │ ├── framework_overview.md
│ │ ├── interns1.md
│ │ ├── metrics.md
│ │ ├── models.md
│ │ └── summarizer.md
│ └── zh_cn/
│ ├── .readthedocs.yaml
│ ├── Makefile
│ ├── _static/
│ │ ├── css/
│ │ │ └── readthedocs.css
│ │ └── js/
│ │ └── custom.js
│ ├── _templates/
│ │ ├── 404.html
│ │ ├── autosummary/
│ │ │ └── class.rst
│ │ └── callable.rst
│ ├── advanced_guides/
│ │ ├── accelerator_intro.md
│ │ ├── circular_eval.md
│ │ ├── code_eval.md
│ │ ├── code_eval_service.md
│ │ ├── compassbench_intro.md
│ │ ├── compassbench_v2_0.md
│ │ ├── contamination_eval.md
│ │ ├── custom_dataset.md
│ │ ├── evaluation_lightllm.md
│ │ ├── evaluation_lmdeploy.md
│ │ ├── llm_judge.md
│ │ ├── longeval.md
│ │ ├── math_verify.md
│ │ ├── needleinahaystack_eval.md
│ │ ├── new_dataset.md
│ │ ├── new_model.md
│ │ ├── objective_judgelm_evaluation.md
│ │ ├── persistence.md
│ │ ├── prompt_attack.md
│ │ └── subjective_evaluation.md
│ ├── conf.py
│ ├── cp_origin_docs.sh
│ ├── docutils.conf
│ ├── get_started/
│ │ ├── faq.md
│ │ ├── installation.md
│ │ └── quick_start.md
│ ├── index.rst
│ ├── notes/
│ │ ├── academic.md
│ │ ├── contribution_guide.md
│ │ └── news.md
│ ├── prompt/
│ │ ├── chain_of_thought.md
│ │ ├── meta_template.md
│ │ ├── overview.md
│ │ └── prompt_template.md
│ ├── statis.py
│ ├── tools.md
│ └── user_guides/
│ ├── config.md
│ ├── corebench.md
│ ├── datasets.md
│ ├── deepseek_r1.md
│ ├── evaluation.md
│ ├── experimentation.md
│ ├── framework_overview.md
│ ├── interns1.md
│ ├── metrics.md
│ ├── models.md
│ └── summarizer.md
├── examples/
│ ├── eval_OlympiadBench.py
│ ├── eval_PMMEval.py
│ ├── eval_ProcessBench.py
│ ├── eval_TheoremQA.py
│ ├── eval_academic_leaderboard_202407.py
│ ├── eval_academic_leaderboard_202412.py
│ ├── eval_academic_leaderboard_202502.py
│ ├── eval_academic_leaderboard_REALTIME.py
│ ├── eval_academic_telechat_thinking.py
│ ├── eval_alaya.py
│ ├── eval_api_demo.py
│ ├── eval_attack.py
│ ├── eval_babilong.py
│ ├── eval_base_demo.py
│ ├── eval_bench_intern_s1.py
│ ├── eval_bluelm_32k_lveval.py
│ ├── eval_cascade_evaluator.py
│ ├── eval_charm_mem.py
│ ├── eval_charm_rea.py
│ ├── eval_chat_agent.py
│ ├── eval_chat_agent_baseline.py
│ ├── eval_chat_demo.py
│ ├── eval_chat_last.py
│ ├── eval_chatml_datasets.py
│ ├── eval_chembench.py
│ ├── eval_chinese_simpleqa.py
│ ├── eval_cibench.py
│ ├── eval_cibench_api.py
│ ├── eval_circular.py
│ ├── eval_claude.py
│ ├── eval_code_passk.py
│ ├── eval_code_passk_repeat_dataset.py
│ ├── eval_codeagent.py
│ ├── eval_codebench_full.py
│ ├── eval_codegeex2.py
│ ├── eval_compassarena_subjectivebench.py
│ ├── eval_compassarena_subjectivebench_bradleyterry.py
│ ├── eval_contamination.py
│ ├── eval_corebench_2409_base_objective.py
│ ├── eval_corebench_2409_chat_objective.py
│ ├── eval_corebench_2409_longcontext.py
│ ├── eval_corebench_2409_subjective.py
│ ├── eval_deepseek_r1.py
│ ├── eval_dingo.py
│ ├── eval_ds1000_interpreter.py
│ ├── eval_edgellm_demo.py
│ ├── eval_eese_api_judge.py
│ ├── eval_gpt3.5.py
│ ├── eval_gpt4.py
│ ├── eval_hellobench.py
│ ├── eval_hf_llama2.py
│ ├── eval_hf_llama_7b.py
│ ├── eval_inference_ppl.py
│ ├── eval_internLM.py
│ ├── eval_intern_s1_pro.py
│ ├── eval_internlm2_chat_keyset.py
│ ├── eval_internlm2_keyset.py
│ ├── eval_internlm3_math500_thinking.py
│ ├── eval_internlm_7b.py
│ ├── eval_internlm_chat_lmdeploy_apiserver.py
│ ├── eval_internlm_chat_turbomind.py
│ ├── eval_internlm_flames_chat.py
│ ├── eval_internlm_lmdeploy_apiserver.py
│ ├── eval_internlm_math_chat.py
│ ├── eval_internlm_turbomind.py
│ ├── eval_judge_dataset_all.py
│ ├── eval_judgebench.py
│ ├── eval_judgerbench.py
│ ├── eval_judgerbenchv2.py
│ ├── eval_korbench.py
│ ├── eval_lightllm.py
│ ├── eval_livestembench.py
│ ├── eval_llama2_7b.py
│ ├── eval_llama2_7b_lveval.py
│ ├── eval_llama3_instruct.py
│ ├── eval_llm_compression.py
│ ├── eval_llm_judge.py
│ ├── eval_lmdeploy_demo.py
│ ├── eval_longbenchv2.py
│ ├── eval_math_llm_judge.py
│ ├── eval_math_llm_judge_internal.py
│ ├── eval_math_verify.py
│ ├── eval_mathbench.py
│ ├── eval_mmlu_cf.py
│ ├── eval_mmlu_pro.py
│ ├── eval_mmlu_with_zero_retriever_overwritten.py
│ ├── eval_model_rollout.py
│ ├── eval_modelscope_datasets.py
│ ├── eval_multi_prompt_demo.py
│ ├── eval_musr.py
│ ├── eval_needlebench_v2.py
│ ├── eval_qwen3.py
│ ├── eval_qwen_7b.py
│ ├── eval_qwen_7b_chat.py
│ ├── eval_qwen_7b_chat_lawbench.py
│ ├── eval_rewardbench.py
│ ├── eval_rmb.py
│ ├── eval_ruler.py
│ ├── eval_ruler_fix_tokenizer.py
│ ├── eval_rwkv5_3b.py
│ ├── eval_scireasoner.py
│ ├── eval_simpleqa.py
│ ├── eval_subjective.py
│ ├── eval_subjective_alpacaeval_official.py
│ ├── eval_subjective_bradleyterry.py
│ ├── eval_teval.py
│ └── eval_with_model_dataset_combinations.py
├── opencompass/
│ ├── __init__.py
│ ├── cli/
│ │ ├── __init__.py
│ │ └── main.py
│ ├── configs/
│ │ ├── chatml_datasets/
│ │ │ ├── AMO_Bench/
│ │ │ │ └── AMO_Bench_gen.py
│ │ │ ├── CPsyExam/
│ │ │ │ └── CPsyExam_gen.py
│ │ │ ├── CS_Bench/
│ │ │ │ └── CS_Bench_gen.py
│ │ │ ├── C_MHChem/
│ │ │ │ └── C_MHChem_gen.py
│ │ │ ├── HMMT2025/
│ │ │ │ └── HMMT2025_gen.py
│ │ │ ├── IMO_Bench_AnswerBench/
│ │ │ │ └── IMO_Bench_AnswerBench_gen.py
│ │ │ ├── MaScQA/
│ │ │ │ └── MaScQA_gen.py
│ │ │ ├── UGD_hard/
│ │ │ │ └── UGD_hard_gen.py
│ │ │ └── UGPhysics/
│ │ │ └── UGPhysics_gen.py
│ │ ├── dataset_collections/
│ │ │ └── chat_OC15.py
│ │ ├── datasets/
│ │ │ ├── ARC_Prize_Public_Evaluation/
│ │ │ │ ├── README.md
│ │ │ │ ├── arc_agi_2_public_evaluation_gen.py
│ │ │ │ ├── arc_prize_public_evaluation_gen.py
│ │ │ │ ├── arc_prize_public_evaluation_gen_872059.py
│ │ │ │ └── arc_prize_public_evaluation_gen_fedd04.py
│ │ │ ├── ARC_c/
│ │ │ │ ├── ARC_c_clean_ppl.py
│ │ │ │ ├── ARC_c_cot_gen_926652.py
│ │ │ │ ├── ARC_c_few_shot_gen_e9b043.py
│ │ │ │ ├── ARC_c_few_shot_ppl.py
│ │ │ │ ├── ARC_c_gen.py
│ │ │ │ ├── ARC_c_gen_1e0de5.py
│ │ │ │ ├── ARC_c_ppl.py
│ │ │ │ ├── ARC_c_ppl_2ef631.py
│ │ │ │ ├── ARC_c_ppl_a450bd.py
│ │ │ │ └── ARC_c_ppl_d52a21.py
│ │ │ ├── ARC_e/
│ │ │ │ ├── ARC_e_gen.py
│ │ │ │ ├── ARC_e_gen_1e0de5.py
│ │ │ │ ├── ARC_e_ppl.py
│ │ │ │ ├── ARC_e_ppl_2ef631.py
│ │ │ │ ├── ARC_e_ppl_a450bd.py
│ │ │ │ └── ARC_e_ppl_d52a21.py
│ │ │ ├── BeyondAIME/
│ │ │ │ ├── beyondaime_cascade_eval_gen_5e9f4f.py
│ │ │ │ └── beyondaime_gen.py
│ │ │ ├── CARDBiomedBench/
│ │ │ │ ├── CARDBiomedBench_llmjudge_gen_99a231.py
│ │ │ │ └── CARDBiomedBench_llmjudge_rawprompt_gen_b4d90c.py
│ │ │ ├── CHARM/
│ │ │ │ ├── README.md
│ │ │ │ ├── README_ZH.md
│ │ │ │ ├── charm_memory_gen_bbbd53.py
│ │ │ │ ├── charm_memory_settings.py
│ │ │ │ ├── charm_reason_cot_only_gen_f7b7d3.py
│ │ │ │ ├── charm_reason_gen.py
│ │ │ │ ├── charm_reason_gen_f8fca2.py
│ │ │ │ ├── charm_reason_ppl_3da4de.py
│ │ │ │ └── charm_reason_settings.py
│ │ │ ├── CIBench/
│ │ │ │ ├── CIBench_generation_gen_8ab0dc.py
│ │ │ │ ├── CIBench_generation_oracle_gen_c4a7c1.py
│ │ │ │ ├── CIBench_template_gen_e6b12a.py
│ │ │ │ └── CIBench_template_oracle_gen_fecda1.py
│ │ │ ├── CLUE_C3/
│ │ │ │ ├── CLUE_C3_gen.py
│ │ │ │ ├── CLUE_C3_gen_8c358f.py
│ │ │ │ ├── CLUE_C3_ppl.py
│ │ │ │ ├── CLUE_C3_ppl_56b537.py
│ │ │ │ └── CLUE_C3_ppl_e24a31.py
│ │ │ ├── CLUE_CMRC/
│ │ │ │ ├── CLUE_CMRC_gen.py
│ │ │ │ ├── CLUE_CMRC_gen_1bd3c8.py
│ │ │ │ ├── CLUE_CMRC_gen_3749cd.py
│ │ │ │ ├── CLUE_CMRC_gen_8484b9.py
│ │ │ │ └── CLUE_CMRC_gen_941108.py
│ │ │ ├── CLUE_DRCD/
│ │ │ │ ├── CLUE_DRCD_gen.py
│ │ │ │ ├── CLUE_DRCD_gen_1bd3c8.py
│ │ │ │ ├── CLUE_DRCD_gen_3749cd.py
│ │ │ │ ├── CLUE_DRCD_gen_8484b9.py
│ │ │ │ └── CLUE_DRCD_gen_941108.py
│ │ │ ├── CLUE_afqmc/
│ │ │ │ ├── CLUE_afqmc_gen.py
│ │ │ │ ├── CLUE_afqmc_gen_901306.py
│ │ │ │ ├── CLUE_afqmc_ppl.py
│ │ │ │ ├── CLUE_afqmc_ppl_378c5b.py
│ │ │ │ ├── CLUE_afqmc_ppl_6507d7.py
│ │ │ │ └── CLUE_afqmc_ppl_7b0c1e.py
│ │ │ ├── CLUE_cmnli/
│ │ │ │ ├── CLUE_cmnli_gen.py
│ │ │ │ ├── CLUE_cmnli_gen_1abf97.py
│ │ │ │ ├── CLUE_cmnli_gen_51e956.py
│ │ │ │ ├── CLUE_cmnli_ppl.py
│ │ │ │ ├── CLUE_cmnli_ppl_98dd6e.py
│ │ │ │ ├── CLUE_cmnli_ppl_ef69e7.py
│ │ │ │ └── CLUE_cmnli_ppl_fdc6de.py
│ │ │ ├── CLUE_ocnli/
│ │ │ │ ├── CLUE_ocnli_gen.py
│ │ │ │ ├── CLUE_ocnli_gen_51e956.py
│ │ │ │ ├── CLUE_ocnli_gen_c4cb6c.py
│ │ │ │ ├── CLUE_ocnli_ppl.py
│ │ │ │ ├── CLUE_ocnli_ppl_98dd6e.py
│ │ │ │ ├── CLUE_ocnli_ppl_ef69e7.py
│ │ │ │ └── CLUE_ocnli_ppl_fdc6de.py
│ │ │ ├── CMPhysBench/
│ │ │ │ ├── cmphysbench_gen.py
│ │ │ │ └── cmphysbench_rawprompt_gen.py
│ │ │ ├── ChemBench/
│ │ │ │ ├── ChemBench_gen.py
│ │ │ │ ├── ChemBench_gen_a9f753.py
│ │ │ │ ├── ChemBench_llmjudge_gen.py
│ │ │ │ ├── ChemBench_llmjudge_gen_c584cf.py
│ │ │ │ └── ChemBench_llmjudge_rawprompt_gen_fa3fc4.py
│ │ │ ├── ClimaQA/
│ │ │ │ ├── ClimaQA_Gold_llm_judge_gen.py
│ │ │ │ ├── ClimaQA_Gold_llm_judge_gen_f15343.py
│ │ │ │ ├── ClimaQA_Gold_llm_judge_rawprompt_gen_b3080f.py
│ │ │ │ ├── ClimaQA_Silver_llm_judge_gen.py
│ │ │ │ └── ClimaQA_Silver_llm_judge_gen_f15343.py
│ │ │ ├── ClinicBench/
│ │ │ │ ├── ClinicBench_llmjudge_gen.py
│ │ │ │ └── ClinicBench_llmjudge_gen_d09668.py
│ │ │ ├── Earth_Silver/
│ │ │ │ ├── Earth_Silver_gen.py
│ │ │ │ ├── Earth_Silver_llmjudge_gen.py
│ │ │ │ ├── Earth_Silver_llmjudge_gen_46140c.py
│ │ │ │ └── Earth_Silver_llmjudge_rawprompt_gen_a84bc6.py
│ │ │ ├── FewCLUE_bustm/
│ │ │ │ ├── FewCLUE_bustm_gen.py
│ │ │ │ ├── FewCLUE_bustm_gen_634f41.py
│ │ │ │ ├── FewCLUE_bustm_ppl.py
│ │ │ │ ├── FewCLUE_bustm_ppl_4b16c0.py
│ │ │ │ ├── FewCLUE_bustm_ppl_9ef540.py
│ │ │ │ └── FewCLUE_bustm_ppl_e53034.py
│ │ │ ├── FewCLUE_chid/
│ │ │ │ ├── FewCLUE_chid_gen.py
│ │ │ │ ├── FewCLUE_chid_gen_0a29a2.py
│ │ │ │ ├── FewCLUE_chid_ppl.py
│ │ │ │ ├── FewCLUE_chid_ppl_8f2872.py
│ │ │ │ └── FewCLUE_chid_ppl_acccb5.py
│ │ │ ├── FewCLUE_cluewsc/
│ │ │ │ ├── FewCLUE_cluewsc_gen.py
│ │ │ │ ├── FewCLUE_cluewsc_gen_c68933.py
│ │ │ │ ├── FewCLUE_cluewsc_ppl.py
│ │ │ │ ├── FewCLUE_cluewsc_ppl_12e4e0.py
│ │ │ │ ├── FewCLUE_cluewsc_ppl_4284a0.py
│ │ │ │ └── FewCLUE_cluewsc_ppl_868415.py
│ │ │ ├── FewCLUE_csl/
│ │ │ │ ├── FewCLUE_csl_gen.py
│ │ │ │ ├── FewCLUE_csl_gen_28b223.py
│ │ │ │ ├── FewCLUE_csl_gen_87f4a8.py
│ │ │ │ ├── FewCLUE_csl_ppl.py
│ │ │ │ ├── FewCLUE_csl_ppl_769f8d.py
│ │ │ │ └── FewCLUE_csl_ppl_841b62.py
│ │ │ ├── FewCLUE_eprstmt/
│ │ │ │ ├── FewCLUE_eprstmt_gen.py
│ │ │ │ ├── FewCLUE_eprstmt_gen_740ea0.py
│ │ │ │ ├── FewCLUE_eprstmt_ppl.py
│ │ │ │ ├── FewCLUE_eprstmt_ppl_1ce587.py
│ │ │ │ └── FewCLUE_eprstmt_ppl_f1e631.py
│ │ │ ├── FewCLUE_ocnli_fc/
│ │ │ │ ├── FewCLUE_ocnli_fc_gen.py
│ │ │ │ ├── FewCLUE_ocnli_fc_gen_f97a97.py
│ │ │ │ ├── FewCLUE_ocnli_fc_ppl.py
│ │ │ │ ├── FewCLUE_ocnli_fc_ppl_9e8b3d.py
│ │ │ │ └── FewCLUE_ocnli_fc_ppl_c08300.py
│ │ │ ├── FewCLUE_tnews/
│ │ │ │ ├── FewCLUE_tnews_gen.py
│ │ │ │ ├── FewCLUE_tnews_gen_b90e4a.py
│ │ │ │ ├── FewCLUE_tnews_ppl.py
│ │ │ │ ├── FewCLUE_tnews_ppl_7d1c07.py
│ │ │ │ ├── FewCLUE_tnews_ppl_d10e8a.py
│ │ │ │ └── FewCLUE_tnews_ppl_fff486.py
│ │ │ ├── FinanceIQ/
│ │ │ │ ├── FinanceIQ_gen.py
│ │ │ │ ├── FinanceIQ_gen_e0e6b5.py
│ │ │ │ ├── FinanceIQ_ppl.py
│ │ │ │ └── FinanceIQ_ppl_42b9bd.py
│ │ │ ├── GLUE_CoLA/
│ │ │ │ ├── GLUE_CoLA_ppl.py
│ │ │ │ └── GLUE_CoLA_ppl_77d0df.py
│ │ │ ├── GLUE_MRPC/
│ │ │ │ ├── GLUE_MRPC_ppl.py
│ │ │ │ └── GLUE_MRPC_ppl_96564c.py
│ │ │ ├── GLUE_QQP/
│ │ │ │ ├── GLUE_QQP_ppl.py
│ │ │ │ └── GLUE_QQP_ppl_250d00.py
│ │ │ ├── GaokaoBench/
│ │ │ │ ├── GaokaoBench_gen.py
│ │ │ │ ├── GaokaoBench_gen_5cfe9e.py
│ │ │ │ ├── GaokaoBench_mixed.py
│ │ │ │ ├── GaokaoBench_mixed_9af5ee.py
│ │ │ │ ├── GaokaoBench_no_subjective_gen_4c31db.py
│ │ │ │ ├── GaokaoBench_no_subjective_gen_d16acb.py
│ │ │ │ ├── GaokaoBench_no_subjective_gen_d21e37.py
│ │ │ │ ├── GaokaoBench_prompts.py
│ │ │ │ └── README.md
│ │ │ ├── HLE/
│ │ │ │ ├── hle_biomed_llm_verify_gen_6ff468.py
│ │ │ │ ├── hle_gen.py
│ │ │ │ ├── hle_llmverify_academic.py
│ │ │ │ ├── hle_llmverify_gen_6ff468.py
│ │ │ │ └── hle_llmverify_rawprompt_gen_0970dd.py
│ │ │ ├── HealthBench/
│ │ │ │ └── healthbench_gen_831613.py
│ │ │ ├── IFBench/
│ │ │ │ ├── IFBench_gen.py
│ │ │ │ └── IFBench_rawprompt_gen.py
│ │ │ ├── IFEval/
│ │ │ │ ├── IFEval.md
│ │ │ │ ├── IFEval_gen.py
│ │ │ │ ├── IFEval_gen_3321a3.py
│ │ │ │ ├── IFEval_gen_353ae7.py
│ │ │ │ ├── IFEval_rawprompt_gen_e7f781.py
│ │ │ │ └── README.md
│ │ │ ├── LCBench/
│ │ │ │ ├── README.md
│ │ │ │ ├── lcbench_gen.py
│ │ │ │ ├── lcbench_gen_5ff288.py
│ │ │ │ ├── lcbench_levels_gen_bb665f.py
│ │ │ │ ├── lcbench_repeat10_gen.py
│ │ │ │ └── lcbench_repeat10_gen_5ff288.py
│ │ │ ├── MMLUArabic/
│ │ │ │ ├── MMLUArabic_gen.py
│ │ │ │ ├── MMLUArabic_gen_326684.py
│ │ │ │ ├── MMLUArabic_ppl.py
│ │ │ │ ├── MMLUArabic_ppl_d2333a.py
│ │ │ │ ├── MMLUArabic_zero_shot_gen.py
│ │ │ │ ├── MMLUArabic_zero_shot_gen_3523e0.py
│ │ │ │ └── README.md
│ │ │ ├── MathBench/
│ │ │ │ ├── deprecated_mathbench_2024_gen_de9ff9.py
│ │ │ │ ├── deprecated_mathbench_agent_gen_48ec47.py
│ │ │ │ ├── deprecated_mathbench_agent_gen_fbe13b.py
│ │ │ │ ├── deprecated_mathbench_arith_gen_ccd638.py
│ │ │ │ ├── deprecated_mathbench_cot_gen_66f329.py
│ │ │ │ ├── deprecated_mathbench_gen_7b734b.py
│ │ │ │ ├── mathbench_2024_few_shot_mixed_4a3fd4.py
│ │ │ │ ├── mathbench_2024_gen_19e486.py
│ │ │ │ ├── mathbench_2024_gen_1dc21d.py
│ │ │ │ ├── mathbench_2024_gen_4b8f28.py
│ │ │ │ ├── mathbench_2024_gen_50a320.py
│ │ │ │ ├── mathbench_2024_gen_fc2a24.py
│ │ │ │ ├── mathbench_2024_wocircular_gen_1dc21d.py
│ │ │ │ ├── mathbench_2024_wocircular_mixed_8eb12b.py
│ │ │ │ ├── mathbench_gen.py
│ │ │ │ └── mathbench_prompt.py
│ │ │ ├── MedBench/
│ │ │ │ ├── medbench_gen.py
│ │ │ │ └── medbench_gen_0b4fff.py
│ │ │ ├── MedCalc_Bench/
│ │ │ │ └── MedCalcBench_official_gen_a5155f.py
│ │ │ ├── MedQA/
│ │ │ │ ├── MedQA_gen_3bf756.py
│ │ │ │ └── MedQA_llmjudge_gen_3bf756.py
│ │ │ ├── MedXpertQA/
│ │ │ │ ├── MedXpertQA_gen.py
│ │ │ │ ├── MedXpertQA_llmjudge_gen.py
│ │ │ │ └── MedXpertQA_llmjudge_rawprompt_gen.py
│ │ │ ├── Medbullets/
│ │ │ │ ├── medbullets_gen.py
│ │ │ │ ├── medbullets_gen_60c8f5.py
│ │ │ │ ├── medbullets_llmjudge_gen.py
│ │ │ │ └── medbullets_llmjudge_gen_60c8f5.py
│ │ │ ├── MolInstructions_chem/
│ │ │ │ ├── mol_instructions_chem_gen.py
│ │ │ │ └── mol_instructions_chem_rawprompt_gen.py
│ │ │ ├── NPHardEval/
│ │ │ │ ├── NPHardEval_gen.py
│ │ │ │ ├── NPHardEval_gen_22aac5.py
│ │ │ │ └── README.md
│ │ │ ├── OlymMATH/
│ │ │ │ ├── README.md
│ │ │ │ ├── olymmath_cascade_eval_gen_97b203.py
│ │ │ │ ├── olymmath_llm_judeg_gen.py
│ │ │ │ ├── olymmath_llmverify_gen_97b203.py
│ │ │ │ └── olymmath_llmverify_rawprompt_gen_9d3a8e.py
│ │ │ ├── OlympiadBench/
│ │ │ │ ├── OlympiadBenchMath_0shot_llmverify_gen_9c22f2.py
│ │ │ │ ├── OlympiadBench_0shot_cascade_eval_gen_be8b13.py
│ │ │ │ ├── OlympiadBench_0shot_gen_be8b13.py
│ │ │ │ ├── OlympiadBench_0shot_llmverify_gen_be8b13.py
│ │ │ │ ├── OlympiadBench_0shot_llmverify_rawprompt_gen_be8b13.py
│ │ │ │ └── OlympiadBench_categories.py
│ │ │ ├── OpenFinData/
│ │ │ │ ├── OpenFinData_gen.py
│ │ │ │ ├── OpenFinData_gen_46dedb.py
│ │ │ │ └── README.md
│ │ │ ├── PHYBench/
│ │ │ │ ├── phybench_gen.py
│ │ │ │ └── phybench_rawprompt_gen.py
│ │ │ ├── PHYSICS/
│ │ │ │ ├── PHYSICS_llm_judge_gen.py
│ │ │ │ ├── PHYSICS_llm_judge_gen_a133a2.py
│ │ │ │ └── PHYSICS_llm_judge_rawprompt_gen_56ebc8.py
│ │ │ ├── PI_LLM/
│ │ │ │ ├── README.md
│ │ │ │ └── pi_llm_gen.py
│ │ │ ├── PJExam/
│ │ │ │ ├── PJExam_gen.py
│ │ │ │ └── PJExam_gen_8cd97c.py
│ │ │ ├── PMMEval/
│ │ │ │ ├── flores_gen.py
│ │ │ │ ├── flores_gen_2697d7.py
│ │ │ │ ├── humanevalxl_gen.py
│ │ │ │ ├── humanevalxl_gen_bdec92.py
│ │ │ │ ├── mgsm_gen.py
│ │ │ │ ├── mgsm_gen_679720.py
│ │ │ │ ├── mhellaswag_gen.py
│ │ │ │ ├── mhellaswag_gen_1a6b73.py
│ │ │ │ ├── mifeval_gen.py
│ │ │ │ ├── mifeval_gen_79f8fb.py
│ │ │ │ ├── mlogiqa_gen.py
│ │ │ │ ├── mlogiqa_gen_36c4f9.py
│ │ │ │ ├── mmmlu_gen.py
│ │ │ │ ├── mmmlu_gen_d5017d.py
│ │ │ │ ├── pmmeval_gen.py
│ │ │ │ ├── xnli_gen.py
│ │ │ │ └── xnli_gen_973734.py
│ │ │ ├── ProcessBench/
│ │ │ │ ├── README.md
│ │ │ │ └── processbench_gen.py
│ │ │ ├── ProteinLMBench/
│ │ │ │ ├── ProteinLMBench_gen_a67965.py
│ │ │ │ ├── ProteinLMBench_llmjudge_gen_a67965.py
│ │ │ │ └── ProteinLMBench_llmjudge_rawprompt_gen_9627a6.py
│ │ │ ├── PubMedQA/
│ │ │ │ ├── PubMedQA_llmjudge_gen.py
│ │ │ │ └── PubMedQA_llmjudge_gen_f00302.py
│ │ │ ├── QuALITY/
│ │ │ │ ├── QuALITY.md
│ │ │ │ ├── QuALITY_gen.py
│ │ │ │ └── QuALITY_gen_c407cb.py
│ │ │ ├── R_Bench/
│ │ │ │ ├── R-Bench.md
│ │ │ │ ├── rbench_gen_544610.py
│ │ │ │ ├── rbench_llmjudge_gen_c89350.py
│ │ │ │ └── rbench_llmjudge_rawprompt_gen_c24221.py
│ │ │ ├── SVAMP/
│ │ │ │ ├── svamp_gen.py
│ │ │ │ └── svamp_gen_fb25e4.py
│ │ │ ├── SciEval/
│ │ │ │ ├── SciEval_5shot_gen_4043d4.py
│ │ │ │ ├── SciEval_5shot_llmjudge_gen_b7b684.py
│ │ │ │ └── SciEval_lifescience_sets.py
│ │ │ ├── SciKnowEval/
│ │ │ │ ├── SciKnowEval_gen_ebe47d.py
│ │ │ │ └── SciKnowEval_llmjudge_gen_ebe47d.py
│ │ │ ├── SciReasoner/
│ │ │ │ ├── GUE_gen.py
│ │ │ │ ├── LLM4Mat_gen.py
│ │ │ │ ├── UMG.py
│ │ │ │ ├── UPG.py
│ │ │ │ ├── bio_instruction_gen.py
│ │ │ │ ├── bulk_modulus_material_gen.py
│ │ │ │ ├── composition_material_gen.py
│ │ │ │ ├── mol_biotext_gen.py
│ │ │ │ ├── mol_molecule_gen.py
│ │ │ │ ├── mol_protein_gen.py
│ │ │ │ ├── opi_gen.py
│ │ │ │ ├── peer_gen.py
│ │ │ │ ├── retrosynthesis_USPTO_gen.py
│ │ │ │ ├── scireasoner_gen.py
│ │ │ │ ├── smol_gen.py
│ │ │ │ ├── unconditional_RNA_gen.py
│ │ │ │ └── unconditional_material_gen.py
│ │ │ ├── ScienceQA/
│ │ │ │ ├── ScienceQA_llmjudge_gen.py
│ │ │ │ └── ScienceQA_llmjudge_gen_f00302.py
│ │ │ ├── SeedBench/
│ │ │ │ ├── README.md
│ │ │ │ ├── seedbench_gen.py
│ │ │ │ └── seedbench_gen_5d5ea1.py
│ │ │ ├── SimpleQA/
│ │ │ │ ├── README.md
│ │ │ │ ├── simpleqa_gen.py
│ │ │ │ └── simpleqa_gen_0283c3.py
│ │ │ ├── SmolInstruct/
│ │ │ │ ├── smolinstruct_0shot_instruct_gen.py
│ │ │ │ ├── smolinstruct_0shot_instruct_rawprompt_gen.py
│ │ │ │ ├── smolinstruct_fts_0shot_instruct.py
│ │ │ │ ├── smolinstruct_fts_0shot_rawprompt_instruct.py
│ │ │ │ ├── smolinstruct_fts_gen_5774b5.py
│ │ │ │ ├── smolinstruct_gen.py
│ │ │ │ ├── smolinstruct_meteor_0shot_instruct.py
│ │ │ │ ├── smolinstruct_meteor_0shot_rawprompt_instruct.py
│ │ │ │ ├── smolinstruct_meteor_gen_065150.py
│ │ │ │ ├── smolinstruct_nc_0shot_instruct.py
│ │ │ │ ├── smolinstruct_nc_0shot_rawprompt_instruct.py
│ │ │ │ ├── smolinstruct_nc_gen_c84c18.py
│ │ │ │ ├── smolinstruct_pp_acc_0_shot_instruct.py
│ │ │ │ ├── smolinstruct_pp_acc_0_shot_rawprompt_instruct.py
│ │ │ │ ├── smolinstruct_pp_acc_gen_8607a3.py
│ │ │ │ ├── smolinstruct_rmse_0shot_instruct.py
│ │ │ │ ├── smolinstruct_rmse_0shot_rawprompt_instruct.py
│ │ │ │ └── smolinstruct_rmse_gen_0fcc6b.py
│ │ │ ├── SuperGLUE_AX_b/
│ │ │ │ ├── SuperGLUE_AX_b_gen.py
│ │ │ │ ├── SuperGLUE_AX_b_gen_4dfefa.py
│ │ │ │ ├── SuperGLUE_AX_b_ppl.py
│ │ │ │ ├── SuperGLUE_AX_b_ppl_0748aa.py
│ │ │ │ └── SuperGLUE_AX_b_ppl_6db806.py
│ │ │ ├── SuperGLUE_AX_g/
│ │ │ │ ├── SuperGLUE_AX_g_gen.py
│ │ │ │ ├── SuperGLUE_AX_g_gen_68aac7.py
│ │ │ │ ├── SuperGLUE_AX_g_ppl.py
│ │ │ │ ├── SuperGLUE_AX_g_ppl_50f8f6.py
│ │ │ │ └── SuperGLUE_AX_g_ppl_66caf3.py
│ │ │ ├── SuperGLUE_BoolQ/
│ │ │ │ ├── SuperGLUE_BoolQ_cot_gen_1d56df.py
│ │ │ │ ├── SuperGLUE_BoolQ_few_shot_gen_ba58ea.py
│ │ │ │ ├── SuperGLUE_BoolQ_few_shot_ppl.py
│ │ │ │ ├── SuperGLUE_BoolQ_gen.py
│ │ │ │ ├── SuperGLUE_BoolQ_gen_883d50.py
│ │ │ │ ├── SuperGLUE_BoolQ_ppl.py
│ │ │ │ ├── SuperGLUE_BoolQ_ppl_16b1d9.py
│ │ │ │ ├── SuperGLUE_BoolQ_ppl_314797.py
│ │ │ │ ├── SuperGLUE_BoolQ_ppl_314b96.py
│ │ │ │ ├── SuperGLUE_BoolQ_ppl_4da4db.py
│ │ │ │ └── SuperGLUE_BoolQ_ppl_9619db.py
│ │ │ ├── SuperGLUE_CB/
│ │ │ │ ├── SuperGLUE_CB_gen.py
│ │ │ │ ├── SuperGLUE_CB_gen_854c6c.py
│ │ │ │ ├── SuperGLUE_CB_ppl.py
│ │ │ │ ├── SuperGLUE_CB_ppl_0143fe.py
│ │ │ │ └── SuperGLUE_CB_ppl_11c175.py
│ │ │ ├── SuperGLUE_COPA/
│ │ │ │ ├── SuperGLUE_COPA_gen.py
│ │ │ │ ├── SuperGLUE_COPA_gen_91ca53.py
│ │ │ │ ├── SuperGLUE_COPA_ppl.py
│ │ │ │ ├── SuperGLUE_COPA_ppl_54058d.py
│ │ │ │ ├── SuperGLUE_COPA_ppl_5c24f1.py
│ │ │ │ └── SuperGLUE_COPA_ppl_9f3618.py
│ │ │ ├── SuperGLUE_MultiRC/
│ │ │ │ ├── SuperGLUE_MultiRC_gen.py
│ │ │ │ ├── SuperGLUE_MultiRC_gen_27071f.py
│ │ │ │ ├── SuperGLUE_MultiRC_ppl.py
│ │ │ │ ├── SuperGLUE_MultiRC_ppl_866273.py
│ │ │ │ └── SuperGLUE_MultiRC_ppl_ced824.py
│ │ │ ├── SuperGLUE_RTE/
│ │ │ │ ├── SuperGLUE_RTE_gen.py
│ │ │ │ ├── SuperGLUE_RTE_gen_68aac7.py
│ │ │ │ ├── SuperGLUE_RTE_ppl.py
│ │ │ │ ├── SuperGLUE_RTE_ppl_50f8f6.py
│ │ │ │ └── SuperGLUE_RTE_ppl_66caf3.py
│ │ │ ├── SuperGLUE_ReCoRD/
│ │ │ │ ├── SuperGLUE_ReCoRD_gen.py
│ │ │ │ ├── SuperGLUE_ReCoRD_gen_0f7784.py
│ │ │ │ ├── SuperGLUE_ReCoRD_gen_30dea0.py
│ │ │ │ └── SuperGLUE_ReCoRD_gen_a69961.py
│ │ │ ├── SuperGLUE_WSC/
│ │ │ │ ├── SuperGLUE_WSC_gen.py
│ │ │ │ ├── SuperGLUE_WSC_gen_7902a7.py
│ │ │ │ ├── SuperGLUE_WSC_gen_fe4bf3.py
│ │ │ │ ├── SuperGLUE_WSC_ppl.py
│ │ │ │ ├── SuperGLUE_WSC_ppl_003529.py
│ │ │ │ ├── SuperGLUE_WSC_ppl_1c4a90.py
│ │ │ │ ├── SuperGLUE_WSC_ppl_d0f531.py
│ │ │ │ └── SuperGLUE_WSC_ppl_f37e78.py
│ │ │ ├── SuperGLUE_WiC/
│ │ │ │ ├── SuperGLUE_WiC_gen.py
│ │ │ │ ├── SuperGLUE_WiC_gen_d06864.py
│ │ │ │ ├── SuperGLUE_WiC_ppl.py
│ │ │ │ ├── SuperGLUE_WiC_ppl_312de9.py
│ │ │ │ ├── SuperGLUE_WiC_ppl_3fb6fd.py
│ │ │ │ └── SuperGLUE_WiC_ppl_c926be.py
│ │ │ ├── TabMWP/
│ │ │ │ ├── TabMWP_gen.py
│ │ │ │ └── TabMWP_gen_2aef96.py
│ │ │ ├── TheoremQA/
│ │ │ │ ├── README.md
│ │ │ │ ├── TheoremQA_5shot_gen_6f0af8.py
│ │ │ │ ├── TheoremQA_few_shot_examples.py
│ │ │ │ ├── TheoremQA_few_shot_examples_official.py
│ │ │ │ ├── TheoremQA_gen.py
│ │ │ │ ├── ThroremQA_0shot_cot_gen_8acdf7.py
│ │ │ │ ├── deprecated_TheoremQA_gen_424e0a.py
│ │ │ │ ├── deprecated_TheoremQA_gen_7009de.py
│ │ │ │ ├── deprecated_TheoremQA_gen_ef26ca.py
│ │ │ │ ├── deprecated_TheoremQA_post_v2_gen_2c2583.py
│ │ │ │ └── deprecated_TheoremQA_post_v2_gen_ef26ca.py
│ │ │ ├── XCOPA/
│ │ │ │ ├── XCOPA_ppl.py
│ │ │ │ └── XCOPA_ppl_54058d.py
│ │ │ ├── XLSum/
│ │ │ │ ├── XLSum_gen.py
│ │ │ │ └── XLSum_gen_2bb71c.py
│ │ │ ├── Xsum/
│ │ │ │ ├── Xsum_gen.py
│ │ │ │ ├── Xsum_gen_31397e.py
│ │ │ │ └── Xsum_gen_8ea5f8.py
│ │ │ ├── adv_glue/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adv_glue_mnli/
│ │ │ │ │ ├── adv_glue_mnli_gen.py
│ │ │ │ │ └── adv_glue_mnli_gen_bd8ef0.py
│ │ │ │ ├── adv_glue_mnli_mm/
│ │ │ │ │ ├── adv_glue_mnli_mm_gen.py
│ │ │ │ │ └── adv_glue_mnli_mm_gen_bd8ef0.py
│ │ │ │ ├── adv_glue_qnli/
│ │ │ │ │ ├── adv_glue_qnli_gen.py
│ │ │ │ │ └── adv_glue_qnli_gen_0b7326.py
│ │ │ │ ├── adv_glue_qqp/
│ │ │ │ │ ├── adv_glue_qqp_gen.py
│ │ │ │ │ └── adv_glue_qqp_gen_cdc277.py
│ │ │ │ ├── adv_glue_rte/
│ │ │ │ │ ├── adv_glue_rte_gen.py
│ │ │ │ │ └── adv_glue_rte_gen_8cc547.py
│ │ │ │ └── adv_glue_sst2/
│ │ │ │ ├── adv_glue_sst2_gen.py
│ │ │ │ └── adv_glue_sst2_gen_ee8d3b.py
│ │ │ ├── agieval/
│ │ │ │ ├── agieval_gen.py
│ │ │ │ ├── agieval_gen_397d81.py
│ │ │ │ ├── agieval_gen_617738.py
│ │ │ │ ├── agieval_gen_64afd3.py
│ │ │ │ ├── agieval_gen_a0c741.py
│ │ │ │ ├── agieval_mixed.py
│ │ │ │ └── agieval_mixed_0fa998.py
│ │ │ ├── aime2024/
│ │ │ │ ├── README.md
│ │ │ │ ├── aime2024_0shot_nocot_gen_2b9dc2.py
│ │ │ │ ├── aime2024_0shot_nocot_genericllmeval_academic_gen.py
│ │ │ │ ├── aime2024_0shot_nocot_genericllmeval_gen_2b9dc2.py
│ │ │ │ ├── aime2024_cascade_eval_gen_5e9f4f.py
│ │ │ │ ├── aime2024_cascade_eval_rawprompt_gen_2f2c96.py
│ │ │ │ ├── aime2024_gen.py
│ │ │ │ ├── aime2024_gen_17d799.py
│ │ │ │ ├── aime2024_gen_6e39a4.py
│ │ │ │ ├── aime2024_llmjudge_gen.py
│ │ │ │ ├── aime2024_llmjudge_gen_5e9f4f.py
│ │ │ │ ├── aime2024_llmverify_repeat16_gen_bf7475.py
│ │ │ │ └── aime2024_llmverify_repeat8_gen_e8fcee.py
│ │ │ ├── aime2025/
│ │ │ │ ├── aime2025_cascade_eval_gen_5e9f4f.py
│ │ │ │ ├── aime2025_cascade_eval_rawprompt_gen_2f2c96.py
│ │ │ │ ├── aime2025_llmjudge_academic.py
│ │ │ │ └── aime2025_llmjudge_gen_5e9f4f.py
│ │ │ ├── aime2026/
│ │ │ │ ├── aime2026_cascade_eval_gen_6ff468.py
│ │ │ │ └── aime2026_cascade_eval_rawprompt_gen_0970dd.py
│ │ │ ├── anli/
│ │ │ │ ├── anli_gen.py
│ │ │ │ ├── anli_gen_fc7328.py
│ │ │ │ ├── anli_ppl.py
│ │ │ │ └── anli_ppl_1d290e.py
│ │ │ ├── anthropics_evals/
│ │ │ │ ├── airisk_gen.py
│ │ │ │ ├── airisk_gen_ba66fc.py
│ │ │ │ ├── persona_gen.py
│ │ │ │ ├── persona_gen_cc72e2.py
│ │ │ │ ├── sycophancy_gen.py
│ │ │ │ └── sycophancy_gen_4bba45.py
│ │ │ ├── apps/
│ │ │ │ ├── README.md
│ │ │ │ ├── apps_gen.py
│ │ │ │ ├── apps_gen_c7893a.py
│ │ │ │ ├── apps_mini_gen.py
│ │ │ │ ├── apps_mini_gen_c7893a.py
│ │ │ │ ├── deprecated_apps_gen_5b4254.py
│ │ │ │ ├── deprecated_apps_gen_7fbb95.py
│ │ │ │ └── deprecated_apps_gen_b4dee3.py
│ │ │ ├── atlas/
│ │ │ │ ├── README.md
│ │ │ │ ├── atlas_gen.py
│ │ │ │ ├── atlas_val_gen_b2d1b6.py
│ │ │ │ └── atlas_val_rawprompt_gen_277bee.py
│ │ │ ├── babilong/
│ │ │ │ ├── README.md
│ │ │ │ ├── babilong_0k_gen.py
│ │ │ │ ├── babilong_128k_gen.py
│ │ │ │ ├── babilong_16k_gen.py
│ │ │ │ ├── babilong_1m_gen.py
│ │ │ │ ├── babilong_256k_gen.py
│ │ │ │ ├── babilong_2k_gen.py
│ │ │ │ ├── babilong_32k_gen.py
│ │ │ │ └── babilong_4k_gen.py
│ │ │ ├── bbeh/
│ │ │ │ ├── README.md
│ │ │ │ ├── bbeh_gen.py
│ │ │ │ ├── bbeh_llmjudge_gen_86c3a0.py
│ │ │ │ └── bbeh_llmjudge_rawprompt_gen_36b5f4.py
│ │ │ ├── bbh/
│ │ │ │ ├── README.md
│ │ │ │ ├── bbh_0shot_nocot_academic_gen.py
│ │ │ │ ├── bbh_0shot_nocot_gen_925fc4.py
│ │ │ │ ├── bbh_0shot_nocot_gen_9c32f6.py
│ │ │ │ ├── bbh_0shot_nocot_gen_ea7952.py
│ │ │ │ ├── bbh_gen.py
│ │ │ │ ├── bbh_gen_2879b0.py
│ │ │ │ ├── bbh_gen_4a31fa.py
│ │ │ │ ├── bbh_gen_5b92b0.py
│ │ │ │ ├── bbh_gen_5bf00b.py
│ │ │ │ ├── bbh_gen_98fba6.py
│ │ │ │ ├── bbh_gen_ee62e9.py
│ │ │ │ ├── bbh_llm_judge_gen.py
│ │ │ │ ├── bbh_llmjudge_gen_b5bdf1.py
│ │ │ │ └── bbh_subset_settings.py
│ │ │ ├── bigcodebench/
│ │ │ │ ├── bigcodebench_full_complete_gen.py
│ │ │ │ ├── bigcodebench_full_complete_gen_faf748.py
│ │ │ │ ├── bigcodebench_full_instruct_gen.py
│ │ │ │ ├── bigcodebench_full_instruct_gen_8815eb.py
│ │ │ │ ├── bigcodebench_full_instruct_repeat_gen_c3d5ad.py
│ │ │ │ ├── bigcodebench_gen.py
│ │ │ │ ├── bigcodebench_hard_complete_gen.py
│ │ │ │ ├── bigcodebench_hard_complete_gen_2888d3.py
│ │ │ │ ├── bigcodebench_hard_complete_gen_faf748.py
│ │ │ │ ├── bigcodebench_hard_complete_rawprompt_gen_95140b.py
│ │ │ │ ├── bigcodebench_hard_instruct_gen.py
│ │ │ │ ├── bigcodebench_hard_instruct_gen_8815eb.py
│ │ │ │ ├── bigcodebench_hard_instruct_gen_c3d5ad.py
│ │ │ │ ├── bigcodebench_hard_instruct_rawprompt_gen_5cbb9f.py
│ │ │ │ └── bigcodebench_hard_instruct_repeat_gen_c3d5ad.py
│ │ │ ├── biodata/
│ │ │ │ ├── biodata_task_gen.py
│ │ │ │ └── biodata_task_rawprompt_gen.py
│ │ │ ├── calm/
│ │ │ │ ├── README.md
│ │ │ │ └── calm.py
│ │ │ ├── ceval/
│ │ │ │ ├── README.md
│ │ │ │ ├── ceval_clean_ppl.py
│ │ │ │ ├── ceval_gen.py
│ │ │ │ ├── ceval_gen_2daf24.py
│ │ │ │ ├── ceval_gen_5f30c7.py
│ │ │ │ ├── ceval_internal_ppl_1cd8bf.py
│ │ │ │ ├── ceval_internal_ppl_93e5ce.py
│ │ │ │ ├── ceval_llm_judge_gen_a162f0.py
│ │ │ │ ├── ceval_ppl.py
│ │ │ │ ├── ceval_ppl_1cd8bf.py
│ │ │ │ ├── ceval_ppl_578f8d.py
│ │ │ │ ├── ceval_ppl_93e5ce.py
│ │ │ │ └── ceval_zero_shot_gen_bd40ef.py
│ │ │ ├── chatobj_custom/
│ │ │ │ └── chatobj_custom_gen.py
│ │ │ ├── chem_exam/
│ │ │ │ ├── competition_gen.py
│ │ │ │ ├── competition_rawprompt_gen.py
│ │ │ │ ├── gaokao_gen.py
│ │ │ │ └── gaokao_rawprompt_gen.py
│ │ │ ├── chinese_simpleqa/
│ │ │ │ ├── README.md
│ │ │ │ └── chinese_simpleqa_gen.py
│ │ │ ├── civilcomments/
│ │ │ │ ├── civilcomments_clp.py
│ │ │ │ ├── civilcomments_clp_6a2561.py
│ │ │ │ └── civilcomments_clp_a3c5fd.py
│ │ │ ├── clozeTest_maxmin/
│ │ │ │ ├── clozeTest_maxmin_gen.py
│ │ │ │ └── clozeTest_maxmin_gen_c205fb.py
│ │ │ ├── cmb/
│ │ │ │ ├── cmb_gen.py
│ │ │ │ └── cmb_gen_dfb5c4.py
│ │ │ ├── cmmlu/
│ │ │ │ ├── cmmlu_0shot_cot_gen_305931.py
│ │ │ │ ├── cmmlu_0shot_nocot_llmjudge_gen_e1cd9a.py
│ │ │ │ ├── cmmlu_gen.py
│ │ │ │ ├── cmmlu_gen_c13365.py
│ │ │ │ ├── cmmlu_llm_judge_gen.py
│ │ │ │ ├── cmmlu_llmjudge_gen_e1cd9a.py
│ │ │ │ ├── cmmlu_llmjudge_rawprompt_gen_9f9c31.py
│ │ │ │ ├── cmmlu_ppl.py
│ │ │ │ ├── cmmlu_ppl_041cbf.py
│ │ │ │ ├── cmmlu_ppl_8b9c76.py
│ │ │ │ ├── cmmlu_stem_0shot_nocot_gen_3653db.py
│ │ │ │ ├── cmmlu_stem_0shot_nocot_llmjudge_gen_3653db.py
│ │ │ │ └── cmmlu_stem_0shot_nocot_xml_gen_3653db.py
│ │ │ ├── cmo_fib/
│ │ │ │ ├── README.md
│ │ │ │ ├── cmo_fib_0shot_notcot_gen_4c6c29.py
│ │ │ │ ├── cmo_fib_gen.py
│ │ │ │ ├── cmo_fib_gen_2783e5.py
│ │ │ │ └── cmo_fib_gen_ace24b.py
│ │ │ ├── codecompass/
│ │ │ │ └── codecompass_gen_079a6c.py
│ │ │ ├── collections/
│ │ │ │ ├── base_core.py
│ │ │ │ ├── base_medium.py
│ │ │ │ ├── base_medium_llama.py
│ │ │ │ ├── base_small.py
│ │ │ │ ├── chat_core.py
│ │ │ │ ├── chat_medium.py
│ │ │ │ ├── chat_small.py
│ │ │ │ ├── example.py
│ │ │ │ └── leaderboard/
│ │ │ │ ├── qwen.py
│ │ │ │ └── qwen_chat.py
│ │ │ ├── commonsenseqa/
│ │ │ │ ├── commonsenseqa_7shot_cot_gen_734a22.py
│ │ │ │ ├── commonsenseqa_gen.py
│ │ │ │ ├── commonsenseqa_gen_1da2d0.py
│ │ │ │ ├── commonsenseqa_gen_c946f2.py
│ │ │ │ ├── commonsenseqa_ppl.py
│ │ │ │ ├── commonsenseqa_ppl_3e9f2d.py
│ │ │ │ ├── commonsenseqa_ppl_5545e2.py
│ │ │ │ ├── commonsenseqa_ppl_716f78.py
│ │ │ │ ├── commonsenseqa_ppl_c49e77.py
│ │ │ │ └── commonsenseqa_ppl_e51e32.py
│ │ │ ├── commonsenseqa_cn/
│ │ │ │ ├── commonsenseqacn_gen.py
│ │ │ │ ├── commonsenseqacn_gen_d380d0.py
│ │ │ │ ├── commonsenseqacn_ppl.py
│ │ │ │ └── commonsenseqacn_ppl_971f48.py
│ │ │ ├── compassbench_20_v1_1/
│ │ │ │ ├── agent/
│ │ │ │ │ ├── cibench_template_gen_e6b12a.py
│ │ │ │ │ └── mus_teval_gen_105c48.py
│ │ │ │ ├── code/
│ │ │ │ │ └── compassbench_v1_1_code_gen_986f01.py
│ │ │ │ ├── knowledge/
│ │ │ │ │ └── compassbench_v1_knowledge_gen_bd74e0.py
│ │ │ │ ├── language/
│ │ │ │ │ └── compassbench_v1_language_gen_7aa06d.py
│ │ │ │ ├── math/
│ │ │ │ │ ├── compassbench_v1_1_math_gen_1dc21d.py
│ │ │ │ │ └── mathbench_prompt.py
│ │ │ │ └── reason/
│ │ │ │ └── compassbench_v1_reason_gen_d26d08.py
│ │ │ ├── compassbench_20_v1_1_public/
│ │ │ │ ├── agent/
│ │ │ │ │ ├── cibench_template_gen_e6b12a.py
│ │ │ │ │ └── mus_teval_gen_105c48.py
│ │ │ │ ├── code/
│ │ │ │ │ └── compassbench_v1_1_code_gen_986f01.py
│ │ │ │ ├── knowledge/
│ │ │ │ │ └── compassbench_v1_knowledge_gen_bd74e0.py
│ │ │ │ ├── language/
│ │ │ │ │ └── compassbench_v1_language_gen_7aa06d.py
│ │ │ │ ├── math/
│ │ │ │ │ ├── compassbench_v1_1_math_gen_1dc21d.py
│ │ │ │ │ └── mathbench_prompt.py
│ │ │ │ └── reason/
│ │ │ │ └── compassbench_v1_reason_gen_d26d08.py
│ │ │ ├── compassbench_v1_3/
│ │ │ │ ├── compassbench_v1_3_code_gen_c8c3aa.py
│ │ │ │ ├── compassbench_v1_3_knowledge.py
│ │ │ │ ├── compassbench_v1_3_math.py
│ │ │ │ ├── compassbench_v1_3_objective_gen.py
│ │ │ │ ├── compassbench_v1_3_objective_gen_068af0.py
│ │ │ │ └── compassbench_v1_3_prompt.py
│ │ │ ├── contamination/
│ │ │ │ ├── ceval_contamination_ppl_810ec6.py
│ │ │ │ ├── mbpp_contamination_ppl_f01cb6.py
│ │ │ │ └── mmlu_contamination_ppl_810ec6.py
│ │ │ ├── crowspairs/
│ │ │ │ ├── crowspairs_gen.py
│ │ │ │ ├── crowspairs_gen_02b6c1.py
│ │ │ │ ├── crowspairs_gen_381af0.py
│ │ │ │ ├── crowspairs_ppl.py
│ │ │ │ ├── crowspairs_ppl_47f211.py
│ │ │ │ └── crowspairs_ppl_e811e1.py
│ │ │ ├── crowspairs_cn/
│ │ │ │ ├── crowspairscn_gen.py
│ │ │ │ ├── crowspairscn_gen_556dc9.py
│ │ │ │ ├── crowspairscn_ppl.py
│ │ │ │ └── crowspairscn_ppl_f53575.py
│ │ │ ├── cvalues/
│ │ │ │ ├── cvalues_responsibility_gen.py
│ │ │ │ └── cvalues_responsibility_gen_543378.py
│ │ │ ├── demo/
│ │ │ │ ├── demo_cmmlu_base_ppl.py
│ │ │ │ ├── demo_cmmlu_chat_gen.py
│ │ │ │ ├── demo_gsm8k_base_gen.py
│ │ │ │ ├── demo_gsm8k_chat_gen.py
│ │ │ │ ├── demo_math_base_gen.py
│ │ │ │ └── demo_math_chat_gen.py
│ │ │ ├── dingo/
│ │ │ │ └── dingo_gen.py
│ │ │ ├── drop/
│ │ │ │ ├── deprecated_drop_gen_8a9ed9.py
│ │ │ │ ├── drop_examples.py
│ │ │ │ ├── drop_gen.py
│ │ │ │ ├── drop_gen_a2697c.py
│ │ │ │ ├── drop_gen_eb14af.py
│ │ │ │ ├── drop_llm_judge_gen.py
│ │ │ │ ├── drop_llmjudge_gen_3857b0.py
│ │ │ │ └── drop_openai_simple_evals_gen_3857b0.py
│ │ │ ├── ds1000/
│ │ │ │ ├── ds1000_compl_gen_cbc84f.py
│ │ │ │ ├── ds1000_compl_service_eval_gen_cbc84f.py
│ │ │ │ ├── ds1000_gen_5c4bec.py
│ │ │ │ ├── ds1000_gen_cbc84f.py
│ │ │ │ └── ds1000_service_eval_gen_cbc84f.py
│ │ │ ├── eese/
│ │ │ │ └── eese_llm_judge_gen.py
│ │ │ ├── flores/
│ │ │ │ ├── flores_gen.py
│ │ │ │ ├── flores_gen_806ede.py
│ │ │ │ └── flores_gen_aad4fd.py
│ │ │ ├── game24/
│ │ │ │ ├── game24_gen.py
│ │ │ │ └── game24_gen_52a460.py
│ │ │ ├── gaokao_math/
│ │ │ │ ├── README.md
│ │ │ │ └── gaokao_math_gen_f5fd28.py
│ │ │ ├── govrepcrs/
│ │ │ │ ├── govrepcrs_gen.py
│ │ │ │ ├── govrepcrs_gen_aa5eb3.py
│ │ │ │ └── govrepcrs_gen_db7930.py
│ │ │ ├── gpqa/
│ │ │ │ ├── README.md
│ │ │ │ ├── gpqa_0shot_nocot_gen_772ea0.py
│ │ │ │ ├── gpqa_0shot_nocot_genericllmeval_gen_772ea0.py
│ │ │ │ ├── gpqa_0shot_nocot_genericllmeval_xml_gen_772ea0.py
│ │ │ │ ├── gpqa_0shot_nocot_llmjudge_gen_772ea0.py
│ │ │ │ ├── gpqa_cascade_eval_academic.py
│ │ │ │ ├── gpqa_cascade_eval_gen_772ea0.py
│ │ │ │ ├── gpqa_cascade_eval_rawprompt_gen_706039.py
│ │ │ │ ├── gpqa_few_shot_ppl_4b5a83.py
│ │ │ │ ├── gpqa_gen.py
│ │ │ │ ├── gpqa_gen_015262.py
│ │ │ │ ├── gpqa_gen_4baadb.py
│ │ │ │ ├── gpqa_llm_judge_gen.py
│ │ │ │ ├── gpqa_openai_simple_evals_gen_5aeece.py
│ │ │ │ └── gpqa_ppl_6bf57a.py
│ │ │ ├── gsm8k/
│ │ │ │ ├── README.md
│ │ │ │ ├── deprecated_gsm8k_agent_gen_be1606.py
│ │ │ │ ├── gsm8k_0shot_gen_a58960.py
│ │ │ │ ├── gsm8k_0shot_nocot_gen_6cbf22.py
│ │ │ │ ├── gsm8k_0shot_v2_gen_17d799.py
│ │ │ │ ├── gsm8k_0shot_v2_gen_6e39a4.py
│ │ │ │ ├── gsm8k_0shot_v2_gen_a58960.py
│ │ │ │ ├── gsm8k_agent_gen_c3dff3.py
│ │ │ │ ├── gsm8k_gen.py
│ │ │ │ ├── gsm8k_gen_17d0dc.py
│ │ │ │ ├── gsm8k_gen_1d7fe4.py
│ │ │ │ ├── gsm8k_gen_1dce88.py
│ │ │ │ ├── gsm8k_gen_3309bd.py
│ │ │ │ ├── gsm8k_gen_57b0b1.py
│ │ │ │ ├── gsm8k_gen_701491.py
│ │ │ │ ├── gsm8k_gen_a3e34a.py
│ │ │ │ ├── gsm8k_gen_d6de81.py
│ │ │ │ ├── gsm8k_gen_e9e91e.py
│ │ │ │ ├── gsm8k_gen_ee684f.py
│ │ │ │ ├── gsm8k_model_postprocess_gen_a58960.py
│ │ │ │ └── gsm8k_xfinder_gen_a58960.py
│ │ │ ├── gsm8k_contamination/
│ │ │ │ └── gsm8k_contamination_ppl_ecdd22.py
│ │ │ ├── gsm_hard/
│ │ │ │ ├── gsmhard_gen.py
│ │ │ │ └── gsmhard_gen_8a1400.py
│ │ │ ├── hellaswag/
│ │ │ │ ├── README.md
│ │ │ │ ├── hellaswag_10shot_gen_e42710.py
│ │ │ │ ├── hellaswag_10shot_ppl_59c85e.py
│ │ │ │ ├── hellaswag_clean_ppl.py
│ │ │ │ ├── hellaswag_gen.py
│ │ │ │ ├── hellaswag_gen_6faab5.py
│ │ │ │ ├── hellaswag_llm_judge_gen.py
│ │ │ │ ├── hellaswag_llmjudge_gen_809ef1.py
│ │ │ │ ├── hellaswag_ppl.py
│ │ │ │ ├── hellaswag_ppl_47bff9.py
│ │ │ │ ├── hellaswag_ppl_7d7f2d.py
│ │ │ │ ├── hellaswag_ppl_9dbb12.py
│ │ │ │ └── hellaswag_ppl_a6e128.py
│ │ │ ├── hmmt2026/
│ │ │ │ ├── hmmt2026_cascade_eval_gen_6ff468.py
│ │ │ │ └── hmmt2026_cascade_eval_rawprompt_gen_0970dd.py
│ │ │ ├── humaneval/
│ │ │ │ ├── README.md
│ │ │ │ ├── deprecated_humaneval_gen_4a6eef.py
│ │ │ │ ├── deprecated_humaneval_gen_6d1cc2.py
│ │ │ │ ├── deprecated_humaneval_gen_a82cae.py
│ │ │ │ ├── deprecated_humaneval_gen_d2537e.py
│ │ │ │ ├── deprecated_humaneval_gen_fd5822.py
│ │ │ │ ├── deprecated_humaneval_gen_ff7054.py
│ │ │ │ ├── humaneval_gen.py
│ │ │ │ ├── humaneval_gen_66a7f4.py
│ │ │ │ ├── humaneval_gen_8e312c.py
│ │ │ │ ├── humaneval_openai_sample_evals_gen_159614.py
│ │ │ │ ├── humaneval_openai_sample_evals_gen_dcae0e.py
│ │ │ │ ├── humaneval_openai_sample_evals_o1_gen_5e7b00.py
│ │ │ │ ├── humaneval_openai_sample_evals_rawprompt_gen_6ce2ca.py
│ │ │ │ ├── humaneval_openai_sample_evals_repeat_gen_dcae0e.py
│ │ │ │ ├── humaneval_passk_gen_8e312c.py
│ │ │ │ ├── humaneval_repeat10_gen_8e312c.py
│ │ │ │ ├── internal_humaneval_gen_ce6b06.py
│ │ │ │ └── internal_humaneval_gen_d2537e.py
│ │ │ ├── humaneval_cn/
│ │ │ │ ├── humaneval_cn_gen.py
│ │ │ │ ├── humaneval_cn_gen_6313aa.py
│ │ │ │ ├── humaneval_cn_passk_gen_6313aa.py
│ │ │ │ └── humaneval_cn_repeat10_gen_6313aa.py
│ │ │ ├── humaneval_multi/
│ │ │ │ ├── humaneval_multi_gen.py
│ │ │ │ └── humaneval_multi_gen_82cf85.py
│ │ │ ├── humaneval_plus/
│ │ │ │ ├── humaneval_plus_gen.py
│ │ │ │ ├── humaneval_plus_gen_66a7f4.py
│ │ │ │ ├── humaneval_plus_gen_8e312c.py
│ │ │ │ ├── humaneval_plus_openai_simple_evals_gen_159614.py
│ │ │ │ ├── humaneval_plus_passk_gen_8e312c.py
│ │ │ │ ├── humaneval_plus_repeat10_gen_8e312c.py
│ │ │ │ └── humaneval_plus_repeat_gen_41b01c.py
│ │ │ ├── humaneval_pro/
│ │ │ │ ├── README.md
│ │ │ │ ├── humaneval_pro_gen.py
│ │ │ │ ├── humaneval_pro_gen_3dc067.py
│ │ │ │ └── humaneval_pro_repeat_gen_3dc067.py
│ │ │ ├── humanevalx/
│ │ │ │ ├── humanevalx_0shot_nocot_gen_3e4bbd.py
│ │ │ │ ├── humanevalx_gen.py
│ │ │ │ ├── humanevalx_gen_0af626.py
│ │ │ │ ├── humanevalx_gen_3d84a3.py
│ │ │ │ ├── humanevalx_gen_620cfa.py
│ │ │ │ └── humanevalx_repeat_gen_3d84a3.py
│ │ │ ├── hungarian_exam/
│ │ │ │ ├── hungarian_exam_gen.py
│ │ │ │ └── hungarian_exam_gen_8a1435.py
│ │ │ ├── inference_ppl/
│ │ │ │ ├── README.md
│ │ │ │ └── inference_ppl.py
│ │ │ ├── infinitebench/
│ │ │ │ ├── infinitebench.py
│ │ │ │ ├── infinitebenchcodedebug/
│ │ │ │ │ ├── infinitebench_codedebug_gen.py
│ │ │ │ │ └── infinitebench_codedebug_gen_276a42.py
│ │ │ │ ├── infinitebenchcoderun/
│ │ │ │ │ ├── infinitebench_coderun_gen.py
│ │ │ │ │ └── infinitebench_coderun_gen_1a76bd.py
│ │ │ │ ├── infinitebenchendia/
│ │ │ │ │ ├── infinitebench_endia_gen.py
│ │ │ │ │ └── infinitebench_endia_gen_c96eb5.py
│ │ │ │ ├── infinitebenchenmc/
│ │ │ │ │ ├── infinitebench_enmc_gen.py
│ │ │ │ │ └── infinitebench_enmc_gen_3a4102.py
│ │ │ │ ├── infinitebenchenqa/
│ │ │ │ │ ├── infinitebench_enqa_gen.py
│ │ │ │ │ └── infinitebench_enqa_gen_a1640c.py
│ │ │ │ ├── infinitebenchensum/
│ │ │ │ │ ├── infinitebench_ensum_gen.py
│ │ │ │ │ └── infinitebench_ensum_gen_cfbc08.py
│ │ │ │ ├── infinitebenchmathcalc/
│ │ │ │ │ ├── infinitebench_mathcalc_gen.py
│ │ │ │ │ └── infinitebench_mathcalc_gen_78d17e.py
│ │ │ │ ├── infinitebenchmathfind/
│ │ │ │ │ ├── infinitebench_mathfind_gen.py
│ │ │ │ │ └── infinitebench_mathfind_gen_6d799e.py
│ │ │ │ ├── infinitebenchretrievekv/
│ │ │ │ │ ├── infinitebench_retrievekv_gen.py
│ │ │ │ │ └── infinitebench_retrievekv_gen_06b3ac.py
│ │ │ │ ├── infinitebenchretrievenumber/
│ │ │ │ │ ├── infinitebench_retrievenumber_gen.py
│ │ │ │ │ └── infinitebench_retrievenumber_gen_047436.py
│ │ │ │ ├── infinitebenchretrievepasskey/
│ │ │ │ │ ├── infinitebench_retrievepasskey_gen.py
│ │ │ │ │ └── infinitebench_retrievepasskey_gen_62ff68.py
│ │ │ │ └── infinitebenchzhqa/
│ │ │ │ ├── infinitebench_zhqa_gen.py
│ │ │ │ └── infinitebench_zhqa_gen_1e5293.py
│ │ │ ├── internsandbox/
│ │ │ │ ├── internsandbox_gen.py
│ │ │ │ └── internsandbox_gen_44b982.py
│ │ │ ├── iwslt2017/
│ │ │ │ ├── iwslt2017_gen.py
│ │ │ │ ├── iwslt2017_gen_69ce16.py
│ │ │ │ ├── iwslt2017_gen_b4a814.py
│ │ │ │ └── iwslt2017_gen_d0ebd1.py
│ │ │ ├── jigsawmultilingual/
│ │ │ │ ├── jigsawmultilingual_clp.py
│ │ │ │ ├── jigsawmultilingual_clp_1af0ae.py
│ │ │ │ └── jigsawmultilingual_clp_fe50d8.py
│ │ │ ├── judge/
│ │ │ │ ├── judgebench.py
│ │ │ │ ├── judgerbenchv2.py
│ │ │ │ ├── rewardbench.py
│ │ │ │ └── rmb.py
│ │ │ ├── kaoshi/
│ │ │ │ ├── kaoshi_gen.py
│ │ │ │ └── kaoshi_gen_86aca2.py
│ │ │ ├── kcle/
│ │ │ │ ├── kcle_llm_judge_gen.py
│ │ │ │ ├── kcle_llm_judge_gen_60327a.py
│ │ │ │ └── kcle_llm_judge_rawprompt_gen_16e383.py
│ │ │ ├── korbench/
│ │ │ │ ├── korbench_gen.py
│ │ │ │ ├── korbench_llm_judge_gen.py
│ │ │ │ ├── korbench_llmjudge_gen_17854d.py
│ │ │ │ ├── korbench_llmjudge_gen_56cf43.py
│ │ │ │ ├── korbench_mixed_gen_d00bdd.py
│ │ │ │ ├── korbench_single_0_shot_gen.py
│ │ │ │ ├── korbench_single_0shot_cascade_eval_gen_56cf43.py
│ │ │ │ ├── korbench_single_0shot_cascade_eval_rawprompt_gen_c048da.py
│ │ │ │ ├── korbench_single_0shot_genericllmeval_gen_17854d.py
│ │ │ │ ├── korbench_single_0shot_llmjudge_gen.py
│ │ │ │ ├── korbench_single_3_shot_gen.py
│ │ │ │ └── readme.md
│ │ │ ├── lambada/
│ │ │ │ ├── lambada_gen.py
│ │ │ │ ├── lambada_gen_217e11.py
│ │ │ │ └── lambada_gen_8b48a5.py
│ │ │ ├── lawbench/
│ │ │ │ ├── lawbench_one_shot_gen_002588.py
│ │ │ │ └── lawbench_zero_shot_gen_002588.py
│ │ │ ├── lcsts/
│ │ │ │ ├── lcsts_gen.py
│ │ │ │ ├── lcsts_gen_8ee1fe.py
│ │ │ │ └── lcsts_gen_9b0b89.py
│ │ │ ├── leval/
│ │ │ │ ├── leval.py
│ │ │ │ ├── levalcoursera/
│ │ │ │ │ ├── leval_coursera_gen.py
│ │ │ │ │ └── leval_coursera_gen_36a006.py
│ │ │ │ ├── levalfinancialqa/
│ │ │ │ │ ├── leval_financialqa_gen.py
│ │ │ │ │ └── leval_financialqa_gen_b03798.py
│ │ │ │ ├── levalgovreportsumm/
│ │ │ │ │ ├── leval_gov_report_summ_gen.py
│ │ │ │ │ └── leval_gov_report_summ_gen_b03798.py
│ │ │ │ ├── levalgsm100/
│ │ │ │ │ ├── leval_gsm100_gen.py
│ │ │ │ │ └── leval_gsm100_gen_77dd94.py
│ │ │ │ ├── levallegalcontractqa/
│ │ │ │ │ ├── leval_legalcontractqa_gen.py
│ │ │ │ │ └── leval_legalcontractqa_gen_68a2ac.py
│ │ │ │ ├── levalmeetingsumm/
│ │ │ │ │ ├── leval_meetingsumm_gen.py
│ │ │ │ │ └── leval_meetingsumm_gen_b03798.py
│ │ │ │ ├── levalmultidocqa/
│ │ │ │ │ ├── leval_multidocqa_gen.py
│ │ │ │ │ └── leval_multidocqa_gen_96bf3f.py
│ │ │ │ ├── levalnarrativeqa/
│ │ │ │ │ ├── leval_narrativeqa_gen.py
│ │ │ │ │ └── leval_narrativeqa_gen_766dd0.py
│ │ │ │ ├── levalnaturalquestion/
│ │ │ │ │ ├── leval_naturalquestion_gen.py
│ │ │ │ │ └── leval_naturalquestion_gen_52c33f.py
│ │ │ │ ├── levalnewssumm/
│ │ │ │ │ ├── leval_newssumm_gen.py
│ │ │ │ │ └── leval_newssumm_gen_b03798.py
│ │ │ │ ├── levalpaperassistant/
│ │ │ │ │ ├── leval_paper_assistant_gen.py
│ │ │ │ │ └── leval_paper_assistant_gen_b03798.py
│ │ │ │ ├── levalpatentsumm/
│ │ │ │ │ ├── leval_patent_summ_gen.py
│ │ │ │ │ └── leval_patent_summ_gen_b03798.py
│ │ │ │ ├── levalquality/
│ │ │ │ │ ├── leval_quality_gen.py
│ │ │ │ │ └── leval_quality_gen_36a006.py
│ │ │ │ ├── levalreviewsumm/
│ │ │ │ │ ├── leval_review_summ_gen.py
│ │ │ │ │ └── leval_review_summ_gen_b03798.py
│ │ │ │ ├── levalscientificqa/
│ │ │ │ │ ├── leval_scientificqa_gen.py
│ │ │ │ │ └── leval_scientificqa_gen_96bf3f.py
│ │ │ │ ├── levaltopicretrieval/
│ │ │ │ │ ├── leval_topic_retrieval_gen.py
│ │ │ │ │ └── leval_topic_retrieval_gen_bf433f.py
│ │ │ │ ├── levaltpo/
│ │ │ │ │ ├── leval_tpo_gen.py
│ │ │ │ │ └── leval_tpo_gen_36a006.py
│ │ │ │ └── levaltvshowsumm/
│ │ │ │ ├── leval_tvshow_summ_gen.py
│ │ │ │ └── leval_tvshow_summ_gen_b03798.py
│ │ │ ├── livecodebench/
│ │ │ │ ├── README.md
│ │ │ │ ├── livecodebench_code_generation_repeat_gen_b5b6c5.py
│ │ │ │ ├── livecodebench_gen.py
│ │ │ │ ├── livecodebench_gen_6966bc.py
│ │ │ │ ├── livecodebench_gen_a4f90b.py
│ │ │ │ ├── livecodebench_gen_b2b0fd.py
│ │ │ │ ├── livecodebench_o1_gen_f0ed6c.py
│ │ │ │ ├── livecodebench_rawprompt_gen_c09673.py
│ │ │ │ ├── livecodebench_split_v4_o1_gen_f0ed6c.py
│ │ │ │ ├── livecodebench_split_v4_o1_postprocess_gen_f0ed6c.py
│ │ │ │ ├── livecodebench_time_split_gen_a4f90b.py
│ │ │ │ ├── livecodebench_v1_o1_gen_f0ed6c.py
│ │ │ │ └── livecodebench_v6_academic.py
│ │ │ ├── livecodebench_pro/
│ │ │ │ ├── livecodebench_pro_gen.py
│ │ │ │ └── livecodebench_pro_rawprompt_gen.py
│ │ │ ├── livemathbench/
│ │ │ │ ├── README.md
│ │ │ │ ├── livemathbench_gen.py
│ │ │ │ ├── livemathbench_gen_6eb711.py
│ │ │ │ ├── livemathbench_gen_9befbf.py
│ │ │ │ ├── livemathbench_gen_caed8f.py
│ │ │ │ ├── livemathbench_greedy_gen.py
│ │ │ │ ├── livemathbench_greedy_gen_9befbf.py
│ │ │ │ ├── livemathbench_hard_custom_cascade_eval_gen_4bce59.py
│ │ │ │ ├── livemathbench_hard_custom_cascade_eval_rawprompt_gen_e1ce64.py
│ │ │ │ ├── livemathbench_hard_custom_llmverify_gen_85d0ef.py
│ │ │ │ ├── livemathbench_hard_gen_353ae7.py
│ │ │ │ ├── livemathbench_hard_greedy_gen_353ae7.py
│ │ │ │ ├── livemathbench_hard_llmjudge_gen_71eaf5.py
│ │ │ │ ├── livemathbench_v202505_gen_9befbf.py
│ │ │ │ ├── livemathbench_v202505_greedy_gen_9befbf.py
│ │ │ │ ├── livemathbench_v202505_hard_gen_353ae7.py
│ │ │ │ └── livemathbench_v202505_hard_greedy_gen_353ae7.py
│ │ │ ├── livereasonbench/
│ │ │ │ ├── livereasonbench_gen.py
│ │ │ │ ├── livereasonbench_gen_f990de.py
│ │ │ │ ├── livereasonbench_genericllmeval_gen_f990de.py
│ │ │ │ └── livereasonbench_llmverify_20250428_gen_0484cb.py
│ │ │ ├── livestembench/
│ │ │ │ ├── livestembench_0shot_noncot_gen_2e6d10.py
│ │ │ │ ├── livestembench_0shot_noncot_xml_gen_2e6d10.py
│ │ │ │ ├── livestembench_gen.py
│ │ │ │ └── livestembench_gen_3e3c50.py
│ │ │ ├── llm_compression/
│ │ │ │ ├── README.md
│ │ │ │ └── llm_compression.py
│ │ │ ├── longbench/
│ │ │ │ ├── longbench.py
│ │ │ │ ├── longbench2wikimqa/
│ │ │ │ │ ├── longbench_2wikimqa_gen.py
│ │ │ │ │ └── longbench_2wikimqa_gen_6b3efc.py
│ │ │ │ ├── longbenchdureader/
│ │ │ │ │ ├── longbench_dureader_gen.py
│ │ │ │ │ └── longbench_dureader_gen_c6c7e4.py
│ │ │ │ ├── longbenchgov_report/
│ │ │ │ │ ├── longbench_gov_report_gen.py
│ │ │ │ │ └── longbench_gov_report_gen_54c5b0.py
│ │ │ │ ├── longbenchhotpotqa/
│ │ │ │ │ ├── longbench_hotpotqa_gen.py
│ │ │ │ │ └── longbench_hotpotqa_gen_6b3efc.py
│ │ │ │ ├── longbenchlcc/
│ │ │ │ │ ├── longbench_lcc_gen.py
│ │ │ │ │ └── longbench_lcc_gen_6ba507.py
│ │ │ │ ├── longbenchlsht/
│ │ │ │ │ ├── longbench_lsht_gen.py
│ │ │ │ │ └── longbench_lsht_gen_e8a339.py
│ │ │ │ ├── longbenchmulti_news/
│ │ │ │ │ ├── longbench_multi_news_gen.py
│ │ │ │ │ └── longbench_multi_news_gen_6f9da9.py
│ │ │ │ ├── longbenchmultifieldqa_en/
│ │ │ │ │ ├── longbench_multifieldqa_en_gen.py
│ │ │ │ │ └── longbench_multifieldqa_en_gen_d3838e.py
│ │ │ │ ├── longbenchmultifieldqa_zh/
│ │ │ │ │ ├── longbench_multifieldqa_zh_gen.py
│ │ │ │ │ └── longbench_multifieldqa_zh_gen_e9a7ef.py
│ │ │ │ ├── longbenchmusique/
│ │ │ │ │ ├── longbench_musique_gen.py
│ │ │ │ │ └── longbench_musique_gen_6b3efc.py
│ │ │ │ ├── longbenchnarrativeqa/
│ │ │ │ │ ├── longbench_narrativeqa_gen.py
│ │ │ │ │ └── longbench_narrativeqa_gen_a68305.py
│ │ │ │ ├── longbenchpassage_count/
│ │ │ │ │ ├── longbench_passage_count_gen.py
│ │ │ │ │ └── longbench_passage_count_gen_dcdaab.py
│ │ │ │ ├── longbenchpassage_retrieval_en/
│ │ │ │ │ ├── longbench_passage_retrieval_en_gen.py
│ │ │ │ │ └── longbench_passage_retrieval_en_gen_734db5.py
│ │ │ │ ├── longbenchpassage_retrieval_zh/
│ │ │ │ │ ├── longbench_passage_retrieval_zh_gen.py
│ │ │ │ │ └── longbench_passage_retrieval_zh_gen_01cca2.py
│ │ │ │ ├── longbenchqasper/
│ │ │ │ │ ├── longbench_qasper_gen.py
│ │ │ │ │ └── longbench_qasper_gen_6b3efc.py
│ │ │ │ ├── longbenchqmsum/
│ │ │ │ │ ├── longbench_qmsum_gen.py
│ │ │ │ │ └── longbench_qmsum_gen_d33331.py
│ │ │ │ ├── longbenchrepobench/
│ │ │ │ │ ├── longbench_repobench_gen.py
│ │ │ │ │ └── longbench_repobench_gen_6df953.py
│ │ │ │ ├── longbenchsamsum/
│ │ │ │ │ ├── longbench_samsum_gen.py
│ │ │ │ │ └── longbench_samsum_gen_f4416d.py
│ │ │ │ ├── longbenchtrec/
│ │ │ │ │ ├── longbench_trec_gen.py
│ │ │ │ │ └── longbench_trec_gen_824187.py
│ │ │ │ ├── longbenchtriviaqa/
│ │ │ │ │ ├── longbench_triviaqa_gen.py
│ │ │ │ │ └── longbench_triviaqa_gen_d30cb9.py
│ │ │ │ └── longbenchvcsum/
│ │ │ │ ├── longbench_vcsum_gen.py
│ │ │ │ └── longbench_vcsum_gen_f7a8ac.py
│ │ │ ├── longbenchv2/
│ │ │ │ ├── longbenchv2_gen.py
│ │ │ │ └── longbenchv2_gen_75fbba.py
│ │ │ ├── lveval/
│ │ │ │ ├── lveval.md
│ │ │ │ ├── lveval.py
│ │ │ │ ├── lvevalcmrc_mixup/
│ │ │ │ │ ├── lveval_cmrc_mixup_gen.py
│ │ │ │ │ └── lveval_cmrc_mixup_gen_465823.py
│ │ │ │ ├── lvevaldureader_mixup/
│ │ │ │ │ ├── lveval_dureader_mixup_gen.py
│ │ │ │ │ └── lveval_dureader_mixup_gen_465823.py
│ │ │ │ ├── lvevalfactrecall_en/
│ │ │ │ │ ├── lveval_factrecall_en_gen.py
│ │ │ │ │ └── lveval_factrecall_en_gen_9a836f.py
│ │ │ │ ├── lvevalfactrecall_zh/
│ │ │ │ │ ├── lveval_factrecall_zh_gen.py
│ │ │ │ │ └── lveval_factrecall_zh_gen_dbee70.py
│ │ │ │ ├── lvevalhotpotwikiqa_mixup/
│ │ │ │ │ ├── lveval_hotpotwikiqa_mixup_gen.py
│ │ │ │ │ └── lveval_hotpotwikiqa_mixup_gen_77ce82.py
│ │ │ │ ├── lvevallic_mixup/
│ │ │ │ │ ├── lveval_lic_mixup_gen.py
│ │ │ │ │ └── lveval_lic_mixup_gen_01eb0c.py
│ │ │ │ ├── lvevalloogle_CR_mixup/
│ │ │ │ │ ├── lveval_loogle_CR_mixup_gen.py
│ │ │ │ │ └── lveval_loogle_CR_mixup_gen_d7ea36.py
│ │ │ │ ├── lvevalloogle_MIR_mixup/
│ │ │ │ │ ├── lveval_loogle_MIR_mixup_gen.py
│ │ │ │ │ └── lveval_loogle_MIR_mixup_gen_d7ea36.py
│ │ │ │ ├── lvevalloogle_SD_mixup/
│ │ │ │ │ ├── lveval_loogle_SD_mixup_gen.py
│ │ │ │ │ └── lveval_loogle_SD_mixup_gen_d7ea36.py
│ │ │ │ ├── lvevalmultifieldqa_en_mixup/
│ │ │ │ │ ├── lveval_multifieldqa_en_mixup_gen.py
│ │ │ │ │ └── lveval_multifieldqa_en_mixup_gen_d7ea36.py
│ │ │ │ └── lvevalmultifieldqa_zh_mixup/
│ │ │ │ ├── lveval_multifieldqa_zh_mixup_gen.py
│ │ │ │ └── lveval_multifieldqa_zh_mixup_gen_0fbdad.py
│ │ │ ├── mastermath2024v1/
│ │ │ │ ├── mastermath2024v1_gen.py
│ │ │ │ └── mastermath2024v1_gen_be6318.py
│ │ │ ├── matbench/
│ │ │ │ ├── matbench_gen.py
│ │ │ │ ├── matbench_gen_f71840.py
│ │ │ │ ├── matbench_llm_judge_gen_0e9276.py
│ │ │ │ ├── matbench_llm_judge_rawprompt_gen_c987b6.py
│ │ │ │ └── matbench_regex_judge_gen_0e9276.py
│ │ │ ├── math/
│ │ │ │ ├── README.md
│ │ │ │ ├── deprecated_math_agent_evaluatorv2_gen_861b4f.py
│ │ │ │ ├── deprecated_math_evaluatorv2_gen_265cce.py
│ │ │ │ ├── math_0shot_gen_11c4b5.py
│ │ │ │ ├── math_0shot_gen_393424.py
│ │ │ │ ├── math_0shot_llm_judge_gen_393424.py
│ │ │ │ ├── math_0shot_llm_judge_v2_gen_31d777.py
│ │ │ │ ├── math_4shot_base_gen_43d5b6.py
│ │ │ │ ├── math_4shot_base_gen_db136b.py
│ │ │ │ ├── math_4shot_example_from_google_research.py
│ │ │ │ ├── math_500_cascade_eval_gen_6ff468.py
│ │ │ │ ├── math_500_cascade_eval_rawprompt_gen_0970dd.py
│ │ │ │ ├── math_500_gen.py
│ │ │ │ ├── math_500_llmjudge_gen_6ff468.py
│ │ │ │ ├── math_agent_evaluatorv2_gen_0c1b4e.py
│ │ │ │ ├── math_agent_gen_0c1b4e.py
│ │ │ │ ├── math_agent_gen_861b4f.py
│ │ │ │ ├── math_agent_gen_af2293.py
│ │ │ │ ├── math_evaluatorv2_gen_2f4a71.py
│ │ │ │ ├── math_evaluatorv2_gen_cecb31.py
│ │ │ │ ├── math_gen.py
│ │ │ │ ├── math_gen_0957ff.py
│ │ │ │ ├── math_gen_1ed9c2.py
│ │ │ │ ├── math_gen_265cce.py
│ │ │ │ ├── math_gen_559593.py
│ │ │ │ ├── math_gen_5e8458.py
│ │ │ │ ├── math_gen_736506.py
│ │ │ │ ├── math_gen_78ced2.py
│ │ │ │ ├── math_gen_943d32.py
│ │ │ │ ├── math_gen_a58d9d.py
│ │ │ │ ├── math_intern_evaluator_gen_265cce.py
│ │ │ │ ├── math_llm_judge_gen.py
│ │ │ │ ├── math_llm_judge_gen_56606f.py
│ │ │ │ ├── math_prm800k_500_0shot_cot_academic_gen.py
│ │ │ │ ├── math_prm800k_500_0shot_cot_gen.py
│ │ │ │ ├── math_prm800k_500_0shot_cot_gen_11c4b5.py
│ │ │ │ ├── math_prm800k_500_0shot_nocot_gen_b27274.py
│ │ │ │ ├── math_prm800k_500_0shot_nocot_genericllmeval_gen_63a000.py
│ │ │ │ ├── math_prm800k_500_0shot_nocot_genericllmeval_gen_6ff468.py
│ │ │ │ ├── math_prm800k_500_0shot_nocot_genericllmeval_xml_gen_63a000.py
│ │ │ │ ├── math_prm800k_500_0shot_nocot_llmjudge_gen_63a000.py
│ │ │ │ ├── math_prm800k_500_gen.py
│ │ │ │ ├── math_prm800k_500_gen_393424.py
│ │ │ │ ├── math_prm800k_500_llm_judge_gen.py
│ │ │ │ ├── math_prm800k_500_llmverify_gen_6ff468.py
│ │ │ │ └── math_prm800k_500_llmverify_repeat4_gen_97b203.py
│ │ │ ├── math401/
│ │ │ │ ├── math401_gen.py
│ │ │ │ └── math401_gen_ab5f39.py
│ │ │ ├── mbpp/
│ │ │ │ ├── README.md
│ │ │ │ ├── deprecated_mbpp_gen_1e1056.py
│ │ │ │ ├── deprecated_mbpp_gen_6590b0.py
│ │ │ │ ├── deprecated_mbpp_gen_caa7ab.py
│ │ │ │ ├── deprecated_mbpp_passk_gen_1e1056.py
│ │ │ │ ├── deprecated_mbpp_repeat10_gen_1e1056.py
│ │ │ │ ├── deprecated_sanitized_mbpp_gen_1e1056.py
│ │ │ │ ├── deprecated_sanitized_mbpp_gen_cb43ef.py
│ │ │ │ ├── deprecated_sanitized_mbpp_passk_gen_1e1056.py
│ │ │ │ ├── deprecated_sanitized_mbpp_repeat10_gen_1e1056.py
│ │ │ │ ├── mbpp_gen.py
│ │ │ │ ├── mbpp_gen_830460.py
│ │ │ │ ├── mbpp_passk_gen_830460.py
│ │ │ │ ├── mbpp_repeat10_gen_830460.py
│ │ │ │ ├── mbpp_repeat_gen_18dd1b.py
│ │ │ │ ├── sanitized_mbpp_gen_742f0c.py
│ │ │ │ ├── sanitized_mbpp_gen_830460.py
│ │ │ │ ├── sanitized_mbpp_gen_a0fc46.py
│ │ │ │ ├── sanitized_mbpp_mdblock_0shot_nocot_gen_a2e416.py
│ │ │ │ ├── sanitized_mbpp_mdblock_0shot_nocot_rawprompt_gen_30c1e5.py
│ │ │ │ ├── sanitized_mbpp_mdblock_gen_a447ff.py
│ │ │ │ ├── sanitized_mbpp_passk_gen_830460.py
│ │ │ │ └── sanitized_mbpp_repeat10_gen_830460.py
│ │ │ ├── mbpp_cn/
│ │ │ │ ├── deprecated_mbpp_cn_gen_1d1481.py
│ │ │ │ ├── deprecated_mbpp_cn_passk_gen_1d1481.py
│ │ │ │ ├── deprecated_mbpp_cn_repeat10_gen_1d1481.py
│ │ │ │ ├── mbpp_cn_gen.py
│ │ │ │ └── mbpp_cn_gen_9114d5.py
│ │ │ ├── mbpp_plus/
│ │ │ │ ├── deprecated_mbpp_plus_gen_94815c.py
│ │ │ │ ├── mbpp_plus_gen.py
│ │ │ │ └── mbpp_plus_gen_0b836a.py
│ │ │ ├── mbpp_pro/
│ │ │ │ ├── README.md
│ │ │ │ ├── mbpp_pro_gen.py
│ │ │ │ ├── mbpp_pro_gen_3dc067.py
│ │ │ │ └── mbpp_pro_repeat_gen_3dc067.py
│ │ │ ├── medmcqa/
│ │ │ │ ├── medmcqa_gen.py
│ │ │ │ ├── medmcqa_gen_60c8f5.py
│ │ │ │ ├── medmcqa_llmjudge_gen.py
│ │ │ │ ├── medmcqa_llmjudge_gen_60c8f5.py
│ │ │ │ └── medmcqa_llmjudge_rawprompt_gen_015178.py
│ │ │ ├── mgsm/
│ │ │ │ ├── README.md
│ │ │ │ ├── mgsm_gen.py
│ │ │ │ └── mgsm_gen_d967bc.py
│ │ │ ├── mmlu/
│ │ │ │ ├── README.md
│ │ │ │ ├── mmlu_all_sets.py
│ │ │ │ ├── mmlu_clean_ppl.py
│ │ │ │ ├── mmlu_gen.py
│ │ │ │ ├── mmlu_gen_23a9a9.py
│ │ │ │ ├── mmlu_gen_4d595a.py
│ │ │ │ ├── mmlu_gen_5d1409.py
│ │ │ │ ├── mmlu_gen_79e572.py
│ │ │ │ ├── mmlu_gen_a484b3.py
│ │ │ │ ├── mmlu_llm_judge_gen.py
│ │ │ │ ├── mmlu_llmjudge_gen_f4336b.py
│ │ │ │ ├── mmlu_llmjudge_rawprompt_gen_af67f0.py
│ │ │ │ ├── mmlu_model_postprocess_gen_4d595a.py
│ │ │ │ ├── mmlu_openai_0shot_nocot_llmjudge_gen_216503.py
│ │ │ │ ├── mmlu_openai_simple_evals_gen_b618ea.py
│ │ │ │ ├── mmlu_ppl.py
│ │ │ │ ├── mmlu_ppl_ac766d.py
│ │ │ │ ├── mmlu_stem_0shot_cascade_eval_gen_216503.py
│ │ │ │ ├── mmlu_stem_0shot_gen_216503.py
│ │ │ │ ├── mmlu_stem_0shot_xml_gen_216503.py
│ │ │ │ ├── mmlu_stem_sets.py
│ │ │ │ ├── mmlu_xfinder_gen_4d595a.py
│ │ │ │ └── mmlu_zero_shot_gen_47e2c0.py
│ │ │ ├── mmlu_cf/
│ │ │ │ ├── mmlu_cf_categories.py
│ │ │ │ ├── mmlu_cf_few_shot.py
│ │ │ │ ├── mmlu_cf_gen.py
│ │ │ │ ├── mmlu_cf_gen_040615.py
│ │ │ │ └── mmlu_cf_zero_shot.py
│ │ │ ├── mmlu_pro/
│ │ │ │ ├── mmlu_pro_0shot_cot_gen_08c1de.py
│ │ │ │ ├── mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de.py
│ │ │ │ ├── mmlu_pro_0shot_nocot_genericllmeval_rawprompt_gen_0321fb.py
│ │ │ │ ├── mmlu_pro_biomed_0shot_cot_gen_057927.py
│ │ │ │ ├── mmlu_pro_biomed_0shot_nocot_genericllmeval_gen_057927.py
│ │ │ │ ├── mmlu_pro_categories.py
│ │ │ │ ├── mmlu_pro_few_shot_gen_bfaf90.py
│ │ │ │ ├── mmlu_pro_gen.py
│ │ │ │ ├── mmlu_pro_gen_cdbebf.py
│ │ │ │ └── mmlu_pro_llm_judge_gen.py
│ │ │ ├── mmmlu/
│ │ │ │ ├── README.md
│ │ │ │ ├── mmmlu_5_shot_gen_bcbeb3.py
│ │ │ │ ├── mmmlu_gen.py
│ │ │ │ ├── mmmlu_gen_c51a84.py
│ │ │ │ └── mmmlu_prompt.py
│ │ │ ├── mmmlu_lite/
│ │ │ │ ├── README.md
│ │ │ │ ├── mmmlu_lite_gen.py
│ │ │ │ └── mmmlu_lite_gen_c51a84.py
│ │ │ ├── multipl_e/
│ │ │ │ ├── multiple_gen.py
│ │ │ │ ├── multiple_top_ten_gen_f44aaf.py
│ │ │ │ └── multiple_top_ten_repeat_gen_0cd6ce.py
│ │ │ ├── musr/
│ │ │ │ ├── README.md
│ │ │ │ ├── musr_gen.py
│ │ │ │ ├── musr_gen_3622bb.py
│ │ │ │ ├── musr_gen_3c6e15.py
│ │ │ │ ├── musr_gen_b47fd3.py
│ │ │ │ ├── musr_llm_judge_gen.py
│ │ │ │ └── musr_llmjudge_gen_b47fd3.py
│ │ │ ├── narrativeqa/
│ │ │ │ ├── narrativeqa_gen.py
│ │ │ │ ├── narrativeqa_gen_a2d88a.py
│ │ │ │ └── narrativeqa_gen_db6413.py
│ │ │ ├── needlebench/
│ │ │ │ ├── atc/
│ │ │ │ │ ├── atc.py
│ │ │ │ │ ├── atc_choice.py
│ │ │ │ │ ├── atc_choice_20.py
│ │ │ │ │ ├── atc_choice_50.py
│ │ │ │ │ ├── atc_choice_50_en_reasoning.py
│ │ │ │ │ ├── atc_choice_80.py
│ │ │ │ │ └── atc_choice_80_en_reasoning.py
│ │ │ │ ├── needlebench_1000k/
│ │ │ │ │ ├── needlebench_1000k.py
│ │ │ │ │ ├── needlebench_multi_reasoning_1000k.py
│ │ │ │ │ ├── needlebench_multi_retrieval_1000k.py
│ │ │ │ │ └── needlebench_single_1000k.py
│ │ │ │ ├── needlebench_128k/
│ │ │ │ │ ├── needlebench_128k.py
│ │ │ │ │ ├── needlebench_multi_reasoning_128k.py
│ │ │ │ │ ├── needlebench_multi_retrieval_128k.py
│ │ │ │ │ └── needlebench_single_128k.py
│ │ │ │ ├── needlebench_200k/
│ │ │ │ │ ├── needlebench_200k.py
│ │ │ │ │ ├── needlebench_multi_reasoning_200k.py
│ │ │ │ │ ├── needlebench_multi_retrieval_200k.py
│ │ │ │ │ └── needlebench_single_200k.py
│ │ │ │ ├── needlebench_256k/
│ │ │ │ │ ├── needlebench_256k.py
│ │ │ │ │ ├── needlebench_multi_reasoning_256k.py
│ │ │ │ │ ├── needlebench_multi_retrieval_256k.py
│ │ │ │ │ └── needlebench_single_256k.py
│ │ │ │ ├── needlebench_32k/
│ │ │ │ │ ├── needlebench_32k.py
│ │ │ │ │ ├── needlebench_multi_reasoning_32k.py
│ │ │ │ │ ├── needlebench_multi_retrieval_32k.py
│ │ │ │ │ └── needlebench_single_32k.py
│ │ │ │ ├── needlebench_4k/
│ │ │ │ │ ├── needlebench_4k.py
│ │ │ │ │ ├── needlebench_multi_reasoning_4k.py
│ │ │ │ │ ├── needlebench_multi_retrieval_4k.py
│ │ │ │ │ └── needlebench_single_4k.py
│ │ │ │ ├── needlebench_8k/
│ │ │ │ │ ├── needlebench_8k.py
│ │ │ │ │ ├── needlebench_multi_reasoning_8k.py
│ │ │ │ │ ├── needlebench_multi_retrieval_8k.py
│ │ │ │ │ ├── needlebench_multi_retrieval_compare_batch_8k.py
│ │ │ │ │ └── needlebench_single_8k.py
│ │ │ │ ├── needlebench_base/
│ │ │ │ │ ├── needlebench_base_gen.py
│ │ │ │ │ └── needlebench_single.py
│ │ │ │ ├── readme.md
│ │ │ │ └── readme_zh-CN.md
│ │ │ ├── needlebench_v2/
│ │ │ │ ├── atc/
│ │ │ │ │ └── atc_0shot_nocot_2_power_en.py
│ │ │ │ ├── needlebench_v2_1000k/
│ │ │ │ │ ├── needlebench_v2_1000k.py
│ │ │ │ │ ├── needlebench_v2_multi_reasoning_1000k.py
│ │ │ │ │ ├── needlebench_v2_multi_retrieval_1000k.py
│ │ │ │ │ └── needlebench_v2_single_1000k.py
│ │ │ │ ├── needlebench_v2_128k/
│ │ │ │ │ ├── needlebench_v2_128k.py
│ │ │ │ │ ├── needlebench_v2_multi_reasoning_128k.py
│ │ │ │ │ ├── needlebench_v2_multi_retrieval_128k.py
│ │ │ │ │ └── needlebench_v2_single_128k.py
│ │ │ │ ├── needlebench_v2_200k/
│ │ │ │ │ ├── needlebench_v2_200k.py
│ │ │ │ │ ├── needlebench_v2_multi_reasoning_200k.py
│ │ │ │ │ ├── needlebench_v2_multi_retrieval_200k.py
│ │ │ │ │ └── needlebench_v2_single_200k.py
│ │ │ │ ├── needlebench_v2_256k/
│ │ │ │ │ ├── needlebench_v2_256k.py
│ │ │ │ │ ├── needlebench_v2_multi_reasoning_256k.py
│ │ │ │ │ ├── needlebench_v2_multi_retrieval_256k.py
│ │ │ │ │ └── needlebench_v2_single_256k.py
│ │ │ │ ├── needlebench_v2_32k/
│ │ │ │ │ ├── needlebench_v2_32k.py
│ │ │ │ │ ├── needlebench_v2_multi_reasoning_32k.py
│ │ │ │ │ ├── needlebench_v2_multi_retrieval_32k.py
│ │ │ │ │ └── needlebench_v2_single_32k.py
│ │ │ │ ├── needlebench_v2_4k/
│ │ │ │ │ ├── needlebench_v2_4k.py
│ │ │ │ │ ├── needlebench_v2_multi_reasoning_4k.py
│ │ │ │ │ ├── needlebench_v2_multi_retrieval_4k.py
│ │ │ │ │ └── needlebench_v2_single_4k.py
│ │ │ │ ├── needlebench_v2_8k/
│ │ │ │ │ ├── needlebench_v2_8k.py
│ │ │ │ │ ├── needlebench_v2_multi_reasoning_8k.py
│ │ │ │ │ ├── needlebench_v2_multi_retrieval_8k.py
│ │ │ │ │ ├── needlebench_v2_multi_retrieval_compare_batch_8k.py
│ │ │ │ │ └── needlebench_v2_single_8k.py
│ │ │ │ ├── readme.md
│ │ │ │ └── readme_zh-CN.md
│ │ │ ├── nejm_ai_benchmark/
│ │ │ │ ├── nejmaibench_gen.py
│ │ │ │ ├── nejmaibench_gen_60c8f5.py
│ │ │ │ ├── nejmaibench_llmjudge_gen.py
│ │ │ │ └── nejmaibench_llmjudge_gen_60c8f5.py
│ │ │ ├── nq/
│ │ │ │ ├── README.md
│ │ │ │ ├── nq_gen.py
│ │ │ │ ├── nq_gen_0356ec.py
│ │ │ │ ├── nq_gen_2463e2.py
│ │ │ │ ├── nq_gen_3dcea1.py
│ │ │ │ ├── nq_gen_68c1c6.py
│ │ │ │ ├── nq_gen_c788f6.py
│ │ │ │ ├── nq_open_1shot_gen_01cf41.py
│ │ │ │ ├── nq_open_1shot_gen_20a989.py
│ │ │ │ ├── nq_open_1shot_gen_2e45e5.py
│ │ │ │ ├── nq_open_gen_e93f8a.py
│ │ │ │ └── nq_xfinder_gen_3dcea1.py
│ │ │ ├── nq_cn/
│ │ │ │ ├── nqcn_gen.py
│ │ │ │ └── nqcn_gen_141737.py
│ │ │ ├── obqa/
│ │ │ │ ├── obqa_gen.py
│ │ │ │ ├── obqa_gen_9069e4.py
│ │ │ │ ├── obqa_ppl.py
│ │ │ │ ├── obqa_ppl_1defe8.py
│ │ │ │ ├── obqa_ppl_6aac9e.py
│ │ │ │ └── obqa_ppl_c7c154.py
│ │ │ ├── ojbench/
│ │ │ │ └── ojbench_gen.py
│ │ │ ├── omni_math/
│ │ │ │ ├── README.md
│ │ │ │ ├── omni_math_cascade_eval_gen_ccf9c0.py
│ │ │ │ ├── omni_math_gen.py
│ │ │ │ ├── omni_math_gen_18cc08.py
│ │ │ │ └── omni_math_llmverify_gen_ccf9c0.py
│ │ │ ├── openswi/
│ │ │ │ ├── openswi_gen.py
│ │ │ │ └── openswi_rawprompt_gen.py
│ │ │ ├── piqa/
│ │ │ │ ├── piqa_gen.py
│ │ │ │ ├── piqa_gen_1194eb.py
│ │ │ │ ├── piqa_ppl.py
│ │ │ │ ├── piqa_ppl_0cfff2.py
│ │ │ │ ├── piqa_ppl_1cf9f0.py
│ │ │ │ └── piqa_ppl_3431ea.py
│ │ │ ├── promptbench/
│ │ │ │ ├── promptbench_iwslt2017_gen_cbb8c8.py
│ │ │ │ ├── promptbench_math_gen_abf776.py
│ │ │ │ ├── promptbench_squad20_gen_b15d1c.py
│ │ │ │ └── promptbench_wnli_gen_50662f.py
│ │ │ ├── py150/
│ │ │ │ ├── py150_gen.py
│ │ │ │ └── py150_gen_38b13d.py
│ │ │ ├── qabench/
│ │ │ │ ├── qabench_gen.py
│ │ │ │ └── qabench_gen_353ae7.py
│ │ │ ├── qasper/
│ │ │ │ ├── qasper_gen.py
│ │ │ │ ├── qasper_gen_a2d88a.py
│ │ │ │ └── qasper_gen_db6413.py
│ │ │ ├── qaspercut/
│ │ │ │ ├── qaspercut_gen.py
│ │ │ │ ├── qaspercut_gen_a2d88a.py
│ │ │ │ └── qaspercut_gen_db6413.py
│ │ │ ├── race/
│ │ │ │ ├── README.md
│ │ │ │ ├── race_cot_gen_d95929.py
│ │ │ │ ├── race_few_shot_gen_a498ed.py
│ │ │ │ ├── race_few_shot_ppl.py
│ │ │ │ ├── race_gen.py
│ │ │ │ ├── race_gen_69ee4f.py
│ │ │ │ ├── race_gen_9302a5.py
│ │ │ │ ├── race_ppl.py
│ │ │ │ ├── race_ppl_5831a0.py
│ │ │ │ ├── race_ppl_a138cd.py
│ │ │ │ └── race_ppl_abed12.py
│ │ │ ├── realtoxicprompts/
│ │ │ │ ├── realtoxicprompts_gen.py
│ │ │ │ ├── realtoxicprompts_gen_7605e4.py
│ │ │ │ └── realtoxicprompts_gen_ac723c.py
│ │ │ ├── rolebench/
│ │ │ │ ├── instruction_generalization_eng.py
│ │ │ │ ├── instruction_generalization_zh.py
│ │ │ │ └── role_generalization_eng.py
│ │ │ ├── ruler/
│ │ │ │ ├── README.md
│ │ │ │ ├── ruler_128k_gen.py
│ │ │ │ ├── ruler_16k_gen.py
│ │ │ │ ├── ruler_1m_gen.py
│ │ │ │ ├── ruler_256k_gen.py
│ │ │ │ ├── ruler_32k_gen.py
│ │ │ │ ├── ruler_4k_gen.py
│ │ │ │ ├── ruler_512k_gen.py
│ │ │ │ ├── ruler_64k_gen.py
│ │ │ │ ├── ruler_8k_gen.py
│ │ │ │ ├── ruler_combined_gen.py
│ │ │ │ ├── ruler_cwe_gen.py
│ │ │ │ ├── ruler_fwe_gen.py
│ │ │ │ ├── ruler_niah_gen.py
│ │ │ │ ├── ruler_qa_gen.py
│ │ │ │ └── ruler_vt_gen.py
│ │ │ ├── s3eval/
│ │ │ │ ├── s3eval.md
│ │ │ │ ├── s3eval_gen.py
│ │ │ │ └── s3eval_gen_b8ac80.py
│ │ │ ├── safety/
│ │ │ │ ├── safety_gen.py
│ │ │ │ └── safety_gen_7ce197.py
│ │ │ ├── scibench/
│ │ │ │ ├── scibench_gen.py
│ │ │ │ └── scibench_gen_2b21f3.py
│ │ │ ├── scicode/
│ │ │ │ ├── README.md
│ │ │ │ ├── scicode_gen.py
│ │ │ │ ├── scicode_gen_085b98.py
│ │ │ │ ├── scicode_gen_62c139.py
│ │ │ │ └── scicode_wbg_gen_085b98.py
│ │ │ ├── siqa/
│ │ │ │ ├── siqa_gen.py
│ │ │ │ ├── siqa_gen_18632c.py
│ │ │ │ ├── siqa_gen_e78df3.py
│ │ │ │ ├── siqa_ppl.py
│ │ │ │ ├── siqa_ppl_42bc6e.py
│ │ │ │ ├── siqa_ppl_7845b0.py
│ │ │ │ ├── siqa_ppl_ced5f6.py
│ │ │ │ └── siqa_ppl_e8d8c5.py
│ │ │ ├── squad20/
│ │ │ │ ├── squad20_gen.py
│ │ │ │ └── squad20_gen_1710bc.py
│ │ │ ├── srbench/
│ │ │ │ ├── srbench_gen.py
│ │ │ │ └── srbench_rawprompt_gen.py
│ │ │ ├── storycloze/
│ │ │ │ ├── storycloze_gen.py
│ │ │ │ ├── storycloze_gen_7f656a.py
│ │ │ │ ├── storycloze_ppl.py
│ │ │ │ ├── storycloze_ppl_496661.py
│ │ │ │ └── storycloze_ppl_afd16f.py
│ │ │ ├── strategyqa/
│ │ │ │ ├── strategyqa_gen.py
│ │ │ │ ├── strategyqa_gen_1180a7.py
│ │ │ │ └── strategyqa_gen_934441.py
│ │ │ ├── subjective/
│ │ │ │ ├── alignbench/
│ │ │ │ │ ├── alignbench_judgeby_critiquellm.py
│ │ │ │ │ ├── alignbench_judgeby_critiquellm_new.py
│ │ │ │ │ ├── alignbench_v1_1_judgeby_critiquellm.py
│ │ │ │ │ └── alignbench_v1_1_judgeby_critiquellm_new.py
│ │ │ │ ├── alpaca_eval/
│ │ │ │ │ ├── alpacav2_judgeby_gpt4.py
│ │ │ │ │ ├── alpacav2_judgeby_gpt4_bradleyterry.py
│ │ │ │ │ └── alpacav2_judgeby_gpt4_new.py
│ │ │ │ ├── arena_hard/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── arena_hard_compare.py
│ │ │ │ │ ├── arena_hard_compare_bradleyterry.py
│ │ │ │ │ └── arena_hard_compare_new.py
│ │ │ │ ├── compass_arena_subjective_bench/
│ │ │ │ │ ├── README_pairwise_bt.md
│ │ │ │ │ ├── multiturn/
│ │ │ │ │ │ ├── pairwise_bt_judge.py
│ │ │ │ │ │ ├── pairwise_judge.py
│ │ │ │ │ │ └── pointwise_judge.py
│ │ │ │ │ └── singleturn/
│ │ │ │ │ ├── pairwise_bt_judge.py
│ │ │ │ │ ├── pairwise_judge.py
│ │ │ │ │ └── pointwise_judge.py
│ │ │ │ ├── compassarena/
│ │ │ │ │ ├── compassarena_compare.py
│ │ │ │ │ ├── compassarena_compare_bradleyterry.py
│ │ │ │ │ └── compassarena_compare_new.py
│ │ │ │ ├── compassbench/
│ │ │ │ │ ├── compassbench_checklist.py
│ │ │ │ │ ├── compassbench_compare.py
│ │ │ │ │ ├── compassbench_compare_v11.py
│ │ │ │ │ ├── compassbench_compare_v11_patch.py
│ │ │ │ │ └── compassbench_compare_v12.py
│ │ │ │ ├── flames/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── flames_gen.py
│ │ │ │ │ └── flames_gen_1a58bb.py
│ │ │ │ ├── fofo/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── fofo_bilingual_judge.py
│ │ │ │ │ ├── fofo_bilingual_judge_new.py
│ │ │ │ │ ├── fofo_judge.py
│ │ │ │ │ └── fofo_judge_new.py
│ │ │ │ ├── followbench/
│ │ │ │ │ ├── followbench_llmeval.py
│ │ │ │ │ └── followbench_llmeval_new.py
│ │ │ │ ├── hellobench/
│ │ │ │ │ ├── README.md
│ │ │ │ │ └── hellobench.py
│ │ │ │ ├── judgerbench/
│ │ │ │ │ └── judgerbench.py
│ │ │ │ ├── multiround/
│ │ │ │ │ ├── mtbench101_judge.py
│ │ │ │ │ ├── mtbench101_judge_new.py
│ │ │ │ │ ├── mtbench_single_judge_diff_temp.py
│ │ │ │ │ └── mtbench_single_judge_diff_temp_new.py
│ │ │ │ ├── wildbench/
│ │ │ │ │ ├── wildbench.md
│ │ │ │ │ ├── wildbench_pair_judge.py
│ │ │ │ │ ├── wildbench_pair_judge_bradleyterry.py
│ │ │ │ │ └── wildbench_pair_judge_new.py
│ │ │ │ └── writingbench/
│ │ │ │ └── writingbench_judge.py
│ │ │ ├── summedits/
│ │ │ │ ├── summedits_gen.py
│ │ │ │ ├── summedits_gen_315438.py
│ │ │ │ ├── summedits_gen_4fb38b.py
│ │ │ │ ├── summedits_ppl.py
│ │ │ │ ├── summedits_ppl_1fbeb6.py
│ │ │ │ ├── summedits_ppl_3c30d0.py
│ │ │ │ └── summedits_ppl_fa58ba.py
│ │ │ ├── summscreen/
│ │ │ │ ├── summscreen_gen.py
│ │ │ │ ├── summscreen_gen_653185.py
│ │ │ │ └── summscreen_gen_aa5eb3.py
│ │ │ ├── supergpqa/
│ │ │ │ ├── supergpqa_cascade_gen_1545c1.py
│ │ │ │ ├── supergpqa_cascade_rawprompt_gen_ca8345.py
│ │ │ │ ├── supergpqa_gen.py
│ │ │ │ ├── supergpqa_llmjudge_field_gen_1545c1.py
│ │ │ │ └── supergpqa_llmjudge_gen_12b8bc.py
│ │ │ ├── taco/
│ │ │ │ ├── README.md
│ │ │ │ ├── taco_gen.py
│ │ │ │ ├── taco_gen_c7893a.py
│ │ │ │ └── taco_levels_gen_411572.py
│ │ │ ├── teval/
│ │ │ │ ├── README.md
│ │ │ │ ├── teval_en_gen.py
│ │ │ │ ├── teval_en_gen_1ac254.py
│ │ │ │ ├── teval_zh_gen.py
│ │ │ │ └── teval_zh_gen_1ac254.py
│ │ │ ├── triviaqa/
│ │ │ │ ├── README.md
│ │ │ │ ├── triviaqa_gen.py
│ │ │ │ ├── triviaqa_gen_0356ec.py
│ │ │ │ ├── triviaqa_gen_2121ce.py
│ │ │ │ ├── triviaqa_gen_3e39a5.py
│ │ │ │ ├── triviaqa_gen_429db5.py
│ │ │ │ ├── triviaqa_gen_d297bb.py
│ │ │ │ ├── triviaqa_wiki_1shot_gen_20a989.py
│ │ │ │ ├── triviaqa_wiki_1shot_gen_bc5f21.py
│ │ │ │ ├── triviaqa_wiki_1shot_gen_c87d61.py
│ │ │ │ ├── triviaqa_wiki_1shot_gen_eaf81e.py
│ │ │ │ └── triviaqa_wiki_gen_d18bf4.py
│ │ │ ├── triviaqarc/
│ │ │ │ ├── triviaqarc_gen.py
│ │ │ │ ├── triviaqarc_gen_a2d88a.py
│ │ │ │ └── triviaqarc_gen_db6413.py
│ │ │ ├── truthfulqa/
│ │ │ │ ├── truthfulqa_gen.py
│ │ │ │ ├── truthfulqa_gen_1e7d8d.py
│ │ │ │ └── truthfulqa_gen_5ddc62.py
│ │ │ ├── tydiqa/
│ │ │ │ ├── tydiqa_gen.py
│ │ │ │ └── tydiqa_gen_978d2a.py
│ │ │ ├── wikibench/
│ │ │ │ ├── wikibench_few_shot_ppl_c23d79.py
│ │ │ │ ├── wikibench_gen.py
│ │ │ │ ├── wikibench_gen_0978ad.py
│ │ │ │ └── wikibench_gen_f96ece.py
│ │ │ ├── wikitext/
│ │ │ │ ├── wikitext_103_raw_ppl.py
│ │ │ │ ├── wikitext_103_raw_ppl_752e2a.py
│ │ │ │ ├── wikitext_2_raw_ppl.py
│ │ │ │ └── wikitext_2_raw_ppl_752e2a.py
│ │ │ ├── winograd/
│ │ │ │ ├── winograd_ppl.py
│ │ │ │ ├── winograd_ppl_8f3049.py
│ │ │ │ └── winograd_ppl_b6c7ed.py
│ │ │ ├── winogrande/
│ │ │ │ ├── README.md
│ │ │ │ ├── deprecated_winogrande_gen_a9ede5.py
│ │ │ │ ├── winogrande_5shot_gen_6447e6.py
│ │ │ │ ├── winogrande_5shot_gen_b36770.py
│ │ │ │ ├── winogrande_5shot_ll_252f01.py
│ │ │ │ ├── winogrande_gen.py
│ │ │ │ ├── winogrande_gen_458220.py
│ │ │ │ ├── winogrande_gen_a027b6.py
│ │ │ │ ├── winogrande_ll.py
│ │ │ │ ├── winogrande_ll_c5cf57.py
│ │ │ │ ├── winogrande_ppl_55a66e.py
│ │ │ │ └── winogrande_ppl_9307fd.py
│ │ │ └── xiezhi/
│ │ │ ├── xiezhi_gen.py
│ │ │ ├── xiezhi_gen_b86cf5.py
│ │ │ ├── xiezhi_ppl.py
│ │ │ └── xiezhi_ppl_ea6bd7.py
│ │ ├── models/
│ │ │ ├── accessory/
│ │ │ │ ├── accessory_llama2_7b.py
│ │ │ │ ├── accessory_mixtral_8x7b.py
│ │ │ │ └── accessory_sphinx_v2_1k.py
│ │ │ ├── alaya/
│ │ │ │ └── alaya.py
│ │ │ ├── aquila/
│ │ │ │ ├── hf_aquila2_34b.py
│ │ │ │ ├── hf_aquila2_7b.py
│ │ │ │ ├── hf_aquilachat2_34b.py
│ │ │ │ ├── hf_aquilachat2_34b_16k.py
│ │ │ │ ├── hf_aquilachat2_7b.py
│ │ │ │ └── hf_aquilachat2_7b_16k.py
│ │ │ ├── baichuan/
│ │ │ │ ├── hf_baichuan2_13b_base.py
│ │ │ │ ├── hf_baichuan2_13b_chat.py
│ │ │ │ ├── hf_baichuan2_7b_base.py
│ │ │ │ ├── hf_baichuan2_7b_chat.py
│ │ │ │ ├── hf_baichuan_13b_base.py
│ │ │ │ ├── hf_baichuan_13b_chat.py
│ │ │ │ ├── hf_baichuan_7b.py
│ │ │ │ ├── hf_baichuan_m1_14b_base.py
│ │ │ │ └── hf_baichuan_m1_14b_instruct.py
│ │ │ ├── bailing_api/
│ │ │ │ ├── bailing-lite-1116.py
│ │ │ │ └── bailing-pro-1120.py
│ │ │ ├── bluelm/
│ │ │ │ ├── bluelm_3b.py
│ │ │ │ ├── hf_bluelm_7b_base.py
│ │ │ │ ├── hf_bluelm_7b_base_32k.py
│ │ │ │ ├── hf_bluelm_7b_chat.py
│ │ │ │ └── hf_bluelm_7b_chat_32k.py
│ │ │ ├── chatglm/
│ │ │ │ ├── hf_chatglm2_6b.py
│ │ │ │ ├── hf_chatglm3_6b.py
│ │ │ │ ├── hf_chatglm3_6b_32k.py
│ │ │ │ ├── hf_chatglm3_6b_base.py
│ │ │ │ ├── hf_chatglm_6b.py
│ │ │ │ ├── hf_glm4_9b.py
│ │ │ │ ├── hf_glm4_9b_chat.py
│ │ │ │ ├── lmdeploy_glm4_9b.py
│ │ │ │ ├── lmdeploy_glm4_9b_chat.py
│ │ │ │ ├── vllm_chatglm3_6b.py
│ │ │ │ ├── vllm_chatglm3_6b_32k.py
│ │ │ │ └── vllm_glm4_9b_chat.py
│ │ │ ├── claude/
│ │ │ │ ├── claude.py
│ │ │ │ └── claude2.py
│ │ │ ├── codegeex2/
│ │ │ │ └── hf_codegeex2_6b.py
│ │ │ ├── codellama/
│ │ │ │ ├── hf_codellama_13b.py
│ │ │ │ ├── hf_codellama_13b_instruct.py
│ │ │ │ ├── hf_codellama_13b_python.py
│ │ │ │ ├── hf_codellama_34b.py
│ │ │ │ ├── hf_codellama_34b_instruct.py
│ │ │ │ ├── hf_codellama_34b_python.py
│ │ │ │ ├── hf_codellama_70b.py
│ │ │ │ ├── hf_codellama_70b_instruct.py
│ │ │ │ ├── hf_codellama_70b_python.py
│ │ │ │ ├── hf_codellama_7b.py
│ │ │ │ ├── hf_codellama_7b_instruct.py
│ │ │ │ └── hf_codellama_7b_python.py
│ │ │ ├── deepseek/
│ │ │ │ ├── deepseek_r1_streaming.py
│ │ │ │ ├── hf_deepseek_67b_base.py
│ │ │ │ ├── hf_deepseek_67b_chat.py
│ │ │ │ ├── hf_deepseek_7b_base.py
│ │ │ │ ├── hf_deepseek_7b_chat.py
│ │ │ │ ├── hf_deepseek_coder_1_3b_instruct.py
│ │ │ │ ├── hf_deepseek_coder_33b_instruct.py
│ │ │ │ ├── hf_deepseek_coder_6_7b_instruct.py
│ │ │ │ ├── hf_deepseek_moe_16b_base.py
│ │ │ │ ├── hf_deepseek_moe_16b_chat.py
│ │ │ │ ├── hf_deepseek_r1_distill_llama_70b.py
│ │ │ │ ├── hf_deepseek_r1_distill_llama_8b.py
│ │ │ │ ├── hf_deepseek_r1_distill_qwen_14b.py
│ │ │ │ ├── hf_deepseek_r1_distill_qwen_1_5b.py
│ │ │ │ ├── hf_deepseek_r1_distill_qwen_32b.py
│ │ │ │ ├── hf_deepseek_r1_distill_qwen_7b.py
│ │ │ │ ├── hf_deepseek_v2.py
│ │ │ │ ├── hf_deepseek_v2_chat.py
│ │ │ │ ├── hf_deepseek_v2_lite.py
│ │ │ │ ├── hf_deepseek_v2_lite_chat.py
│ │ │ │ ├── lmdeploy_deepseek_67b_base.py
│ │ │ │ ├── lmdeploy_deepseek_67b_chat.py
│ │ │ │ ├── lmdeploy_deepseek_7b_base.py
│ │ │ │ ├── lmdeploy_deepseek_7b_chat.py
│ │ │ │ ├── lmdeploy_deepseek_r1_distill_llama_70b.py
│ │ │ │ ├── lmdeploy_deepseek_r1_distill_llama_8b.py
│ │ │ │ ├── lmdeploy_deepseek_r1_distill_qwen_14b.py
│ │ │ │ ├── lmdeploy_deepseek_r1_distill_qwen_1_5b.py
│ │ │ │ ├── lmdeploy_deepseek_r1_distill_qwen_32b.py
│ │ │ │ ├── lmdeploy_deepseek_r1_distill_qwen_7b.py
│ │ │ │ ├── lmdeploy_deepseek_series.py
│ │ │ │ ├── lmdeploy_deepseek_v2.py
│ │ │ │ ├── lmdeploy_deepseek_v2_5.py
│ │ │ │ ├── lmdeploy_deepseek_v2_5_1210.py
│ │ │ │ ├── lmdeploy_deepseek_v2_lite.py
│ │ │ │ ├── vllm_deepseek_67b_chat.py
│ │ │ │ ├── vllm_deepseek_7b_chat.py
│ │ │ │ ├── vllm_deepseek_moe_16b_base.py
│ │ │ │ └── vllm_deepseek_moe_16b_chat.py
│ │ │ ├── falcon/
│ │ │ │ ├── hf_falcon_40b.py
│ │ │ │ └── hf_falcon_7b.py
│ │ │ ├── gemini/
│ │ │ │ ├── gemini_1_5_flash.py
│ │ │ │ ├── gemini_1_5_pro.py
│ │ │ │ └── gemini_pro.py
│ │ │ ├── gemma/
│ │ │ │ ├── hf_gemma2_27b.py
│ │ │ │ ├── hf_gemma2_27b_it.py
│ │ │ │ ├── hf_gemma2_2b.py
│ │ │ │ ├── hf_gemma2_2b_it.py
│ │ │ │ ├── hf_gemma2_9b.py
│ │ │ │ ├── hf_gemma2_9b_it.py
│ │ │ │ ├── hf_gemma_2b.py
│ │ │ │ ├── hf_gemma_2b_it.py
│ │ │ │ ├── hf_gemma_7b.py
│ │ │ │ ├── hf_gemma_7b_it.py
│ │ │ │ ├── lmdeploy_gemma_27b.py
│ │ │ │ ├── lmdeploy_gemma_27b_it.py
│ │ │ │ ├── lmdeploy_gemma_9b.py
│ │ │ │ ├── lmdeploy_gemma_9b_it.py
│ │ │ │ ├── vllm_gemma_2b.py
│ │ │ │ ├── vllm_gemma_2b_it.py
│ │ │ │ ├── vllm_gemma_3_12b_it.py
│ │ │ │ ├── vllm_gemma_3_27b_it.py
│ │ │ │ ├── vllm_gemma_3_4b_it.py
│ │ │ │ ├── vllm_gemma_7b.py
│ │ │ │ └── vllm_gemma_7b_it.py
│ │ │ ├── hf_internlm/
│ │ │ │ ├── README.md
│ │ │ │ ├── hf_internlm2_1_8b.py
│ │ │ │ ├── hf_internlm2_20b.py
│ │ │ │ ├── hf_internlm2_5_1_8b_chat.py
│ │ │ │ ├── hf_internlm2_5_20b_chat.py
│ │ │ │ ├── hf_internlm2_5_7b.py
│ │ │ │ ├── hf_internlm2_5_7b_chat.py
│ │ │ │ ├── hf_internlm2_7b.py
│ │ │ │ ├── hf_internlm2_base_20b.py
│ │ │ │ ├── hf_internlm2_base_7b.py
│ │ │ │ ├── hf_internlm2_chat_1_8b.py
│ │ │ │ ├── hf_internlm2_chat_1_8b_sft.py
│ │ │ │ ├── hf_internlm2_chat_20b.py
│ │ │ │ ├── hf_internlm2_chat_20b_sft.py
│ │ │ │ ├── hf_internlm2_chat_20b_with_system.py
│ │ │ │ ├── hf_internlm2_chat_7b.py
│ │ │ │ ├── hf_internlm2_chat_7b_sft.py
│ │ │ │ ├── hf_internlm2_chat_7b_with_system.py
│ │ │ │ ├── hf_internlm2_chat_math_20b.py
│ │ │ │ ├── hf_internlm2_chat_math_20b_with_system.py
│ │ │ │ ├── hf_internlm2_chat_math_7b.py
│ │ │ │ ├── hf_internlm2_chat_math_7b_with_system.py
│ │ │ │ ├── hf_internlm2_math_20b.py
│ │ │ │ ├── hf_internlm2_math_7b.py
│ │ │ │ ├── hf_internlm3_8b_instruct.py
│ │ │ │ ├── hf_internlm_20b.py
│ │ │ │ ├── hf_internlm_7b.py
│ │ │ │ ├── hf_internlm_chat_20b.py
│ │ │ │ ├── hf_internlm_chat_7b.py
│ │ │ │ ├── lmdeploy_internlm2_1_8b.py
│ │ │ │ ├── lmdeploy_internlm2_20b.py
│ │ │ │ ├── lmdeploy_internlm2_5_1_8b_chat.py
│ │ │ │ ├── lmdeploy_internlm2_5_20b_chat.py
│ │ │ │ ├── lmdeploy_internlm2_5_7b.py
│ │ │ │ ├── lmdeploy_internlm2_5_7b_chat.py
│ │ │ │ ├── lmdeploy_internlm2_5_7b_chat_1m.py
│ │ │ │ ├── lmdeploy_internlm2_7b.py
│ │ │ │ ├── lmdeploy_internlm2_base_20b.py
│ │ │ │ ├── lmdeploy_internlm2_base_7b.py
│ │ │ │ ├── lmdeploy_internlm2_chat_1_8b.py
│ │ │ │ ├── lmdeploy_internlm2_chat_1_8b_sft.py
│ │ │ │ ├── lmdeploy_internlm2_chat_20b.py
│ │ │ │ ├── lmdeploy_internlm2_chat_20b_sft.py
│ │ │ │ ├── lmdeploy_internlm2_chat_7b.py
│ │ │ │ ├── lmdeploy_internlm2_chat_7b_sft.py
│ │ │ │ ├── lmdeploy_internlm2_series.py
│ │ │ │ ├── lmdeploy_internlm3_8b_instruct.py
│ │ │ │ ├── lmdeploy_internlm3_8b_instruct_128k.py
│ │ │ │ ├── lmdeploy_internlm_20b.py
│ │ │ │ ├── lmdeploy_internlm_7b.py
│ │ │ │ ├── lmdeploy_internlm_chat_20b.py
│ │ │ │ ├── lmdeploy_internlm_chat_7b.py
│ │ │ │ ├── lmdeploy_oreal_32b.py
│ │ │ │ ├── vllm_internlm2_chat_1_8b.py
│ │ │ │ ├── vllm_internlm2_chat_1_8b_sft.py
│ │ │ │ ├── vllm_internlm2_chat_20b.py
│ │ │ │ ├── vllm_internlm2_chat_20b_sft.py
│ │ │ │ ├── vllm_internlm2_chat_7b.py
│ │ │ │ ├── vllm_internlm2_chat_7b_sft.py
│ │ │ │ └── vllm_internlm2_series.py
│ │ │ ├── hf_llama/
│ │ │ │ ├── hf_llama2_13b.py
│ │ │ │ ├── hf_llama2_13b_chat.py
│ │ │ │ ├── hf_llama2_70b.py
│ │ │ │ ├── hf_llama2_70b_chat.py
│ │ │ │ ├── hf_llama2_7b.py
│ │ │ │ ├── hf_llama2_7b_chat.py
│ │ │ │ ├── hf_llama3_1_70b_instruct.py
│ │ │ │ ├── hf_llama3_1_8b.py
│ │ │ │ ├── hf_llama3_1_8b_instruct.py
│ │ │ │ ├── hf_llama3_2_3b_instruct.py
│ │ │ │ ├── hf_llama3_70b.py
│ │ │ │ ├── hf_llama3_70b_instruct.py
│ │ │ │ ├── hf_llama3_8b.py
│ │ │ │ ├── hf_llama3_8b_instruct.py
│ │ │ │ ├── hf_llama_13b.py
│ │ │ │ ├── hf_llama_30b.py
│ │ │ │ ├── hf_llama_65b.py
│ │ │ │ ├── hf_llama_7b.py
│ │ │ │ ├── lmdeploy_llama2_13b.py
│ │ │ │ ├── lmdeploy_llama2_13b_chat.py
│ │ │ │ ├── lmdeploy_llama2_70b.py
│ │ │ │ ├── lmdeploy_llama2_70b_chat.py
│ │ │ │ ├── lmdeploy_llama2_7b.py
│ │ │ │ ├── lmdeploy_llama2_7b_chat.py
│ │ │ │ ├── lmdeploy_llama3_1_70b_instruct.py
│ │ │ │ ├── lmdeploy_llama3_1_8b.py
│ │ │ │ ├── lmdeploy_llama3_1_8b_instruct.py
│ │ │ │ ├── lmdeploy_llama3_2_3b_instruct.py
│ │ │ │ ├── lmdeploy_llama3_3_70b_instruct.py
│ │ │ │ ├── lmdeploy_llama3_70b.py
│ │ │ │ ├── lmdeploy_llama3_70b_instruct.py
│ │ │ │ ├── lmdeploy_llama3_8b.py
│ │ │ │ ├── lmdeploy_llama3_8b_instruct.py
│ │ │ │ ├── lmdeploy_llama_13b.py
│ │ │ │ ├── lmdeploy_llama_30b.py
│ │ │ │ ├── lmdeploy_llama_65b.py
│ │ │ │ ├── lmdeploy_llama_7b.py
│ │ │ │ └── vllm_llama_series.py
│ │ │ ├── huatuogpt/
│ │ │ │ ├── hf_huatuogpt2_13b.py
│ │ │ │ ├── hf_huatuogpt2_7b.py
│ │ │ │ ├── hf_huatuogpt_o1_7b.py
│ │ │ │ └── hf_huatuogpt_o1_8b.py
│ │ │ ├── internlm/
│ │ │ │ └── internlm_7b.py
│ │ │ ├── interns1/
│ │ │ │ └── intern_s1.py
│ │ │ ├── internvl/
│ │ │ │ ├── lmdeploy_internvl_2_5_38b.py
│ │ │ │ └── lmdeploy_internvl_2_5_8b.py
│ │ │ ├── judge_llm/
│ │ │ │ ├── auto_j/
│ │ │ │ │ ├── hf_autoj_bilingual_6b.py
│ │ │ │ │ ├── hf_autoj_eng_13b.py
│ │ │ │ │ ├── hf_autoj_eng_13b_4bit.py
│ │ │ │ │ └── hf_autoj_scen_classifier.py
│ │ │ │ ├── judgelm/
│ │ │ │ │ ├── hf_judgelm_13b_v1.py
│ │ │ │ │ ├── hf_judgelm_33b_v1.py
│ │ │ │ │ └── hf_judgelm_7b_v1.py
│ │ │ │ └── pandalm/
│ │ │ │ ├── hf_alpaca_pandalm_7b_v1.py
│ │ │ │ └── hf_pandalm_7b_v1.py
│ │ │ ├── lemur/
│ │ │ │ └── lemur_70b_chat.py
│ │ │ ├── lingowhale/
│ │ │ │ └── hf_lingowhale_8b.py
│ │ │ ├── mistral/
│ │ │ │ ├── hf_ministral_8b_instruct_2410.py
│ │ │ │ ├── hf_mistral_7b_instruct_v0_1.py
│ │ │ │ ├── hf_mistral_7b_instruct_v0_2.py
│ │ │ │ ├── hf_mistral_7b_instruct_v0_3.py
│ │ │ │ ├── hf_mistral_7b_v0_1.py
│ │ │ │ ├── hf_mistral_7b_v0_2.py
│ │ │ │ ├── hf_mistral_7b_v0_3.py
│ │ │ │ ├── hf_mistral_nemo_instruct_2407.py
│ │ │ │ ├── hf_mistral_small_instruct_2409.py
│ │ │ │ ├── hf_mixtral_8x22b_instruct_v0_1.py
│ │ │ │ ├── hf_mixtral_8x22b_v0_1.py
│ │ │ │ ├── hf_mixtral_8x7b_instruct_v0_1.py
│ │ │ │ ├── hf_mixtral_8x7b_v0_1.py
│ │ │ │ ├── lmdeploy_ministral_8b_instruct_2410.py
│ │ │ │ ├── lmdeploy_mistral_7b_instruct_v0_3.py
│ │ │ │ ├── lmdeploy_mistral_large_instruct_2411.py
│ │ │ │ ├── lmdeploy_mistral_nemo_instruct_2407.py
│ │ │ │ ├── lmdeploy_mistral_small_instruct_2409.py
│ │ │ │ ├── lmdeploy_mixtral_8x22b_instruct_v0_1.py
│ │ │ │ ├── lmdeploy_mixtral_large_instruct_2407.py
│ │ │ │ ├── mixtral_8x7b_32k.py
│ │ │ │ ├── vllm_mistral_7b_instruct_v0_1.py
│ │ │ │ ├── vllm_mistral_7b_instruct_v0_2.py
│ │ │ │ ├── vllm_mistral_7b_v0_1.py
│ │ │ │ ├── vllm_mistral_7b_v0_2.py
│ │ │ │ ├── vllm_mixtral_8x22b_instruct_v0_1.py
│ │ │ │ ├── vllm_mixtral_8x22b_v0_1.py
│ │ │ │ ├── vllm_mixtral_8x7b_instruct_v0_1.py
│ │ │ │ ├── vllm_mixtral_8x7b_v0_1.py
│ │ │ │ └── vllm_mixtral_large_instruct_2407.py
│ │ │ ├── moonshot/
│ │ │ │ ├── kimi_k2.py
│ │ │ │ └── kimi_k2_streaming.py
│ │ │ ├── moss/
│ │ │ │ ├── hf_moss_moon_003_base.py
│ │ │ │ └── hf_moss_moon_003_sft.py
│ │ │ ├── mpt/
│ │ │ │ ├── hf_mpt_7b.py
│ │ │ │ └── hf_mpt_instruct_7b.py
│ │ │ ├── ms_internlm/
│ │ │ │ └── ms_internlm_chat_7b_8k.py
│ │ │ ├── nanbeige/
│ │ │ │ ├── hf_nanbeige2_16b_chat.py
│ │ │ │ ├── hf_nanbeige2_8b_chat.py
│ │ │ │ └── hf_nanbeige_16b_chat.py
│ │ │ ├── nvidia/
│ │ │ │ └── lmdeploy_nemotron_70b_instruct_hf.py
│ │ │ ├── openai/
│ │ │ │ ├── gpt_3_5_turbo.py
│ │ │ │ ├── gpt_3_5_turbo_0125.py
│ │ │ │ ├── gpt_4.py
│ │ │ │ ├── gpt_4o_2024_05_13.py
│ │ │ │ ├── o1_mini_2024_09_12.py
│ │ │ │ └── o1_preview_2024_09_12.py
│ │ │ ├── openbmb/
│ │ │ │ ├── hf_minicpm3_4b.py
│ │ │ │ ├── hf_minicpm_2b_dpo_fp32.py
│ │ │ │ ├── hf_minicpm_2b_sft_bf16.py
│ │ │ │ └── hf_minicpm_2b_sft_fp32.py
│ │ │ ├── opt/
│ │ │ │ ├── hf_opt_125m.py
│ │ │ │ └── hf_opt_350m.py
│ │ │ ├── others/
│ │ │ │ ├── hf_abel_7b_001.py
│ │ │ │ ├── hf_abel_7b_002.py
│ │ │ │ ├── hf_arithmo_mistral_7b.py
│ │ │ │ ├── hf_command_r_plus.py
│ │ │ │ ├── hf_dbrx_base.py
│ │ │ │ ├── hf_dbrx_instruct.py
│ │ │ │ ├── hf_dolphin_21_mistral_7b.py
│ │ │ │ ├── hf_fashiongpt_70b_v11.py
│ │ │ │ ├── hf_gsm8k_rft_llama7b2_u13b.py
│ │ │ │ ├── hf_metamath_7b_v1_0.py
│ │ │ │ ├── hf_metamath_llemma_7b.py
│ │ │ │ ├── hf_metamath_mistral_7b.py
│ │ │ │ ├── hf_openchat_35_0106.py
│ │ │ │ ├── hf_openchat_35_1210.py
│ │ │ │ ├── hf_orionstar_14b_base.py
│ │ │ │ ├── hf_orionstar_yi_34b_chat.py
│ │ │ │ ├── hf_phi_2.py
│ │ │ │ ├── hf_telechat_12b_v2.py
│ │ │ │ ├── hf_telechat_52b.py
│ │ │ │ ├── hf_telechat_7b.py
│ │ │ │ ├── hf_yayi2_30b_base.py
│ │ │ │ ├── vllm_dbrx_instruct.py
│ │ │ │ └── vllm_orionstar_14b_longchat.py
│ │ │ ├── phi/
│ │ │ │ ├── hf_phi_3_5_MoE_instruct.py
│ │ │ │ ├── hf_phi_3_5_mini_instruct.py
│ │ │ │ ├── hf_phi_3_medium_4k_instruct.py
│ │ │ │ ├── hf_phi_3_mini_4k_instruct.py
│ │ │ │ ├── hf_phi_3_small_8k_instruct.py
│ │ │ │ └── hf_phi_4.py
│ │ │ ├── pulse/
│ │ │ │ └── hf_pulse_7b.py
│ │ │ ├── qwen/
│ │ │ │ ├── README.md
│ │ │ │ ├── hf_qwen1_5_0_5b.py
│ │ │ │ ├── hf_qwen1_5_0_5b_chat.py
│ │ │ │ ├── hf_qwen1_5_110b.py
│ │ │ │ ├── hf_qwen1_5_110b_chat.py
│ │ │ │ ├── hf_qwen1_5_14b.py
│ │ │ │ ├── hf_qwen1_5_14b_chat.py
│ │ │ │ ├── hf_qwen1_5_1_8b.py
│ │ │ │ ├── hf_qwen1_5_1_8b_chat.py
│ │ │ │ ├── hf_qwen1_5_32b.py
│ │ │ │ ├── hf_qwen1_5_32b_chat.py
│ │ │ │ ├── hf_qwen1_5_4b.py
│ │ │ │ ├── hf_qwen1_5_4b_chat.py
│ │ │ │ ├── hf_qwen1_5_72b.py
│ │ │ │ ├── hf_qwen1_5_72b_chat.py
│ │ │ │ ├── hf_qwen1_5_7b.py
│ │ │ │ ├── hf_qwen1_5_7b_chat.py
│ │ │ │ ├── hf_qwen1_5_moe_a2_7b.py
│ │ │ │ ├── hf_qwen1_5_moe_a2_7b_chat.py
│ │ │ │ ├── hf_qwen2_0_5b.py
│ │ │ │ ├── hf_qwen2_0_5b_instruct.py
│ │ │ │ ├── hf_qwen2_1_5b.py
│ │ │ │ ├── hf_qwen2_1_5b_instruct.py
│ │ │ │ ├── hf_qwen2_57b_a14b.py
│ │ │ │ ├── hf_qwen2_72b.py
│ │ │ │ ├── hf_qwen2_7b.py
│ │ │ │ ├── hf_qwen2_7b_instruct.py
│ │ │ │ ├── hf_qwen_14b.py
│ │ │ │ ├── hf_qwen_14b_chat.py
│ │ │ │ ├── hf_qwen_1_8b.py
│ │ │ │ ├── hf_qwen_1_8b_chat.py
│ │ │ │ ├── hf_qwen_72b.py
│ │ │ │ ├── hf_qwen_72b_chat.py
│ │ │ │ ├── hf_qwen_7b.py
│ │ │ │ ├── hf_qwen_7b_chat.py
│ │ │ │ ├── lmdeploy_qwen1_5_110b.py
│ │ │ │ ├── lmdeploy_qwen1_5_110b_chat.py
│ │ │ │ ├── lmdeploy_qwen1_5_14b.py
│ │ │ │ ├── lmdeploy_qwen1_5_14b_chat.py
│ │ │ │ ├── lmdeploy_qwen1_5_1_8b.py
│ │ │ │ ├── lmdeploy_qwen1_5_1_8b_chat.py
│ │ │ │ ├── lmdeploy_qwen1_5_32b.py
│ │ │ │ ├── lmdeploy_qwen1_5_32b_chat.py
│ │ │ │ ├── lmdeploy_qwen1_5_4b.py
│ │ │ │ ├── lmdeploy_qwen1_5_4b_chat.py
│ │ │ │ ├── lmdeploy_qwen1_5_72b.py
│ │ │ │ ├── lmdeploy_qwen1_5_72b_chat.py
│ │ │ │ ├── lmdeploy_qwen1_5_7b.py
│ │ │ │ ├── lmdeploy_qwen1_5_7b_chat.py
│ │ │ │ ├── lmdeploy_qwen1_5_series.py
│ │ │ │ ├── lmdeploy_qwen2_1_5b.py
│ │ │ │ ├── lmdeploy_qwen2_1_5b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_72b.py
│ │ │ │ ├── lmdeploy_qwen2_72b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_7b.py
│ │ │ │ ├── lmdeploy_qwen2_7b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_series.py
│ │ │ │ ├── lmdeploy_qwen_14b.py
│ │ │ │ ├── lmdeploy_qwen_14b_chat.py
│ │ │ │ ├── lmdeploy_qwen_1_8b.py
│ │ │ │ ├── lmdeploy_qwen_1_8b_chat.py
│ │ │ │ ├── lmdeploy_qwen_72b.py
│ │ │ │ ├── lmdeploy_qwen_72b_chat.py
│ │ │ │ ├── lmdeploy_qwen_7b.py
│ │ │ │ ├── lmdeploy_qwen_7b_chat.py
│ │ │ │ ├── lmdeploy_qwen_series.py
│ │ │ │ ├── ms_qwen_7b_chat.py
│ │ │ │ ├── vllm_qwen1_5_0_5b.py
│ │ │ │ ├── vllm_qwen1_5_0_5b_chat.py
│ │ │ │ ├── vllm_qwen1_5_110b.py
│ │ │ │ ├── vllm_qwen1_5_110b_chat.py
│ │ │ │ ├── vllm_qwen1_5_14b.py
│ │ │ │ ├── vllm_qwen1_5_14b_chat.py
│ │ │ │ ├── vllm_qwen1_5_1_8b.py
│ │ │ │ ├── vllm_qwen1_5_1_8b_chat.py
│ │ │ │ ├── vllm_qwen1_5_32b.py
│ │ │ │ ├── vllm_qwen1_5_32b_chat.py
│ │ │ │ ├── vllm_qwen1_5_4b.py
│ │ │ │ ├── vllm_qwen1_5_4b_chat.py
│ │ │ │ ├── vllm_qwen1_5_72b.py
│ │ │ │ ├── vllm_qwen1_5_72b_chat.py
│ │ │ │ ├── vllm_qwen1_5_7b.py
│ │ │ │ ├── vllm_qwen1_5_7b_chat.py
│ │ │ │ ├── vllm_qwen1_5_moe_a2_7b.py
│ │ │ │ ├── vllm_qwen1_5_moe_a2_7b_chat.py
│ │ │ │ ├── vllm_qwen1_5_series.py
│ │ │ │ ├── vllm_qwen2_0_5b.py
│ │ │ │ ├── vllm_qwen2_0_5b_instruct.py
│ │ │ │ ├── vllm_qwen2_1_5b.py
│ │ │ │ ├── vllm_qwen2_1_5b_instruct.py
│ │ │ │ ├── vllm_qwen2_57b_a14b_instruct.py
│ │ │ │ ├── vllm_qwen2_72b.py
│ │ │ │ ├── vllm_qwen2_72b_instruct.py
│ │ │ │ ├── vllm_qwen2_7b.py
│ │ │ │ ├── vllm_qwen2_7b_instruct.py
│ │ │ │ ├── vllm_qwen2_series.py
│ │ │ │ ├── vllm_qwen_14b.py
│ │ │ │ ├── vllm_qwen_14b_chat.py
│ │ │ │ ├── vllm_qwen_1_8b.py
│ │ │ │ ├── vllm_qwen_1_8b_chat.py
│ │ │ │ ├── vllm_qwen_72b.py
│ │ │ │ ├── vllm_qwen_72b_chat.py
│ │ │ │ ├── vllm_qwen_7b.py
│ │ │ │ ├── vllm_qwen_7b_chat.py
│ │ │ │ └── vllm_qwen_series.py
│ │ │ ├── qwen2_5/
│ │ │ │ ├── hf_qwen2_5_0_5b_instruct.py
│ │ │ │ ├── hf_qwen2_5_14b_instruct.py
│ │ │ │ ├── hf_qwen2_5_1_5b_instruct.py
│ │ │ │ ├── hf_qwen2_5_32b_instruct.py
│ │ │ │ ├── hf_qwen2_5_3b_instruct.py
│ │ │ │ ├── hf_qwen2_5_72b_instruct.py
│ │ │ │ ├── hf_qwen2_5_7b_instruct.py
│ │ │ │ ├── hf_qwen_2_5_14b.py
│ │ │ │ ├── hf_qwen_2_5_32b.py
│ │ │ │ ├── hf_qwen_2_5_7b.py
│ │ │ │ ├── lmdeploy_qwen2_5_0_5b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_5_14b.py
│ │ │ │ ├── lmdeploy_qwen2_5_14b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_5_1_5b.py
│ │ │ │ ├── lmdeploy_qwen2_5_1_5b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_5_32b.py
│ │ │ │ ├── lmdeploy_qwen2_5_32b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_5_3b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_5_72b.py
│ │ │ │ ├── lmdeploy_qwen2_5_72b_instruct.py
│ │ │ │ ├── lmdeploy_qwen2_5_7b.py
│ │ │ │ ├── lmdeploy_qwen2_5_7b_instruct.py
│ │ │ │ ├── vllm_qwen2_5_0_5b_instruct.py
│ │ │ │ ├── vllm_qwen2_5_14b_instruct.py
│ │ │ │ ├── vllm_qwen2_5_14b_instruct_128k.py
│ │ │ │ ├── vllm_qwen2_5_1_5b_instruct.py
│ │ │ │ ├── vllm_qwen2_5_32b_instruct.py
│ │ │ │ ├── vllm_qwen2_5_32b_instruct_128k.py
│ │ │ │ ├── vllm_qwen2_5_3b_instruct.py
│ │ │ │ ├── vllm_qwen2_5_72b_instruct.py
│ │ │ │ ├── vllm_qwen2_5_72b_instruct_128k.py
│ │ │ │ ├── vllm_qwen2_5_7b_instruct.py
│ │ │ │ └── vllm_qwen2_5_7b_instruct_128k.py
│ │ │ ├── qwen3/
│ │ │ │ └── lmdeploy_qwen3_0_6b.py
│ │ │ ├── qwq/
│ │ │ │ ├── lmdeploy_qwq_32b.py
│ │ │ │ └── lmdeploy_qwq_32b_preview.py
│ │ │ ├── rwkv/
│ │ │ │ └── rwkv5_3b.py
│ │ │ ├── skywork/
│ │ │ │ ├── hf_skywork_13b.py
│ │ │ │ └── lmdeploy_skywork_o1_open_llama3_1_8b_instruct.py
│ │ │ ├── telechat/
│ │ │ │ ├── telechat_thinking_streaming_v1.py
│ │ │ │ └── telechat_thinking_v1.py
│ │ │ ├── tigerbot/
│ │ │ │ ├── hf_tigerbot_13b_base_v1.py
│ │ │ │ ├── hf_tigerbot_13b_base_v2.py
│ │ │ │ ├── hf_tigerbot_13b_chat_v1.py
│ │ │ │ ├── hf_tigerbot_13b_chat_v2.py
│ │ │ │ ├── hf_tigerbot_70b_base.py
│ │ │ │ ├── hf_tigerbot_70b_chat_v2.py
│ │ │ │ ├── hf_tigerbot_70b_chat_v3.py
│ │ │ │ ├── hf_tigerbot_7b_base.py
│ │ │ │ ├── hf_tigerbot_7b_base_v3.py
│ │ │ │ ├── hf_tigerbot_7b_chat_v3.py
│ │ │ │ └── hf_tigerbot_7b_sft.py
│ │ │ ├── vicuna/
│ │ │ │ ├── hf_vicuna_13b_v13.py
│ │ │ │ ├── hf_vicuna_13b_v15.py
│ │ │ │ ├── hf_vicuna_13b_v15_16k.py
│ │ │ │ ├── hf_vicuna_33b_v13.py
│ │ │ │ ├── hf_vicuna_7b_v13.py
│ │ │ │ ├── hf_vicuna_7b_v15.py
│ │ │ │ ├── hf_vicuna_7b_v15_16k.py
│ │ │ │ ├── vllm_vicuna_13b_v15_16k.py
│ │ │ │ └── vllm_vicuna_7b_v15_16k.py
│ │ │ ├── wizardcoder/
│ │ │ │ ├── hf_wizardcoder_15b.py
│ │ │ │ ├── hf_wizardcoder_1b.py
│ │ │ │ ├── hf_wizardcoder_3b.py
│ │ │ │ ├── hf_wizardcoder_python_13b.py
│ │ │ │ └── hf_wizardcoder_python_34b.py
│ │ │ ├── wizardlm/
│ │ │ │ ├── hf_wizardlm_13b_v1_2.py
│ │ │ │ ├── hf_wizardlm_70b_v1_0.py
│ │ │ │ ├── hf_wizardlm_7b_v1_0.py
│ │ │ │ ├── hf_wizardmath_7b_v1_0.py
│ │ │ │ ├── hf_wizardmath_7b_v1_1.py
│ │ │ │ ├── vllm_wizardlm_13b_v1_2.py
│ │ │ │ ├── vllm_wizardlm_70b_v1_0.py
│ │ │ │ └── vllm_wizardlm_7b_v1_0.py
│ │ │ ├── yi/
│ │ │ │ ├── hf_yi_1_5_34b.py
│ │ │ │ ├── hf_yi_1_5_34b_chat.py
│ │ │ │ ├── hf_yi_1_5_6b.py
│ │ │ │ ├── hf_yi_1_5_6b_chat.py
│ │ │ │ ├── hf_yi_1_5_9b.py
│ │ │ │ ├── hf_yi_1_5_9b_chat.py
│ │ │ │ ├── hf_yi_34b.py
│ │ │ │ ├── hf_yi_34b_chat.py
│ │ │ │ ├── hf_yi_6b.py
│ │ │ │ ├── hf_yi_6b_chat.py
│ │ │ │ ├── lmdeploy_yi_1_5_34b_chat.py
│ │ │ │ ├── lmdeploy_yi_1_5_6b_chat.py
│ │ │ │ ├── lmdeploy_yi_1_5_9b.py
│ │ │ │ ├── lmdeploy_yi_1_5_9b_chat.py
│ │ │ │ ├── lmdeploy_yi_34b_chat.py
│ │ │ │ ├── lmdeploy_yi_6b_chat.py
│ │ │ │ └── lmdeploy_yi_series.py
│ │ │ └── zephyr/
│ │ │ ├── hf_zephyr_7b_beta.py
│ │ │ └── vllm_zephyr_7b_beta.py
│ │ └── summarizers/
│ │ ├── OlympiadBench.py
│ │ ├── PMMEval.py
│ │ ├── agent_bench.py
│ │ ├── charm_reason.py
│ │ ├── chat_OC15.py
│ │ ├── chat_OC15_multi_faceted.py
│ │ ├── cibench.py
│ │ ├── code_passk.py
│ │ ├── compassbench_v1_1_objective.py
│ │ ├── compassbench_v1_1_objective_public.py
│ │ ├── compassbench_v1_3_objective.py
│ │ ├── compassbench_v1_objective.py
│ │ ├── contamination.py
│ │ ├── example.py
│ │ ├── groups/
│ │ │ ├── GaokaoBench.py
│ │ │ ├── MMLUArabic.py
│ │ │ ├── OlympiadBench.py
│ │ │ ├── PHYSICS.py
│ │ │ ├── PMMEval.py
│ │ │ ├── agieval.py
│ │ │ ├── babilong.py
│ │ │ ├── bbeh.py
│ │ │ ├── bbh.py
│ │ │ ├── biodata.py
│ │ │ ├── calm.py
│ │ │ ├── ceval.py
│ │ │ ├── charm_reason.py
│ │ │ ├── cibench.py
│ │ │ ├── cmmlu.py
│ │ │ ├── ds1000.py
│ │ │ ├── flores.py
│ │ │ ├── humanevalx.py
│ │ │ ├── infinitebench.py
│ │ │ ├── jigsaw_multilingual.py
│ │ │ ├── korbench.py
│ │ │ ├── lawbench.py
│ │ │ ├── lcbench.py
│ │ │ ├── legacy/
│ │ │ │ └── cibench.py
│ │ │ ├── leval.py
│ │ │ ├── longbench.py
│ │ │ ├── lveval.py
│ │ │ ├── mathbench.py
│ │ │ ├── mathbench_2024.py
│ │ │ ├── mathbench_agent.py
│ │ │ ├── mathbench_v1.py
│ │ │ ├── mathbench_v1_2024.py
│ │ │ ├── mathbench_v1_2024_lang.py
│ │ │ ├── mgsm.py
│ │ │ ├── mmlu.py
│ │ │ ├── mmlu_cf.py
│ │ │ ├── mmlu_pro.py
│ │ │ ├── mmmlu.py
│ │ │ ├── multipl_e.py
│ │ │ ├── musr_average.py
│ │ │ ├── plugineval.py
│ │ │ ├── ruler.py
│ │ │ ├── scibench.py
│ │ │ ├── scicode.py
│ │ │ ├── supergpqa.py
│ │ │ ├── teval.py
│ │ │ ├── tydiqa.py
│ │ │ └── xiezhi.py
│ │ ├── infinitebench.py
│ │ ├── internlm2_keyset.py
│ │ ├── judgedataset_all.py
│ │ ├── judgerbenchv2.py
│ │ ├── lawbench.py
│ │ ├── leaderboard.py
│ │ ├── leval.py
│ │ ├── longbench.py
│ │ ├── longeval_v2.py
│ │ ├── lveval.py
│ │ ├── math_agent.py
│ │ ├── math_baseline.py
│ │ ├── mathbench.py
│ │ ├── mathbench_v1.py
│ │ ├── medium.py
│ │ ├── mmlu_cf.py
│ │ ├── mmlu_pro.py
│ │ ├── mmmlu.py
│ │ ├── mmmlu_lite.py
│ │ ├── needlebench.py
│ │ ├── plugineval.py
│ │ ├── rewardbench.py
│ │ ├── ruler.py
│ │ ├── scicode.py
│ │ ├── scireasoner.py
│ │ ├── simpleqa.py
│ │ ├── small.py
│ │ ├── subjective.py
│ │ ├── teval.py
│ │ └── tiny.py
│ ├── datasets/
│ │ ├── CARDBiomedBench.py
│ │ ├── ClinicBench.py
│ │ ├── Earth_Silver.py
│ │ ├── FinanceIQ.py
│ │ ├── GaokaoBench.py
│ │ ├── IFBench/
│ │ │ ├── evaluation_lib.py
│ │ │ ├── ifbench.py
│ │ │ ├── instructions.py
│ │ │ ├── instructions_registry.py
│ │ │ └── instructions_util.py
│ │ ├── IFEval/
│ │ │ ├── __init__.py
│ │ │ ├── evaluation_main.py
│ │ │ ├── ifeval.py
│ │ │ ├── instructions.py
│ │ │ ├── instructions_registry.py
│ │ │ └── instructions_util.py
│ │ ├── LCBench.py
│ │ ├── MMLUArabic.py
│ │ ├── MedCalc_Bench.py
│ │ ├── MedQA.py
│ │ ├── MedXpertQA.py
│ │ ├── Medbullets.py
│ │ ├── NPHardEval/
│ │ │ ├── __init__.py
│ │ │ ├── cmp_GCP_D.py
│ │ │ ├── cmp_KSP.py
│ │ │ ├── cmp_TSP_D.py
│ │ │ ├── hard_GCP.py
│ │ │ ├── hard_MSP.py
│ │ │ ├── hard_TSP.py
│ │ │ ├── p_BSP.py
│ │ │ ├── p_EDP.py
│ │ │ ├── p_SPP.py
│ │ │ ├── prompts.py
│ │ │ └── utils.py
│ │ ├── OlympiadBench.py
│ │ ├── OpenFinData.py
│ │ ├── OpenSWI.py
│ │ ├── PI_LLM.py
│ │ ├── PMMEval/
│ │ │ ├── __init__.py
│ │ │ ├── flores.py
│ │ │ ├── humanevalxl.py
│ │ │ ├── mgsm.py
│ │ │ ├── mhellaswag.py
│ │ │ ├── mifeval.py
│ │ │ ├── mifeval_utils/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── combination_checker.py
│ │ │ │ ├── detectable_content_checker.py
│ │ │ │ ├── detectable_format_checker.py
│ │ │ │ ├── keywords_checker.py
│ │ │ │ ├── length_constraints_checker.py
│ │ │ │ ├── punctuation_checker.py
│ │ │ │ └── startend_checker.py
│ │ │ ├── mlogiqa.py
│ │ │ ├── mmmlu.py
│ │ │ └── xnli.py
│ │ ├── ProcessBench.py
│ │ ├── ProteinLMBench.py
│ │ ├── PubMedQA.py
│ │ ├── QuALITY.py
│ │ ├── SciEval.py
│ │ ├── SciKnowEval.py
│ │ ├── SciReasoner/
│ │ │ ├── GUE.py
│ │ │ ├── LLM4Chem/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── config.py
│ │ │ │ ├── evaluator.py
│ │ │ │ ├── retrosynthesis_evaluator.py
│ │ │ │ └── utils/
│ │ │ │ ├── __input__.py
│ │ │ │ ├── chat_generation.py
│ │ │ │ ├── core_tagger.py
│ │ │ │ ├── general_prompter.py
│ │ │ │ ├── metrics.py
│ │ │ │ └── smiles_canonicalization.py
│ │ │ ├── LLM4Mat.py
│ │ │ ├── Mol_Instructions/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── biotext.py
│ │ │ │ ├── molecule.py
│ │ │ │ ├── normalized_SW_score.py
│ │ │ │ └── protein.py
│ │ │ ├── PEER.py
│ │ │ ├── __init__.py
│ │ │ ├── bio_instruction.py
│ │ │ ├── bulk_modulus_material.py
│ │ │ ├── composition_material.py
│ │ │ ├── opi/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── config.py
│ │ │ │ ├── evaluator.py
│ │ │ │ ├── process_ec_numbers.py
│ │ │ │ └── utils/
│ │ │ │ ├── accuracy4fold_type.py
│ │ │ │ └── metrics4all.py
│ │ │ ├── uncond_RNA.py
│ │ │ ├── uncond_material.py
│ │ │ ├── unconditional_molecule_generation/
│ │ │ │ ├── UMG.py
│ │ │ │ └── __init__.py
│ │ │ └── unconditional_protein_generation/
│ │ │ ├── UPG.py
│ │ │ ├── __init__.py
│ │ │ ├── main.py
│ │ │ └── omegafold/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── confidence.py
│ │ │ ├── config.py
│ │ │ ├── decode.py
│ │ │ ├── embedders.py
│ │ │ ├── geoformer.py
│ │ │ ├── model.py
│ │ │ ├── modules.py
│ │ │ ├── omegaplm.py
│ │ │ ├── pipeline.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ ├── protein_utils/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── aaframe.py
│ │ │ │ ├── functions.py
│ │ │ │ └── residue_constants.py
│ │ │ └── torch_utils.py
│ │ ├── ScienceQA.py
│ │ ├── SeedBench.py
│ │ ├── TheoremQA/
│ │ │ ├── __init__.py
│ │ │ ├── legacy.py
│ │ │ ├── main.py
│ │ │ ├── number_utils.py
│ │ │ └── utils.py
│ │ ├── __init__.py
│ │ ├── advglue.py
│ │ ├── afqmcd.py
│ │ ├── agieval/
│ │ │ ├── __init__.py
│ │ │ ├── agieval.py
│ │ │ ├── constructions.py
│ │ │ ├── dataset_loader.py
│ │ │ ├── evaluation.py
│ │ │ ├── math_equivalence.py
│ │ │ ├── post_process.py
│ │ │ └── utils.py
│ │ ├── aime2024.py
│ │ ├── anli.py
│ │ ├── anthropics_evals.py
│ │ ├── apps.py
│ │ ├── arc.py
│ │ ├── arc_prize_public_evaluation.py
│ │ ├── atlas/
│ │ │ ├── dataset_loader.py
│ │ │ ├── evaluation.py
│ │ │ └── prompt.py
│ │ ├── ax.py
│ │ ├── babilong/
│ │ │ ├── __init__.py
│ │ │ ├── babilong.py
│ │ │ ├── babilong_utils.py
│ │ │ └── prompts.py
│ │ ├── base.py
│ │ ├── bbeh.py
│ │ ├── bbh.py
│ │ ├── benbench.py
│ │ ├── beyondaime.py
│ │ ├── bigcodebench/
│ │ │ ├── __init__.py
│ │ │ ├── bigcodebench.py
│ │ │ └── extractor.py
│ │ ├── biodata.py
│ │ ├── boolq.py
│ │ ├── bustum.py
│ │ ├── c3.py
│ │ ├── calm/
│ │ │ ├── __init__.py
│ │ │ ├── calm.py
│ │ │ ├── data_processing/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── generate_questions.py
│ │ │ │ ├── prompt/
│ │ │ │ │ ├── AC-B_causal_judgement.py
│ │ │ │ │ ├── AR-B_CaLM-AR.py
│ │ │ │ │ ├── ATE.py
│ │ │ │ │ ├── BAS-B_backadj.py
│ │ │ │ │ ├── BAS-C_max-BAS.py
│ │ │ │ │ ├── BAS-C_min-BAS.py
│ │ │ │ │ ├── BAS-C_mix-BAS.py
│ │ │ │ │ ├── CA-B_FA.py
│ │ │ │ │ ├── CA-B_FP.py
│ │ │ │ │ ├── CB-B_collider-bias.py
│ │ │ │ │ ├── CDE.py
│ │ │ │ │ ├── CEG-O_E-CARE.py
│ │ │ │ │ ├── CEI-B.py
│ │ │ │ │ ├── CORR-B_correlation.py
│ │ │ │ │ ├── CR-B_det-counterfactual.py
│ │ │ │ │ ├── CR-C_CRASS.py
│ │ │ │ │ ├── EAE-B_exp-away.py
│ │ │ │ │ ├── ECI-B_CTB.py
│ │ │ │ │ ├── ECI-B_ESC.py
│ │ │ │ │ ├── ECI-B_MAVEN-ERE.py
│ │ │ │ │ ├── ETT.py
│ │ │ │ │ ├── FAS-C_FAS.py
│ │ │ │ │ ├── IV-C_CaLM-IV.py
│ │ │ │ │ ├── NDE.py
│ │ │ │ │ ├── NIE.py
│ │ │ │ │ ├── PCD-B_COPA.py
│ │ │ │ │ ├── PCD-B_E-CARE.py
│ │ │ │ │ ├── PCD-C_COPA.py
│ │ │ │ │ ├── PCD-C_E-CARE.py
│ │ │ │ │ ├── PN.py
│ │ │ │ │ └── PS.py
│ │ │ │ └── task_hiearchy.py
│ │ │ ├── evaluation/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── accuracy/
│ │ │ │ │ ├── choice.py
│ │ │ │ │ ├── open-ended.py
│ │ │ │ │ └── prob.py
│ │ │ │ ├── core_metrics.py
│ │ │ │ ├── error/
│ │ │ │ │ └── basic_adversarial/
│ │ │ │ │ ├── AC-B_causal_judgement.py
│ │ │ │ │ ├── AR-B_CaLM-AR.py
│ │ │ │ │ ├── AS.py
│ │ │ │ │ ├── CA-B.py
│ │ │ │ │ ├── CEI-B.py
│ │ │ │ │ ├── CLADDER.py
│ │ │ │ │ ├── CR-C_CRASS.py
│ │ │ │ │ ├── ECI.py
│ │ │ │ │ ├── Natural.py
│ │ │ │ │ ├── PCD-B.py
│ │ │ │ │ ├── PCD-C.py
│ │ │ │ │ └── Probability.py
│ │ │ │ ├── errors.py
│ │ │ │ └── labeling/
│ │ │ │ ├── AC-B_causal_judgement.py
│ │ │ │ ├── AR-B_CaLM-AR.py
│ │ │ │ ├── AS.py
│ │ │ │ ├── CA-B_FA.py
│ │ │ │ ├── CA-B_FP.py
│ │ │ │ ├── CEG-O_E-CARE.py
│ │ │ │ ├── CEI-B.py
│ │ │ │ ├── CLADDER.py
│ │ │ │ ├── CR-C_CRASS.py
│ │ │ │ ├── ECI.py
│ │ │ │ ├── Natural.py
│ │ │ │ ├── PCD-B.py
│ │ │ │ ├── PCD-C.py
│ │ │ │ ├── Probability.py
│ │ │ │ └── common_answers.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ └── load_items.py
│ │ ├── cb.py
│ │ ├── ceval.py
│ │ ├── charm.py
│ │ ├── chatml/
│ │ │ ├── __init__.py
│ │ │ ├── chatml.py
│ │ │ └── verification.py
│ │ ├── chem_exam.py
│ │ ├── chembench.py
│ │ ├── chid.py
│ │ ├── chinese_simpleqa.py
│ │ ├── cibench.py
│ │ ├── circular.py
│ │ ├── civilcomments.py
│ │ ├── climaqa.py
│ │ ├── clozeTest_maxmin.py
│ │ ├── cluewsc.py
│ │ ├── cmb.py
│ │ ├── cmmlu.py
│ │ ├── cmnli.py
│ │ ├── cmo_fib.py
│ │ ├── cmphysbench/
│ │ │ ├── SEED/
│ │ │ │ ├── README.md
│ │ │ │ ├── SEED.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── extended_zss.py
│ │ │ │ ├── latex_pre_process.py
│ │ │ │ └── test.py
│ │ │ ├── __init__.py
│ │ │ └── cmphysbench.py
│ │ ├── cmrc.py
│ │ ├── codecompass/
│ │ │ ├── CodeCompass.py
│ │ │ ├── __init__.py
│ │ │ ├── codecompass_runner.py
│ │ │ ├── evaluator.py
│ │ │ ├── executor.py
│ │ │ ├── metrics.py
│ │ │ └── utils.py
│ │ ├── commonsenseqa.py
│ │ ├── commonsenseqa_cn.py
│ │ ├── compassbench_obj.py
│ │ ├── copa.py
│ │ ├── crowspairs.py
│ │ ├── crowspairs_cn.py
│ │ ├── csl.py
│ │ ├── custom.py
│ │ ├── cvalues.py
│ │ ├── dingo.py
│ │ ├── drcd.py
│ │ ├── drop.py
│ │ ├── drop_simple_eval.py
│ │ ├── ds1000.py
│ │ ├── ds1000_interpreter.py
│ │ ├── eese/
│ │ │ ├── eese.py
│ │ │ ├── eese_postprocessors.py
│ │ │ └── utils.py
│ │ ├── eprstmt.py
│ │ ├── flores.py
│ │ ├── game24.py
│ │ ├── gaokao_math.py
│ │ ├── generic.py
│ │ ├── govrepcrs.py
│ │ ├── gpqa.py
│ │ ├── gsm8k.py
│ │ ├── gsm_hard.py
│ │ ├── healthbench/
│ │ │ ├── healthbench.py
│ │ │ ├── sampler/
│ │ │ │ └── chat_completion_sampler.py
│ │ │ └── types.py
│ │ ├── hellaswag.py
│ │ ├── hle.py
│ │ ├── huggingface.py
│ │ ├── humaneval.py
│ │ ├── humaneval_multi.py
│ │ ├── humaneval_pro.py
│ │ ├── humanevalx.py
│ │ ├── hungarian_math.py
│ │ ├── inference_ppl.py
│ │ ├── infinitebench/
│ │ │ ├── __init__.py
│ │ │ ├── infinitebench_codedebug.py
│ │ │ ├── infinitebench_coderun.py
│ │ │ ├── infinitebench_endia.py
│ │ │ ├── infinitebench_enmc.py
│ │ │ ├── infinitebench_enqa.py
│ │ │ ├── infinitebench_ensum.py
│ │ │ ├── infinitebench_mathcalc.py
│ │ │ ├── infinitebench_mathfind.py
│ │ │ ├── infinitebench_retrievekv.py
│ │ │ ├── infinitebench_retrievenumber.py
│ │ │ ├── infinitebench_retrievepasskey.py
│ │ │ ├── infinitebench_zhqa.py
│ │ │ └── utils.py
│ │ ├── internsandbox.py
│ │ ├── iwslt2017.py
│ │ ├── jigsawmultilingual.py
│ │ ├── jsonl.py
│ │ ├── judge/
│ │ │ ├── __init__.py
│ │ │ ├── judgebench.py
│ │ │ ├── judgerbenchv2.py
│ │ │ ├── rewardbench.py
│ │ │ └── rmb.py
│ │ ├── kaoshi.py
│ │ ├── kcle.py
│ │ ├── korbench/
│ │ │ ├── __init__.py
│ │ │ ├── korbench.py
│ │ │ ├── korbench_dataset_config/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── config.yaml
│ │ │ │ ├── config_wrapper.py
│ │ │ │ └── prompt/
│ │ │ │ ├── 0_shot.yaml
│ │ │ │ ├── 3_shot.yaml
│ │ │ │ ├── __init__.py
│ │ │ │ ├── mixed.yaml
│ │ │ │ ├── self-correction.yaml
│ │ │ │ └── trick.yaml
│ │ │ └── korbench_utils.py
│ │ ├── lambada.py
│ │ ├── lawbench/
│ │ │ ├── __init__.py
│ │ │ ├── evaluation_functions/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cjft.py
│ │ │ │ ├── flzx.py
│ │ │ │ ├── ftcs.py
│ │ │ │ ├── jdzy.py
│ │ │ │ ├── jec_ac.py
│ │ │ │ ├── jec_kd.py
│ │ │ │ ├── jetq.py
│ │ │ │ ├── lblj.py
│ │ │ │ ├── ljp_accusation.py
│ │ │ │ ├── ljp_article.py
│ │ │ │ ├── ljp_imprison.py
│ │ │ │ ├── sjjc.py
│ │ │ │ ├── wbfl.py
│ │ │ │ ├── wsjd.py
│ │ │ │ ├── xxcq.py
│ │ │ │ ├── ydlj.py
│ │ │ │ ├── yqzy.py
│ │ │ │ └── zxfl.py
│ │ │ ├── lawbench.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ ├── char_smi.py
│ │ │ ├── compare_m2_for_evaluation.py
│ │ │ ├── comprehension_scores.py
│ │ │ ├── function_utils.py
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── alignment.py
│ │ │ │ ├── annotator.py
│ │ │ │ ├── classifier.py
│ │ │ │ ├── merger.py
│ │ │ │ ├── tokenization.py
│ │ │ │ └── tokenizer.py
│ │ │ ├── parallel_to_m2.py
│ │ │ └── rc_f1.py
│ │ ├── lcsts.py
│ │ ├── leval/
│ │ │ ├── __init__.py
│ │ │ ├── evaluators.py
│ │ │ ├── leval_coursera.py
│ │ │ ├── leval_financial_qa.py
│ │ │ ├── leval_gov_report_summ.py
│ │ │ ├── leval_gsm100.py
│ │ │ ├── leval_legal_contract_qa.py
│ │ │ ├── leval_meeting_summ.py
│ │ │ ├── leval_multidoc_qa.py
│ │ │ ├── leval_narrattive_qa.py
│ │ │ ├── leval_natural_question.py
│ │ │ ├── leval_news_summ.py
│ │ │ ├── leval_paper_assistant.py
│ │ │ ├── leval_patent_summ.py
│ │ │ ├── leval_quality.py
│ │ │ ├── leval_review_summ.py
│ │ │ ├── leval_scientific_qa.py
│ │ │ ├── leval_topic_retrieval.py
│ │ │ ├── leval_tpo.py
│ │ │ └── leval_tvshow_summ.py
│ │ ├── livecodebench/
│ │ │ ├── __init__.py
│ │ │ ├── evaluator.py
│ │ │ ├── execute_utils.py
│ │ │ ├── extract_utils.py
│ │ │ ├── livecodebench.py
│ │ │ ├── pass_k_utils.py
│ │ │ ├── prompts.py
│ │ │ └── testing_util.py
│ │ ├── livecodebench_pro/
│ │ │ ├── __init__.py
│ │ │ ├── livecodebench_pro.py
│ │ │ └── livecodebench_pro_evaluator.py
│ │ ├── livemathbench/
│ │ │ ├── __init__.py
│ │ │ ├── livemathbench.py
│ │ │ ├── prompts.py
│ │ │ └── utils.py
│ │ ├── livereasonbench/
│ │ │ ├── __init__.py
│ │ │ └── livereasonbench.py
│ │ ├── livestembench.py
│ │ ├── llm_compression.py
│ │ ├── lmeval.py
│ │ ├── longbench/
│ │ │ ├── __init__.py
│ │ │ ├── evaluators.py
│ │ │ ├── longbench_2wikim_qa.py
│ │ │ ├── longbench_dureader.py
│ │ │ ├── longbench_gov_report.py
│ │ │ ├── longbench_hotpot_qa.py
│ │ │ ├── longbench_lcc.py
│ │ │ ├── longbench_lsht.py
│ │ │ ├── longbench_multi_news.py
│ │ │ ├── longbench_multifieldqa_en.py
│ │ │ ├── longbench_multifieldqa_zh.py
│ │ │ ├── longbench_musique.py
│ │ │ ├── longbench_narrative_qa.py
│ │ │ ├── longbench_passage_count.py
│ │ │ ├── longbench_passage_retrieval_en.py
│ │ │ ├── longbench_passage_retrieval_zh.py
│ │ │ ├── longbench_qasper.py
│ │ │ ├── longbench_qmsum.py
│ │ │ ├── longbench_repobench.py
│ │ │ ├── longbench_samsum.py
│ │ │ ├── longbench_trec.py
│ │ │ ├── longbench_trivia_qa.py
│ │ │ └── longbench_vcsum.py
│ │ ├── longbenchv2.py
│ │ ├── lveval/
│ │ │ ├── __init__.py
│ │ │ ├── evaluators.py
│ │ │ ├── lveval_cmrc_mixup.py
│ │ │ ├── lveval_dureader_mixup.py
│ │ │ ├── lveval_factrecall_en.py
│ │ │ ├── lveval_factrecall_zh.py
│ │ │ ├── lveval_hotpotwikiqa_mixup.py
│ │ │ ├── lveval_lic_mixup.py
│ │ │ ├── lveval_loogle_CR_mixup.py
│ │ │ ├── lveval_loogle_MIR_mixup.py
│ │ │ ├── lveval_loogle_SD_mixup.py
│ │ │ ├── lveval_multifieldqa_en_mixup.py
│ │ │ └── lveval_multifieldqa_zh_mixup.py
│ │ ├── mastermath2024v1.py
│ │ ├── matbench/
│ │ │ ├── __init__.py
│ │ │ ├── matbench.py
│ │ │ └── post_process.py
│ │ ├── math.py
│ │ ├── math401.py
│ │ ├── math_intern.py
│ │ ├── mathbench.py
│ │ ├── mbpp.py
│ │ ├── mbpp_pro.py
│ │ ├── medbench/
│ │ │ ├── __init__.py
│ │ │ ├── constructions.py
│ │ │ ├── dataset_loader.py
│ │ │ ├── evaluation.py
│ │ │ ├── math_equivalence.py
│ │ │ ├── medbench.py
│ │ │ ├── post_process.py
│ │ │ └── utils.py
│ │ ├── medmcqa.py
│ │ ├── mgsm.py
│ │ ├── mmlu.py
│ │ ├── mmlu_cf.py
│ │ ├── mmlu_pro.py
│ │ ├── mmmlu.py
│ │ ├── mol_instructions_chem.py
│ │ ├── multipl_e.py
│ │ ├── multirc.py
│ │ ├── musr/
│ │ │ ├── __init__.py
│ │ │ ├── murder_mystery_solved_ex.py
│ │ │ ├── musr.py
│ │ │ ├── object_placements_solved_ex.py
│ │ │ ├── team_allocation_solved_ex.py
│ │ │ └── tree.py
│ │ ├── narrativeqa.py
│ │ ├── natural_question.py
│ │ ├── natural_question_cn.py
│ │ ├── needlebench/
│ │ │ ├── __init__.py
│ │ │ ├── atc.py
│ │ │ ├── atc_choice.py
│ │ │ ├── multi.py
│ │ │ ├── origin.py
│ │ │ └── parallel.py
│ │ ├── needlebench_v2/
│ │ │ ├── __init__.py
│ │ │ ├── atc.py
│ │ │ ├── atc_elder_only.py
│ │ │ ├── multi.py
│ │ │ ├── origin.py
│ │ │ └── parallel.py
│ │ ├── nejmaibench.py
│ │ ├── obqa.py
│ │ ├── ojbench.py
│ │ ├── olymmath.py
│ │ ├── omni_math.py
│ │ ├── phybench/
│ │ │ ├── EED.py
│ │ │ ├── __init__.py
│ │ │ ├── box_extract.py
│ │ │ ├── extended_zss.py
│ │ │ ├── latex_pre_process.py
│ │ │ └── phybench.py
│ │ ├── physics.py
│ │ ├── piqa.py
│ │ ├── py150.py
│ │ ├── qasper.py
│ │ ├── qaspercut.py
│ │ ├── race.py
│ │ ├── rbench.py
│ │ ├── realtoxicprompts.py
│ │ ├── reasonbench/
│ │ │ ├── ReasonBenchDataset.py
│ │ │ └── __init__.py
│ │ ├── record.py
│ │ ├── rolebench.py
│ │ ├── ruler/
│ │ │ ├── __init__.py
│ │ │ ├── ruler_cwe.py
│ │ │ ├── ruler_fwe.py
│ │ │ ├── ruler_niah.py
│ │ │ ├── ruler_qa.py
│ │ │ └── ruler_vt.py
│ │ ├── s3eval.py
│ │ ├── safety.py
│ │ ├── scibench.py
│ │ ├── scicode.py
│ │ ├── simpleqa.py
│ │ ├── siqa.py
│ │ ├── smolinstruct.py
│ │ ├── squad20.py
│ │ ├── srbench.py
│ │ ├── storycloze.py
│ │ ├── strategyqa.py
│ │ ├── subjective/
│ │ │ ├── __init__.py
│ │ │ ├── alignbench.py
│ │ │ ├── alpacaeval.py
│ │ │ ├── arena_hard.py
│ │ │ ├── commonbench.py
│ │ │ ├── compass_arena.py
│ │ │ ├── compass_arena_subjective_bench.py
│ │ │ ├── compassbench.py
│ │ │ ├── compassbench_checklist.py
│ │ │ ├── compassbench_control_length_bias.py
│ │ │ ├── corev2.py
│ │ │ ├── creationbench.py
│ │ │ ├── flames.py
│ │ │ ├── fofo.py
│ │ │ ├── followbench.py
│ │ │ ├── hellobench.py
│ │ │ ├── judgerbench.py
│ │ │ ├── mtbench.py
│ │ │ ├── mtbench101.py
│ │ │ ├── multiround.py
│ │ │ ├── subjective_cmp.py
│ │ │ ├── utils.py
│ │ │ ├── wildbench.py
│ │ │ └── writingbench.py
│ │ ├── summedits.py
│ │ ├── summscreen.py
│ │ ├── supergpqa/
│ │ │ ├── __init__.py
│ │ │ ├── supergpqa.py
│ │ │ ├── supergpqa_dataset_config/
│ │ │ │ ├── config_default.yaml
│ │ │ │ ├── config_reasoning_models.yaml
│ │ │ │ ├── config_wrapper.py
│ │ │ │ └── prompt/
│ │ │ │ ├── five-shot.yaml
│ │ │ │ ├── robustness-exp.yaml
│ │ │ │ ├── zero-shot-with-subfield.yaml
│ │ │ │ └── zero-shot.yaml
│ │ │ ├── supergpqa_eval.py
│ │ │ └── supergpqa_utils.py
│ │ ├── svamp.py
│ │ ├── tabmwp.py
│ │ ├── taco.py
│ │ ├── teval/
│ │ │ ├── __init__.py
│ │ │ ├── evaluators/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── instruct_evaluator.py
│ │ │ │ ├── planning_evaluator.py
│ │ │ │ ├── reason_retrieve_understand_evaluator.py
│ │ │ │ └── review_evaluator.py
│ │ │ ├── schema.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ ├── convert_results.py
│ │ │ ├── format_load.py
│ │ │ ├── meta_template.py
│ │ │ └── template.py
│ │ ├── tnews.py
│ │ ├── triviaqa.py
│ │ ├── triviaqarc.py
│ │ ├── truthfulqa.py
│ │ ├── tydiqa.py
│ │ ├── wic.py
│ │ ├── wikibench.py
│ │ ├── winograd.py
│ │ ├── winogrande.py
│ │ ├── wnli.py
│ │ ├── wsc.py
│ │ ├── xcopa.py
│ │ ├── xiezhi.py
│ │ ├── xlsum.py
│ │ └── xsum.py
│ ├── evaluator/
│ │ ├── __init__.py
│ │ ├── cascade_evaluator.py
│ │ ├── generic_llm_evaluator.py
│ │ └── math_evaluator.py
│ ├── lagent/
│ │ ├── actions/
│ │ │ ├── ipython_interpreter.py
│ │ │ └── python_interpreter.py
│ │ └── agents/
│ │ └── react.py
│ ├── metrics/
│ │ ├── __init__.py
│ │ ├── dump_results.py
│ │ ├── mme_score.py
│ │ └── seedbench.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── accessory.py
│ │ ├── ai360_api.py
│ │ ├── alaya.py
│ │ ├── baichuan_api.py
│ │ ├── baidu_api.py
│ │ ├── bailing_api_oc.py
│ │ ├── base.py
│ │ ├── base_api.py
│ │ ├── bluelm_api.py
│ │ ├── bytedance_api.py
│ │ ├── claude_allesapin.py
│ │ ├── claude_api/
│ │ │ ├── __init__.py
│ │ │ ├── claude_api.py
│ │ │ └── postprocessors.py
│ │ ├── claude_sdk_api.py
│ │ ├── deepseek_api.py
│ │ ├── doubao.py
│ │ ├── doubao_api.py
│ │ ├── gemini_api.py
│ │ ├── glm.py
│ │ ├── huggingface.py
│ │ ├── huggingface_above_v4_33.py
│ │ ├── hunyuan_api.py
│ │ ├── intern_model.py
│ │ ├── interntrain.py
│ │ ├── krgpt_api.py
│ │ ├── lagent.py
│ │ ├── langchain.py
│ │ ├── lightllm_api.py
│ │ ├── llama2.py
│ │ ├── minimax_api.py
│ │ ├── mistral_api.py
│ │ ├── mixtral.py
│ │ ├── modelscope.py
│ │ ├── moonshot_api.py
│ │ ├── nanbeige_api.py
│ │ ├── openai_api.py
│ │ ├── openai_streaming.py
│ │ ├── pangu_api.py
│ │ ├── qwen_api.py
│ │ ├── rendu_api.py
│ │ ├── sensetime_api.py
│ │ ├── stepfun_api.py
│ │ ├── telechat_api/
│ │ │ ├── __init__.py
│ │ │ ├── telechat_api.py
│ │ │ ├── telechat_api_streaming.py
│ │ │ └── telechat_auth_sdk.py
│ │ ├── turbomind.py
│ │ ├── turbomind_api.py
│ │ ├── turbomind_with_tf_above_v4_33.py
│ │ ├── unigpt_api.py
│ │ ├── vllm.py
│ │ ├── vllm_with_tf_above_v4_33.py
│ │ ├── xunfei_api.py
│ │ ├── yayi_api.py
│ │ ├── yi_api.py
│ │ ├── zhipuai_api.py
│ │ └── zhipuai_v2_api.py
│ ├── openicl/
│ │ ├── __init__.py
│ │ ├── icl_dataset_reader.py
│ │ ├── icl_evaluator/
│ │ │ ├── __init__.py
│ │ │ ├── code_evaluator.py
│ │ │ ├── hf_metrics/
│ │ │ │ ├── accuracy.py
│ │ │ │ ├── rouge.py
│ │ │ │ ├── sacrebleu.py
│ │ │ │ └── squad.py
│ │ │ ├── icl_agent_evaluator.py
│ │ │ ├── icl_aucroc_evaluator.py
│ │ │ ├── icl_base_evaluator.py
│ │ │ ├── icl_bpc_evaluator.py
│ │ │ ├── icl_circular_evaluator.py
│ │ │ ├── icl_em_evaluator.py
│ │ │ ├── icl_hf_evaluator.py
│ │ │ ├── icl_jieba_rouge_evaluator.py
│ │ │ ├── icl_judge_evaluator.py
│ │ │ ├── icl_korbench_evaluator.py
│ │ │ ├── icl_misc_evaluator.py
│ │ │ ├── icl_plugin_evaluator.py
│ │ │ ├── icl_toxic_evaluator.py
│ │ │ ├── lm_evaluator.py
│ │ │ └── pi_llm_evaluator.py
│ │ ├── icl_inferencer/
│ │ │ ├── __init__.py
│ │ │ ├── icl_agent_inferencer.py
│ │ │ ├── icl_attack_inferencer.py
│ │ │ ├── icl_base_inferencer.py
│ │ │ ├── icl_chat_inferencer.py
│ │ │ ├── icl_chat_inferencer_parallel.py
│ │ │ ├── icl_chatml_inferencer.py
│ │ │ ├── icl_chatml_inferencer_parallel.py
│ │ │ ├── icl_clp_inferencer.py
│ │ │ ├── icl_gen_inferencer.py
│ │ │ ├── icl_gen_inferencer_parallel.py
│ │ │ ├── icl_inference_ppl_only_inferencer.py
│ │ │ ├── icl_ll_inferencer.py
│ │ │ ├── icl_mink_percent_inferencer.py
│ │ │ ├── icl_ppl_inferencer.py
│ │ │ ├── icl_ppl_only_inferencer.py
│ │ │ ├── icl_sc_inferencer.py
│ │ │ ├── icl_sw_ce_loss_inferencer.py
│ │ │ └── icl_tot_inferencer.py
│ │ ├── icl_prompt_template.py
│ │ ├── icl_raw_prompt_template.py
│ │ ├── icl_retriever/
│ │ │ ├── __init__.py
│ │ │ ├── icl_base_retriever.py
│ │ │ ├── icl_bm25_retriever.py
│ │ │ ├── icl_dpp_retriever.py
│ │ │ ├── icl_fix_k_retriever.py
│ │ │ ├── icl_mdl_retriever.py
│ │ │ ├── icl_random_retriever.py
│ │ │ ├── icl_sliding_k_retriever.py
│ │ │ ├── icl_topk_retriever.py
│ │ │ ├── icl_votek_retriever.py
│ │ │ └── icl_zero_retriever.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ └── logging.py
│ ├── partitioners/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── naive.py
│ │ ├── num_worker.py
│ │ ├── size.py
│ │ ├── sub_naive.py
│ │ ├── sub_num_worker.py
│ │ └── sub_size.py
│ ├── registry.py
│ ├── runners/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── dlc.py
│ │ ├── local.py
│ │ ├── local_api.py
│ │ ├── rjob.py
│ │ ├── slurm.py
│ │ ├── slurm_sequential.py
│ │ └── volc.py
│ ├── summarizers/
│ │ ├── __init__.py
│ │ ├── circular.py
│ │ ├── default.py
│ │ ├── default_subjective.py
│ │ ├── llm_compression.py
│ │ ├── multi_faceted.py
│ │ ├── multi_model.py
│ │ ├── needlebench.py
│ │ ├── subjective/
│ │ │ ├── __init__.py
│ │ │ ├── alignmentbench.py
│ │ │ ├── all_obj.py
│ │ │ ├── alpacaeval.py
│ │ │ ├── arenahard.py
│ │ │ ├── charm.py
│ │ │ ├── common_summarizer.py
│ │ │ ├── compass_arena.py
│ │ │ ├── compass_arena_bradley_terry.py
│ │ │ ├── compassbench.py
│ │ │ ├── compassbench_v13.py
│ │ │ ├── corev2.py
│ │ │ ├── creationbench.py
│ │ │ ├── flames.py
│ │ │ ├── fofo.py
│ │ │ ├── followbench.py
│ │ │ ├── mtbench.py
│ │ │ ├── mtbench101.py
│ │ │ ├── multiround.py
│ │ │ ├── qacompassbench.py
│ │ │ ├── subjective.py
│ │ │ ├── subjective_post_process.py
│ │ │ ├── utils.py
│ │ │ └── wildbench.py
│ │ └── summarizer_pretrain.py
│ ├── tasks/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── llm_eval.py
│ │ ├── openicl_attack.py
│ │ ├── openicl_eval.py
│ │ ├── openicl_eval_watch.py
│ │ ├── openicl_infer.py
│ │ ├── openicl_infer_concurrent.py
│ │ ├── outer_eval/
│ │ │ └── alpacaeval.py
│ │ └── subjective_eval.py
│ └── utils/
│ ├── __init__.py
│ ├── abbr.py
│ ├── auxiliary.py
│ ├── build.py
│ ├── collect_env.py
│ ├── datasets.py
│ ├── datasets_info.py
│ ├── dependency.py
│ ├── dict_postprocessors.py
│ ├── file.py
│ ├── fileio.py
│ ├── heartbeat.py
│ ├── infer_status.py
│ ├── lark.py
│ ├── logging.py
│ ├── menu.py
│ ├── network.py
│ ├── prompt.py
│ ├── result_station.py
│ ├── run.py
│ ├── text_postprocessors.py
│ └── types.py
├── run.py
├── setup.py
├── tests/
│ ├── TESTING_GUIDE.md
│ ├── TESTING_GUIDE_zh-CN.md
│ ├── datasets/
│ │ ├── test_aime2025.py
│ │ ├── test_aime2025_eval.py
│ │ ├── test_beyondaime.py
│ │ ├── test_humaneval.py
│ │ └── test_local_datasets.py
│ ├── models/
│ │ ├── test_base_model.py
│ │ ├── test_huggingface.py
│ │ ├── test_huggingface_above_v4_33.py
│ │ ├── test_openai_api.py
│ │ ├── test_openai_streaming.py
│ │ ├── test_turbomind.py
│ │ ├── test_turbomind_with_tf_above_v4_33.py
│ │ ├── test_vllm.py
│ │ └── test_vllm_with_tf_above_v4_33.py
│ ├── openicl/
│ │ ├── test_icl_chat_inferencer_parallel.py
│ │ ├── test_icl_chatml_inferencer_parallel.py
│ │ ├── test_icl_gen_inferencer_parallel.py
│ │ ├── test_prompt_template.py
│ │ └── test_raw_prompt_template.py
│ ├── partitioners/
│ │ ├── test_base_partitioner.py
│ │ └── test_naive.py
│ ├── prompt/
│ │ ├── test_api_template_parser.py
│ │ ├── test_lm_template_parser.py
│ │ └── test_prompt_list.py
│ ├── pytest.ini
│ ├── summarizers/
│ │ └── test_default.py
│ ├── tasks/
│ │ ├── test_base_task.py
│ │ ├── test_openicl_eval_watch.py
│ │ └── test_openicl_infer_concurrent.py
│ └── utils/
│ ├── test_heartbeat.py
│ ├── test_infer_status.py
│ └── test_text_postprocessors.py
└── tools/
├── case_analyzer.py
├── chatml_format_test.py
├── collect_code_preds.py
├── compare_configs.py
├── convert_alignmentbench.py
├── list_configs.py
├── prediction_merger.py
├── prompt_viewer.py
├── test_api_model.py
├── update_dataset_suffix.py
└── viz_multi_model.py
Showing preview only (473K chars total). Download the full file or copy to clipboard to get everything.
SYMBOL INDEX (5483 symbols across 811 files)
FILE: autotest/utils/compare_results.py
function compare_results (line 7) | def compare_results(folder1,
function compare_folders (line 31) | def compare_folders(folder1, folder2, results_ignore_list=None):
function get_all_subpaths (line 95) | def get_all_subpaths(directory):
FILE: autotest/utils/health_check.py
function health_check (line 7) | def health_check(url: str = 'http://0.0.0.0:23333', timeout: int = 300):
FILE: autotest/utils/oc_score_assert.py
function baseline_scores (line 11) | def baseline_scores(request):
function result_scores (line 20) | def result_scores():
class TestCmdCase (line 29) | class TestCmdCase:
method test_cmd_case1 (line 36) | def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
method test_cmd_case2 (line 50) | def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
method test_cmd_case3 (line 60) | def test_cmd_case3(self, baseline_scores, result_scores, model, dataset):
method test_cmd_case4 (line 71) | def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
method test_cmd_case5 (line 81) | def test_cmd_case5(self, baseline_scores, result_scores, model, dataset):
function assert_score (line 87) | def assert_score(model_type, score, baseline, dataset: str = ''):
function find_csv_files (line 132) | def find_csv_files(directory):
function read_csv_file (line 145) | def read_csv_file(file_path):
FILE: docs/en/conf.py
function get_version (line 33) | def get_version():
function builder_inited_handler (line 229) | def builder_inited_handler(app):
function setup (line 233) | def setup(app):
FILE: docs/en/statis.py
function table_format (line 42) | def table_format(data_list):
function generate_table (line 88) | def generate_table(data_list, title=None):
FILE: docs/zh_cn/conf.py
function get_version (line 33) | def get_version():
function builder_inited_handler (line 229) | def builder_inited_handler(app):
function setup (line 234) | def setup(app):
FILE: docs/zh_cn/statis.py
function table_format (line 40) | def table_format(data_list):
function generate_table (line 86) | def generate_table(data_list, title=None):
FILE: opencompass/cli/main.py
function _run_eval_tasks (line 23) | def _run_eval_tasks(runner, tasks):
function _is_eval_daemon (line 32) | def _is_eval_daemon(task_type) -> bool:
function parse_args (line 38) | def parse_args():
function parse_slurm_args (line 215) | def parse_slurm_args(slurm_parser):
function parse_dlc_args (line 233) | def parse_dlc_args(dlc_parser):
function parse_hf_args (line 241) | def parse_hf_args(hf_parser):
function parse_custom_dataset_args (line 261) | def parse_custom_dataset_args(custom_dataset_parser):
function main (line 273) | def main():
FILE: opencompass/configs/datasets/lveval/lvevalcmrc_mixup/lveval_cmrc_mixup_gen_465823.py
function get_dataset_names (line 36) | def get_dataset_names(dataset_name, length_levels):
FILE: opencompass/configs/datasets/lveval/lvevaldureader_mixup/lveval_dureader_mixup_gen_465823.py
function get_dataset_names (line 37) | def get_dataset_names(dataset_name, length_levels):
FILE: opencompass/configs/datasets/lveval/lvevalfactrecall_en/lveval_factrecall_en_gen_9a836f.py
function get_dataset_names (line 36) | def get_dataset_names(dataset_name, length_levels):
FILE: opencompass/configs/datasets/lveval/lvevalfactrecall_zh/lveval_factrecall_zh_gen_dbee70.py
function get_dataset_names (line 36) | def get_dataset_names(dataset_name, length_levels):
FILE: opencompass/configs/datasets/lveval/lvevalhotpotwikiqa_mixup/lveval_hotpotwikiqa_mixup_gen_77ce82.py
function get_dataset_names (line 39) | def get_dataset_names(dataset_name, length_levels):
FILE: opencompass/configs/datasets/lveval/lvevallic_mixup/lveval_lic_mixup_gen_01eb0c.py
function get_dataset_names (line 36) | def get_dataset_names(dataset_name, length_levels):
FILE: opencompass/configs/datasets/lveval/lvevalloogle_CR_mixup/lveval_loogle_CR_mixup_gen_d7ea36.py
function get_dataset_names (line 36) | def get_dataset_names(dataset_name, length_levels):
FILE: opencompass/configs/datasets/lveval/lvevalloogle_MIR_mixup/lveval_loogle_MIR_mixup_gen_d7ea36.py
function get_dataset_names (line 36) | def get_dataset_names(dataset_name, length_levels):
FILE: opencompass/configs/datasets/lveval/lvevalloogle_SD_mixup/lveval_loogle_SD_mixup_gen_d7ea36.py
function get_dataset_names (line 36) | def get_dataset_names(dataset_name, length_levels):
FILE: opencompass/configs/datasets/lveval/lvevalmultifieldqa_en_mixup/lveval_multifieldqa_en_mixup_gen_d7ea36.py
function get_dataset_names (line 39) | def get_dataset_names(dataset_name, length_levels):
FILE: opencompass/configs/datasets/lveval/lvevalmultifieldqa_zh_mixup/lveval_multifieldqa_zh_mixup_gen_0fbdad.py
function get_dataset_names (line 39) | def get_dataset_names(dataset_name, length_levels):
FILE: opencompass/configs/datasets/mmmlu/mmmlu_prompt.py
function get_few_shot_prompts_ar (line 5) | def get_few_shot_prompts_ar(_hint, _prompt):
function get_few_shot_prompts_bn (line 21) | def get_few_shot_prompts_bn(_hint, _prompt):
function get_few_shot_prompts_de (line 36) | def get_few_shot_prompts_de(_hint, _prompt):
function get_few_shot_prompts_es (line 51) | def get_few_shot_prompts_es(_hint, _prompt):
function get_few_shot_prompts_fr (line 66) | def get_few_shot_prompts_fr(_hint, _prompt):
function get_few_shot_prompts_hi (line 81) | def get_few_shot_prompts_hi(_hint, _prompt):
function get_few_shot_prompts_id (line 96) | def get_few_shot_prompts_id(_hint, _prompt):
function get_few_shot_prompts_it (line 111) | def get_few_shot_prompts_it(_hint, _prompt):
function get_few_shot_prompts_ja (line 126) | def get_few_shot_prompts_ja(_hint, _prompt):
function get_few_shot_prompts_ko (line 141) | def get_few_shot_prompts_ko(_hint, _prompt):
function get_few_shot_prompts_pt (line 156) | def get_few_shot_prompts_pt(_hint, _prompt):
function get_few_shot_prompts_zh (line 171) | def get_few_shot_prompts_zh(_hint, _prompt):
function get_few_shot_prompts_sw (line 186) | def get_few_shot_prompts_sw(_hint, _prompt):
function get_few_shot_prompts_yo (line 200) | def get_few_shot_prompts_yo(_hint, _prompt):
FILE: opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning_1000k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval_1000k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_single_1000k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_128k/needlebench_multi_reasoning_128k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_128k/needlebench_multi_retrieval_128k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_128k/needlebench_single_128k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_200k/needlebench_multi_reasoning_200k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_200k/needlebench_multi_retrieval_200k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_200k/needlebench_single_200k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_256k/needlebench_multi_reasoning_256k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_256k/needlebench_multi_retrieval_256k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_256k/needlebench_single_256k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_32k/needlebench_multi_reasoning_32k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_32k/needlebench_multi_retrieval_32k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_32k/needlebench_single_32k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_4k/needlebench_multi_reasoning_4k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_4k/needlebench_multi_retrieval_4k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_4k/needlebench_single_4k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_multi_reasoning_8k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_8k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_multi_retrieval_compare_batch_8k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_8k/needlebench_single_8k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench/needlebench_base/needlebench_single.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/datasets/needlebench_v2/needlebench_v2_8k/needlebench_v2_multi_retrieval_compare_batch_8k.py
function logistic (line 11) | def logistic(x, L=100, x0=50, k=0.1):
function generate_linear_space (line 15) | def generate_linear_space(start, end, num):
function generate_depth_percents (line 24) | def generate_depth_percents(intervals, interval_type):
FILE: opencompass/configs/summarizers/needlebench.py
function create_m_rs_names_list (line 5) | def create_m_rs_names_list(context_lengths, depths, needle_counts,
function create_summarizer (line 33) | def create_summarizer(context_lengths, depths, dataset_size,
FILE: opencompass/configs/summarizers/scireasoner.py
function calculate_opi (line 6) | def calculate_opi(scores):
function calculate_smol (line 14) | def calculate_smol(scores):
function calculate_mol (line 23) | def calculate_mol(scores):
function calculate_llm4mat (line 31) | def calculate_llm4mat(scores):
function calculate_unconditional_gen (line 37) | def calculate_unconditional_gen(scores):
class SciReasonerSummarizer (line 177) | class SciReasonerSummarizer(DefaultSummarizer):
method __init__ (line 178) | def __init__(self, mini_set=False, show_details=False, *args, **kwargs):
method _calculate_group_metrics (line 194) | def _calculate_group_metrics(self, raw_results, parsed_results, datase...
FILE: opencompass/datasets/CARDBiomedBench.py
function _parse (line 8) | def _parse(item, prompt_mode):
class CARDBiomedBenchDataset (line 18) | class CARDBiomedBenchDataset(BaseDataset):
method load (line 21) | def load(path: str, prompt_mode: str, **kwargs):
FILE: opencompass/datasets/ClinicBench.py
class ClinicBenchDataset (line 9) | class ClinicBenchDataset(BaseDataset):
method load_single (line 12) | def load_single(path):
method load (line 17) | def load(path):
FILE: opencompass/datasets/Earth_Silver.py
class Earth_Silver_MCQDataset (line 9) | class Earth_Silver_MCQDataset(BaseDataset):
method load (line 12) | def load(path: str, prompt_mode: str = 'zero-shot', **kwargs):
FILE: opencompass/datasets/FinanceIQ.py
class FinanceIQDataset (line 13) | class FinanceIQDataset(BaseDataset):
method load (line 21) | def load(path: str, name: str):
FILE: opencompass/datasets/GaokaoBench.py
class GaokaoBenchDataset (line 15) | class GaokaoBenchDataset(BaseDataset):
method load (line 18) | def load(path: str, filename: str, name: str):
class GaokaoBenchEvaluator (line 37) | class GaokaoBenchEvaluator(BaseEvaluator):
method __init__ (line 39) | def __init__(self, question_type) -> None:
method do_predictions_postprocess (line 44) | def do_predictions_postprocess(self, model_output, answer_lenth=None):
method ensure_same_length (line 91) | def ensure_same_length(self, pred, refr):
method score (line 96) | def score(self, predictions, references):
function _gaokao_register (line 152) | def _gaokao_register(question_type):
FILE: opencompass/datasets/IFBench/evaluation_lib.py
class InputExample (line 30) | class InputExample:
class OutputExample (line 38) | class OutputExample:
function read_prompt_list (line 46) | def read_prompt_list(input_jsonl_filename):
function write_outputs (line 60) | def write_outputs(output_jsonl_filename, outputs):
function test_instruction_following_strict (line 78) | def test_instruction_following_strict(
function test_instruction_following_loose (line 112) | def test_instruction_following_loose(
function read_prompt_to_response_dict (line 167) | def read_prompt_to_response_dict(input_jsonl_filename):
function print_report (line 177) | def print_report(outputs):
FILE: opencompass/datasets/IFBench/ifbench.py
class IFBenchEvaluator (line 7) | class IFBenchEvaluator(BaseEvaluator):
method score (line 9) | def score(self, predictions, references, origin_prompt):
FILE: opencompass/datasets/IFBench/instructions.py
class Instruction (line 86) | class Instruction:
method __init__ (line 89) | def __init__(self, instruction_id):
method build_description (line 92) | def build_description(self, **kwargs):
method get_instruction_args (line 95) | def get_instruction_args(self):
method get_instruction_args_keys (line 98) | def get_instruction_args_keys(self):
method check_following (line 101) | def check_following(self, value):
class WordCountRangeChecker (line 107) | class WordCountRangeChecker(Instruction):
method build_description (line 110) | def build_description(self, *, min_words=None, max_words=None):
method get_instruction_args (line 138) | def get_instruction_args(self):
method get_instruction_args_keys (line 142) | def get_instruction_args_keys(self):
method check_following (line 146) | def check_following(self, value):
class UniqueWordCountChecker (line 152) | class UniqueWordCountChecker(Instruction):
method build_description (line 155) | def build_description(self, *, N=None):
method get_instruction_args (line 175) | def get_instruction_args(self):
method get_instruction_args_keys (line 179) | def get_instruction_args_keys(self):
method check_following (line 183) | def check_following(self, value):
class StopWordPercentageChecker (line 193) | class StopWordPercentageChecker(Instruction):
method build_description (line 196) | def build_description(self, *, percentage=None):
method get_instruction_args (line 214) | def get_instruction_args(self):
method get_instruction_args_keys (line 218) | def get_instruction_args_keys(self):
method check_following (line 222) | def check_following(self, value):
class SentTypeRatioChecker (line 230) | class SentTypeRatioChecker(Instruction):
method build_description (line 233) | def build_description(self):
method get_instruction_args (line 239) | def get_instruction_args(self):
method get_instruction_args_keys (line 242) | def get_instruction_args_keys(self):
method check_following (line 246) | def check_following(self, value):
class SentBalanceChecker (line 257) | class SentBalanceChecker(Instruction):
method build_description (line 260) | def build_description(self):
method get_instruction_args (line 266) | def get_instruction_args(self):
method get_instruction_args_keys (line 269) | def get_instruction_args_keys(self):
method check_following (line 273) | def check_following(self, value):
class ConjunctionCountChecker (line 285) | class ConjunctionCountChecker(Instruction):
method build_description (line 288) | def build_description(self, *, small_n=None):
method get_instruction_args (line 306) | def get_instruction_args(self):
method get_instruction_args_keys (line 310) | def get_instruction_args_keys(self):
method check_following (line 314) | def check_following(self, value):
class PersonNameCountChecker (line 326) | class PersonNameCountChecker(Instruction):
method build_description (line 329) | def build_description(self, *, N=None):
method get_instruction_args (line 346) | def get_instruction_args(self):
method get_instruction_args_keys (line 350) | def get_instruction_args_keys(self):
method check_following (line 354) | def check_following(self, value):
class NGramOverlapChecker (line 406) | class NGramOverlapChecker(Instruction):
method build_description (line 409) | def build_description(self, *, reference_text=None, percentage=None):
method get_instruction_args (line 428) | def get_instruction_args(self):
method get_instruction_args_keys (line 432) | def get_instruction_args_keys(self):
method check_following (line 436) | def check_following(self, value):
class NumbersCountChecker (line 445) | class NumbersCountChecker(Instruction):
method build_description (line 448) | def build_description(self, *, N=None):
method get_instruction_args (line 465) | def get_instruction_args(self):
method get_instruction_args_keys (line 469) | def get_instruction_args_keys(self):
method check_following (line 473) | def check_following(self, value):
class AlphabetLoopChecker (line 481) | class AlphabetLoopChecker(Instruction):
method build_description (line 484) | def build_description(self):
method get_instruction_args (line 489) | def get_instruction_args(self):
method get_instruction_args_keys (line 493) | def get_instruction_args_keys(self):
method check_following (line 497) | def check_following(self, value):
class SingleVowelParagraphChecker (line 515) | class SingleVowelParagraphChecker(Instruction):
method build_description (line 518) | def build_description(self):
method get_instruction_args (line 523) | def get_instruction_args(self):
method get_instruction_args_keys (line 527) | def get_instruction_args_keys(self):
method check_following (line 531) | def check_following(self, value):
class ConsonantClusterChecker (line 543) | class ConsonantClusterChecker(Instruction):
method build_description (line 546) | def build_description(self):
method get_instruction_args (line 551) | def get_instruction_args(self):
method get_instruction_args_keys (line 555) | def get_instruction_args_keys(self):
method check_following (line 559) | def check_following(self, value):
class IncrementingAlliterationChecker (line 574) | class IncrementingAlliterationChecker(Instruction):
method build_description (line 577) | def build_description(self):
method get_instruction_args (line 583) | def get_instruction_args(self):
method get_instruction_args_keys (line 587) | def get_instruction_args_keys(self):
method check_following (line 591) | def check_following(self, value):
class PalindromeChecker (line 619) | class PalindromeChecker(Instruction):
method build_description (line 622) | def build_description(self):
method get_instruction_args (line 627) | def get_instruction_args(self):
method get_instruction_args_keys (line 631) | def get_instruction_args_keys(self):
method check_following (line 635) | def check_following(self, value):
class PunctuationCoverChecker (line 643) | class PunctuationCoverChecker(Instruction):
method build_description (line 646) | def build_description(self):
method get_instruction_args (line 651) | def get_instruction_args(self):
method get_instruction_args_keys (line 655) | def get_instruction_args_keys(self):
method check_following (line 659) | def check_following(self, value):
class NestedParenthesesChecker (line 673) | class NestedParenthesesChecker(Instruction):
method build_description (line 676) | def build_description(self):
method get_instruction_args (line 681) | def get_instruction_args(self):
method get_instruction_args_keys (line 685) | def get_instruction_args_keys(self):
method check_following (line 689) | def check_following(self, value):
class NestedQuotesChecker (line 719) | class NestedQuotesChecker(Instruction):
method build_description (line 722) | def build_description(self):
method get_instruction_args (line 727) | def get_instruction_args(self):
method get_instruction_args_keys (line 731) | def get_instruction_args_keys(self):
method check_following (line 735) | def check_following(self, value):
class PrimeLengthsChecker (line 756) | class PrimeLengthsChecker(Instruction):
method build_description (line 759) | def build_description(self):
method get_instruction_args (line 764) | def get_instruction_args(self):
method get_instruction_args_keys (line 768) | def get_instruction_args_keys(self):
method check_following (line 772) | def check_following(self, value):
class OptionsResponseChecker (line 783) | class OptionsResponseChecker(Instruction):
method build_description (line 786) | def build_description(self, *, options=None):
method get_instruction_args (line 817) | def get_instruction_args(self):
method get_instruction_args_keys (line 821) | def get_instruction_args_keys(self):
method check_following (line 825) | def check_following(self, value):
class NewLineWordsChecker (line 836) | class NewLineWordsChecker(Instruction):
method build_description (line 839) | def build_description(self):
method get_instruction_args (line 844) | def get_instruction_args(self):
method get_instruction_args_keys (line 848) | def get_instruction_args_keys(self):
method check_following (line 852) | def check_following(self, value):
class EmojiSentenceChecker (line 861) | class EmojiSentenceChecker(Instruction):
method build_description (line 864) | def build_description(self):
method get_instruction_args (line 870) | def get_instruction_args(self):
method get_instruction_args_keys (line 874) | def get_instruction_args_keys(self):
method check_following (line 878) | def check_following(self, value):
class CharacterCountUniqueWordsChecker (line 904) | class CharacterCountUniqueWordsChecker(Instruction):
method build_description (line 907) | def build_description(self):
method get_instruction_args (line 913) | def get_instruction_args(self):
method get_instruction_args_keys (line 917) | def get_instruction_args_keys(self):
method check_following (line 921) | def check_following(self, value):
class NthWordJapaneseChecker (line 933) | class NthWordJapaneseChecker(Instruction):
method build_description (line 936) | def build_description(self, *, N=None):
method get_instruction_args (line 959) | def get_instruction_args(self):
method get_instruction_args_keys (line 963) | def get_instruction_args_keys(self):
method check_following (line 967) | def check_following(self, value):
class StartWithVerbChecker (line 992) | class StartWithVerbChecker(Instruction):
method build_description (line 995) | def build_description(self):
method get_instruction_args (line 1001) | def get_instruction_args(self):
method get_instruction_args_keys (line 1005) | def get_instruction_args_keys(self):
method check_following (line 1009) | def check_following(self, value):
class LimitedWordRepeatChecker (line 1015) | class LimitedWordRepeatChecker(Instruction):
method build_description (line 1018) | def build_description(self, *, small_n=None):
method get_instruction_args (line 1035) | def get_instruction_args(self):
method get_instruction_args_keys (line 1039) | def get_instruction_args_keys(self):
method check_following (line 1043) | def check_following(self, value):
class IncludeKeywordChecker (line 1053) | class IncludeKeywordChecker(Instruction):
method build_description (line 1056) | def build_description(self, *, word=None, N=None):
method get_instruction_args (line 1083) | def get_instruction_args(self):
method get_instruction_args_keys (line 1087) | def get_instruction_args_keys(self):
method check_following (line 1091) | def check_following(self, value):
class PronounCountChecker (line 1099) | class PronounCountChecker(Instruction):
method build_description (line 1102) | def build_description(self, *, N=None):
method get_instruction_args (line 1119) | def get_instruction_args(self):
method get_instruction_args_keys (line 1123) | def get_instruction_args_keys(self):
method check_following (line 1127) | def check_following(self, value):
class AlternateParitySyllablesChecker (line 1141) | class AlternateParitySyllablesChecker(Instruction):
method build_description (line 1144) | def build_description(self):
method get_instruction_args (line 1149) | def get_instruction_args(self):
method get_instruction_args_keys (line 1153) | def get_instruction_args_keys(self):
method check_following (line 1157) | def check_following(self, value):
class LastWordFirstNextChecker (line 1164) | class LastWordFirstNextChecker(Instruction):
method build_description (line 1167) | def build_description(self):
method get_instruction_args (line 1173) | def get_instruction_args(self):
method get_instruction_args_keys (line 1177) | def get_instruction_args_keys(self):
method check_following (line 1181) | def check_following(self, value):
class ParagraphLastFirstWordMatchChecker (line 1192) | class ParagraphLastFirstWordMatchChecker(Instruction):
method build_description (line 1195) | def build_description(self):
method get_instruction_args (line 1200) | def get_instruction_args(self):
method get_instruction_args_keys (line 1204) | def get_instruction_args_keys(self):
method check_following (line 1208) | def check_following(self, value):
class IncrementingWordCountChecker (line 1223) | class IncrementingWordCountChecker(Instruction):
method build_description (line 1226) | def build_description(self, *, small_n=None):
method get_instruction_args (line 1245) | def get_instruction_args(self):
method get_instruction_args_keys (line 1249) | def get_instruction_args_keys(self):
method check_following (line 1253) | def check_following(self, value):
class NoConsecutiveFirstLetterChecker (line 1270) | class NoConsecutiveFirstLetterChecker(Instruction):
method build_description (line 1273) | def build_description(self):
method get_instruction_args (line 1278) | def get_instruction_args(self):
method get_instruction_args_keys (line 1282) | def get_instruction_args_keys(self):
method check_following (line 1286) | def check_following(self, value):
class IndentStairsChecker (line 1297) | class IndentStairsChecker(Instruction):
method build_description (line 1300) | def build_description(self):
method get_instruction_args (line 1305) | def get_instruction_args(self):
method get_instruction_args_keys (line 1309) | def get_instruction_args_keys(self):
method check_following (line 1313) | def check_following(self, value):
class QuoteExplanationChecker (line 1325) | class QuoteExplanationChecker(Instruction):
method build_description (line 1328) | def build_description(self):
method get_instruction_args (line 1333) | def get_instruction_args(self):
method get_instruction_args_keys (line 1337) | def get_instruction_args_keys(self):
method check_following (line 1341) | def check_following(self, value):
class SpecialBulletPointsChecker (line 1354) | class SpecialBulletPointsChecker(Instruction):
method build_description (line 1357) | def build_description(self, *, sep=None):
method get_instruction_args (line 1373) | def get_instruction_args(self):
method get_instruction_args_keys (line 1377) | def get_instruction_args_keys(self):
method check_following (line 1381) | def check_following(self, value):
class ItalicsThesisChecker (line 1386) | class ItalicsThesisChecker(Instruction):
method build_description (line 1389) | def build_description(self):
method get_instruction_args (line 1394) | def get_instruction_args(self):
method get_instruction_args_keys (line 1398) | def get_instruction_args_keys(self):
method check_following (line 1402) | def check_following(self, value):
class SubBulletPointsChecker (line 1423) | class SubBulletPointsChecker(Instruction):
method build_description (line 1426) | def build_description(self):
method get_instruction_args (line 1431) | def get_instruction_args(self):
method get_instruction_args_keys (line 1435) | def get_instruction_args_keys(self):
method check_following (line 1439) | def check_following(self, value):
class SomeBulletPointsChecker (line 1449) | class SomeBulletPointsChecker(Instruction):
method build_description (line 1452) | def build_description(self):
method get_instruction_args (line 1457) | def get_instruction_args(self):
method get_instruction_args_keys (line 1461) | def get_instruction_args_keys(self):
method check_following (line 1465) | def check_following(self, value):
class PrintMultiplesChecker (line 1486) | class PrintMultiplesChecker(Instruction):
method build_description (line 1489) | def build_description(self, **kwargs):
method get_instruction_args (line 1493) | def get_instruction_args(self):
method get_instruction_args_keys (line 1497) | def get_instruction_args_keys(self):
method check_following (line 1501) | def check_following(self, value):
class MultipleChoiceQuestionsChecker (line 1509) | class MultipleChoiceQuestionsChecker(Instruction):
method build_description (line 1512) | def build_description(self, **kwargs):
method get_instruction_args (line 1516) | def get_instruction_args(self):
method get_instruction_args_keys (line 1520) | def get_instruction_args_keys(self):
method check_following (line 1524) | def check_following(self, value):
class ReverseNewlineChecker (line 1557) | class ReverseNewlineChecker(Instruction):
method build_description (line 1560) | def build_description(self, **kwargs):
method get_instruction_args (line 1564) | def get_instruction_args(self):
method get_instruction_args_keys (line 1568) | def get_instruction_args_keys(self):
method check_following (line 1572) | def check_following(self, value):
class WordReverseOrderChecker (line 1619) | class WordReverseOrderChecker(Instruction):
method build_description (line 1622) | def build_description(self, **kwargs):
method get_instruction_args (line 1627) | def get_instruction_args(self):
method get_instruction_args_keys (line 1631) | def get_instruction_args_keys(self):
method check_following (line 1635) | def check_following(self, value):
class CharacterReverseOrderChecker (line 1644) | class CharacterReverseOrderChecker(Instruction):
method build_description (line 1647) | def build_description(self, **kwargs):
method get_instruction_args (line 1651) | def get_instruction_args(self):
method get_instruction_args_keys (line 1655) | def get_instruction_args_keys(self):
method check_following (line 1659) | def check_following(self, value):
class SentenceAlphabetChecker (line 1664) | class SentenceAlphabetChecker(Instruction):
method build_description (line 1667) | def build_description(self, **kwargs):
method get_instruction_args (line 1672) | def get_instruction_args(self):
method get_instruction_args_keys (line 1676) | def get_instruction_args_keys(self):
method check_following (line 1680) | def check_following(self, value):
class EuropeanCapitalsSortChecker (line 1690) | class EuropeanCapitalsSortChecker(Instruction):
method build_description (line 1693) | def build_description(self, **kwargs):
method get_instruction_args (line 1698) | def get_instruction_args(self):
method get_instruction_args_keys (line 1702) | def get_instruction_args_keys(self):
method check_following (line 1706) | def check_following(self, value):
class CityCSVChecker (line 1739) | class CityCSVChecker(Instruction):
method build_description (line 1742) | def build_description(self, **kwargs):
method get_instruction_args (line 1747) | def get_instruction_args(self):
method get_instruction_args_keys (line 1751) | def get_instruction_args_keys(self):
method check_following (line 1755) | def check_following(self, value):
class SpecialCharacterCSVChecker (line 1772) | class SpecialCharacterCSVChecker(Instruction):
method build_description (line 1775) | def build_description(self, **kwargs):
method get_instruction_args (line 1780) | def get_instruction_args(self):
method get_instruction_args_keys (line 1784) | def get_instruction_args_keys(self):
method check_following (line 1788) | def check_following(self, value):
class QuotesCSVChecker (line 1812) | class QuotesCSVChecker(Instruction):
method build_description (line 1815) | def build_description(self, **kwargs):
method get_instruction_args (line 1820) | def get_instruction_args(self):
method get_instruction_args_keys (line 1824) | def get_instruction_args_keys(self):
method check_following (line 1828) | def check_following(self, value):
class DateFormatListChecker (line 1852) | class DateFormatListChecker(Instruction):
method build_description (line 1855) | def build_description(self, **kwargs):
method get_instruction_args (line 1860) | def get_instruction_args(self):
method get_instruction_args_keys (line 1864) | def get_instruction_args_keys(self):
method check_following (line 1868) | def check_following(self, value):
class KeywordsMultipleChecker (line 1890) | class KeywordsMultipleChecker(Instruction):
method build_description (line 1893) | def build_description(self, *, keyword1=None, keyword2=None, keyword3=...
method get_instruction_args (line 1920) | def get_instruction_args(self):
method get_instruction_args_keys (line 1924) | def get_instruction_args_keys(self):
method check_following (line 1927) | def check_following(self, value):
class KeywordSpecificPositionChecker (line 1935) | class KeywordSpecificPositionChecker(Instruction):
method build_description (line 1938) | def build_description(self, keyword=None, n=None, m=None):
method get_instruction_args (line 1970) | def get_instruction_args(self):
method get_instruction_args_keys (line 1974) | def get_instruction_args_keys(self):
method check_following (line 1978) | def check_following(self, value):
class WordsPositionChecker (line 2000) | class WordsPositionChecker(Instruction):
method build_description (line 2003) | def build_description(self, *, keyword=None):
method get_instruction_args (line 2021) | def get_instruction_args(self):
method get_instruction_args_keys (line 2025) | def get_instruction_args_keys(self):
method check_following (line 2029) | def check_following(self, value):
class RepeatChangeChecker (line 2048) | class RepeatChangeChecker(Instruction):
method build_description (line 2051) | def build_description(self, *, prompt_to_repeat=None):
method get_instruction_args (line 2070) | def get_instruction_args(self):
method get_instruction_args_keys (line 2074) | def get_instruction_args_keys(self):
method check_following (line 2078) | def check_following(self, value):
class RepeatSimpleChecker (line 2096) | class RepeatSimpleChecker(Instruction):
method build_description (line 2099) | def build_description(self):
method get_instruction_args (line 2106) | def get_instruction_args(self):
method get_instruction_args_keys (line 2109) | def get_instruction_args_keys(self):
method check_following (line 2113) | def check_following(self, value):
class RepeatSpanChecker (line 2126) | class RepeatSpanChecker(Instruction):
method build_description (line 2129) | def build_description(self, prompt_to_repeat=None, n_start=None, n_end...
method get_instruction_args (line 2156) | def get_instruction_args(self):
method get_instruction_args_keys (line 2160) | def get_instruction_args_keys(self):
method check_following (line 2164) | def check_following(self, value):
class TitleCaseChecker (line 2171) | class TitleCaseChecker(Instruction):
method build_description (line 2174) | def build_description(self):
method get_instruction_args (line 2181) | def get_instruction_args(self):
method get_instruction_args_keys (line 2185) | def get_instruction_args_keys(self):
method check_following (line 2189) | def check_following(self, value):
class OutputTemplateChecker (line 2210) | class OutputTemplateChecker(Instruction):
method build_description (line 2213) | def build_description(self):
method get_instruction_args (line 2220) | def get_instruction_args(self):
method get_instruction_args_keys (line 2224) | def get_instruction_args_keys(self):
method check_following (line 2228) | def check_following(self, value):
class NoWhitespaceChecker (line 2244) | class NoWhitespaceChecker(Instruction):
method build_description (line 2247) | def build_description(self):
method get_instruction_args (line 2254) | def get_instruction_args(self):
method get_instruction_args_keys (line 2258) | def get_instruction_args_keys(self):
method check_following (line 2262) | def check_following(self, value):
FILE: opencompass/datasets/IFBench/instructions_util.py
function split_into_sentences (line 1579) | def split_into_sentences(text):
function count_words (line 1630) | def count_words(text):
function _get_sentence_tokenizer (line 1639) | def _get_sentence_tokenizer():
function count_stopwords (line 1643) | def count_stopwords(text):
function generate_keywords (line 1652) | def generate_keywords(num_keywords):
FILE: opencompass/datasets/IFEval/evaluation_main.py
class InputExample (line 44) | class InputExample:
class OutputExample (line 52) | class OutputExample:
function test_instruction_following_strict (line 60) | def test_instruction_following_strict(
function test_instruction_following_loose (line 91) | def test_instruction_following_loose(
FILE: opencompass/datasets/IFEval/ifeval.py
class IFEvalDataset (line 15) | class IFEvalDataset(BaseDataset):
method load (line 18) | def load(path):
class IFEvaluator (line 29) | class IFEvaluator(BaseEvaluator):
method score (line 31) | def score(self, predictions, references, origin_prompt):
FILE: opencompass/datasets/IFEval/instructions.py
class Instruction (line 98) | class Instruction:
method __init__ (line 101) | def __init__(self, instruction_id):
method build_description (line 104) | def build_description(self, **kwargs):
method get_instruction_args (line 107) | def get_instruction_args(self):
method get_instruction_args_keys (line 110) | def get_instruction_args_keys(self):
method check_following (line 114) | def check_following(self, value):
class ResponseLanguageChecker (line 118) | class ResponseLanguageChecker(Instruction):
method build_description (line 121) | def build_description(self, *, language=None):
method get_instruction_args (line 144) | def get_instruction_args(self):
method get_instruction_args_keys (line 148) | def get_instruction_args_keys(self):
method check_following (line 152) | def check_following(self, value):
class NumberOfSentences (line 173) | class NumberOfSentences(Instruction):
method build_description (line 176) | def build_description(self, *, num_sentences=None, relation=None):
method get_instruction_args (line 214) | def get_instruction_args(self):
method get_instruction_args_keys (line 221) | def get_instruction_args_keys(self):
method check_following (line 225) | def check_following(self, value):
class PlaceholderChecker (line 245) | class PlaceholderChecker(Instruction):
method build_description (line 248) | def build_description(self, *, num_placeholders=None):
method get_instruction_args (line 267) | def get_instruction_args(self):
method get_instruction_args_keys (line 271) | def get_instruction_args_keys(self):
method check_following (line 275) | def check_following(self, value):
class BulletListChecker (line 290) | class BulletListChecker(Instruction):
method build_description (line 293) | def build_description(self, *, num_bullets=None):
method get_instruction_args (line 312) | def get_instruction_args(self):
method get_instruction_args_keys (line 316) | def get_instruction_args_keys(self):
method check_following (line 320) | def check_following(self, value):
class ConstrainedResponseChecker (line 337) | class ConstrainedResponseChecker(Instruction):
method build_description (line 340) | def build_description(self):
method get_instruction_args (line 349) | def get_instruction_args(self):
method get_instruction_args_keys (line 353) | def get_instruction_args_keys(self):
method check_following (line 357) | def check_following(self, value):
class ConstrainedStartChecker (line 374) | class ConstrainedStartChecker(Instruction):
method build_description (line 377) | def build_description(self, *, starter=None):
method get_instruction_args (line 396) | def get_instruction_args(self):
method get_instruction_args_keys (line 400) | def get_instruction_args_keys(self):
method check_following (line 404) | def check_following(self, value):
class HighlightSectionChecker (line 422) | class HighlightSectionChecker(Instruction):
method build_description (line 425) | def build_description(self, *, num_highlights=None):
method get_instruction_args (line 446) | def get_instruction_args(self):
method get_instruction_args_keys (line 450) | def get_instruction_args_keys(self):
method check_following (line 454) | def check_following(self, value):
class SectionChecker (line 478) | class SectionChecker(Instruction):
method build_description (line 481) | def build_description(self, *, section_spliter=None, num_sections=None):
method get_instruction_args (line 511) | def get_instruction_args(self):
method get_instruction_args_keys (line 518) | def get_instruction_args_keys(self):
method check_following (line 522) | def check_following(self, value):
class ParagraphChecker (line 541) | class ParagraphChecker(Instruction):
method build_description (line 544) | def build_description(self, *, num_paragraphs=None):
method get_instruction_args (line 564) | def get_instruction_args(self):
method get_instruction_args_keys (line 568) | def get_instruction_args_keys(self):
method check_following (line 572) | def check_following(self, value):
class PostscriptChecker (line 596) | class PostscriptChecker(Instruction):
method build_description (line 599) | def build_description(self, *, postscript_marker=None):
method get_instruction_args (line 621) | def get_instruction_args(self):
method get_instruction_args_keys (line 625) | def get_instruction_args_keys(self):
method check_following (line 629) | def check_following(self, value):
class RephraseChecker (line 652) | class RephraseChecker(Instruction):
method build_description (line 655) | def build_description(self, *, original_message):
method get_instruction_args (line 679) | def get_instruction_args(self):
method get_instruction_args_keys (line 683) | def get_instruction_args_keys(self):
method check_following (line 687) | def check_following(self, value):
method is_change (line 709) | def is_change(self, response):
method strip_changes (line 714) | def strip_changes(self, response):
class KeywordChecker (line 719) | class KeywordChecker(Instruction):
method build_description (line 722) | def build_description(self, *, keywords=None):
method get_instruction_args (line 745) | def get_instruction_args(self):
method get_instruction_args_keys (line 749) | def get_instruction_args_keys(self):
method check_following (line 753) | def check_following(self, value):
class KeywordFrequencyChecker (line 761) | class KeywordFrequencyChecker(Instruction):
method build_description (line 764) | def build_description(self,
method get_instruction_args (line 812) | def get_instruction_args(self):
method get_instruction_args_keys (line 820) | def get_instruction_args_keys(self):
method check_following (line 824) | def check_following(self, value):
class NumberOfWords (line 836) | class NumberOfWords(Instruction):
method build_description (line 839) | def build_description(self, *, num_words=None, relation=None):
method get_instruction_args (line 875) | def get_instruction_args(self):
method get_instruction_args_keys (line 882) | def get_instruction_args_keys(self):
method check_following (line 886) | def check_following(self, value):
class JsonFormat (line 896) | class JsonFormat(Instruction):
method build_description (line 899) | def build_description(self):
method get_instruction_args (line 905) | def get_instruction_args(self):
method get_instruction_args_keys (line 909) | def get_instruction_args_keys(self):
method check_following (line 913) | def check_following(self, value):
class ParagraphFirstWordCheck (line 924) | class ParagraphFirstWordCheck(Instruction):
method build_description (line 927) | def build_description(self,
method get_instruction_args (line 970) | def get_instruction_args(self):
method get_instruction_args_keys (line 978) | def get_instruction_args_keys(self):
method check_following (line 982) | def check_following(self, value):
class KeySentenceChecker (line 1029) | class KeySentenceChecker(Instruction):
method build_description (line 1032) | def build_description(self, key_sentences=None, num_sentences=None):
method get_instruction_args (line 1064) | def get_instruction_args(self):
method get_instruction_args_keys (line 1071) | def get_instruction_args_keys(self):
method check_following (line 1075) | def check_following(self, value):
class ForbiddenWords (line 1086) | class ForbiddenWords(Instruction):
method build_description (line 1089) | def build_description(self, forbidden_words=None):
method get_instruction_args (line 1112) | def get_instruction_args(self):
method get_instruction_args_keys (line 1116) | def get_instruction_args_keys(self):
method check_following (line 1120) | def check_following(self, value):
class RephraseParagraph (line 1128) | class RephraseParagraph(Instruction):
method build_description (line 1131) | def build_description(self, *, original_paragraph, low, high):
method get_instruction_args (line 1160) | def get_instruction_args(self):
method get_instruction_args_keys (line 1168) | def get_instruction_args_keys(self):
method check_following (line 1172) | def check_following(self, value):
class TwoResponsesChecker (line 1186) | class TwoResponsesChecker(Instruction):
method build_description (line 1189) | def build_description(self):
method get_instruction_args (line 1196) | def get_instruction_args(self):
method get_instruction_args_keys (line 1200) | def get_instruction_args_keys(self):
method check_following (line 1204) | def check_following(self, value):
class RepeatPromptThenAnswer (line 1225) | class RepeatPromptThenAnswer(Instruction):
method build_description (line 1228) | def build_description(self, *, prompt_to_repeat=None):
method get_instruction_args (line 1248) | def get_instruction_args(self):
method get_instruction_args_keys (line 1251) | def get_instruction_args_keys(self):
method check_following (line 1255) | def check_following(self, value):
class EndChecker (line 1262) | class EndChecker(Instruction):
method build_description (line 1265) | def build_description(self, *, end_phrase=None):
method get_instruction_args (line 1283) | def get_instruction_args(self):
method get_instruction_args_keys (line 1286) | def get_instruction_args_keys(self):
method check_following (line 1290) | def check_following(self, value):
class TitleChecker (line 1297) | class TitleChecker(Instruction):
method build_description (line 1300) | def build_description(self):
method get_instruction_args (line 1307) | def get_instruction_args(self):
method get_instruction_args_keys (line 1310) | def get_instruction_args_keys(self):
method check_following (line 1314) | def check_following(self, value):
class LetterFrequencyChecker (line 1326) | class LetterFrequencyChecker(Instruction):
method build_description (line 1329) | def build_description(self,
method get_instruction_args (line 1379) | def get_instruction_args(self):
method get_instruction_args_keys (line 1387) | def get_instruction_args_keys(self):
method check_following (line 1391) | def check_following(self, value):
class CapitalLettersEnglishChecker (line 1403) | class CapitalLettersEnglishChecker(Instruction):
method build_description (line 1406) | def build_description(self):
method get_instruction_args (line 1413) | def get_instruction_args(self):
method get_instruction_args_keys (line 1416) | def get_instruction_args_keys(self):
method check_following (line 1420) | def check_following(self, value):
class LowercaseLettersEnglishChecker (line 1434) | class LowercaseLettersEnglishChecker(Instruction):
method build_description (line 1438) | def build_description(self):
method get_instruction_args (line 1445) | def get_instruction_args(self):
method get_instruction_args_keys (line 1448) | def get_instruction_args_keys(self):
method check_following (line 1452) | def check_following(self, value):
class CommaChecker (line 1466) | class CommaChecker(Instruction):
method build_description (line 1469) | def build_description(self):
method get_instruction_args (line 1475) | def get_instruction_args(self):
method get_instruction_args_keys (line 1478) | def get_instruction_args_keys(self):
method check_following (line 1482) | def check_following(self, value):
class CapitalWordFrequencyChecker (line 1487) | class CapitalWordFrequencyChecker(Instruction):
method build_description (line 1490) | def build_description(
method get_instruction_args (line 1525) | def get_instruction_args(self):
method get_instruction_args_keys (line 1532) | def get_instruction_args_keys(self):
method check_following (line 1536) | def check_following(self, value):
class QuotationChecker (line 1550) | class QuotationChecker(Instruction):
method build_description (line 1553) | def build_description(self):
method get_instruction_args (line 1559) | def get_instruction_args(self):
method get_instruction_args_keys (line 1563) | def get_instruction_args_keys(self):
method check_following (line 1567) | def check_following(self, value):
FILE: opencompass/datasets/IFEval/instructions_registry.py
function conflict_make (line 175) | def conflict_make(conflicts):
FILE: opencompass/datasets/IFEval/instructions_util.py
function split_into_sentences (line 71) | def split_into_sentences(text):
function count_words (line 123) | def count_words(text):
function _get_sentence_tokenizer (line 132) | def _get_sentence_tokenizer():
function count_sentences (line 136) | def count_sentences(text):
function generate_keywords (line 143) | def generate_keywords(num_keywords):
FILE: opencompass/datasets/LCBench.py
class LCDataset (line 23) | class LCDataset(BaseDataset):
method load (line 26) | def load(path: str,
class TimeOutException (line 67) | class TimeOutException(Exception):
function swallow_io (line 72) | def swallow_io():
function time_limit (line 81) | def time_limit(seconds: float):
class WriteOnlyStringIO (line 94) | class WriteOnlyStringIO(io.StringIO):
method read (line 97) | def read(self, *args, **kwargs):
method readline (line 100) | def readline(self, *args, **kwargs):
method readlines (line 103) | def readlines(self, *args, **kwargs):
method readable (line 106) | def readable(self, *args, **kwargs):
class redirect_stdin (line 111) | class redirect_stdin(contextlib._RedirectStream): # type: ignore
class LCEvaluator (line 116) | class LCEvaluator(BaseEvaluator):
method score (line 118) | def score(self, predictions, references):
method _process_answer (line 203) | def _process_answer(self, text):
method _process_test (line 265) | def _process_test(self, test_case, code):
function execution (line 300) | def execution(programs, task_ids, timeout):
class LCPassKEvaluator (line 344) | class LCPassKEvaluator(LCEvaluator):
method __init__ (line 351) | def __init__(self, k=(1, 10, 100)) -> None:
method estimate_pass_at_k (line 357) | def estimate_pass_at_k(
method score (line 383) | def score(self, predictions, references):
FILE: opencompass/datasets/MMLUArabic.py
class MMLUArabicDataset (line 13) | class MMLUArabicDataset(BaseDataset):
method load (line 16) | def load(path: str, name: str):
FILE: opencompass/datasets/MedCalc_Bench.py
function check_correctness (line 14) | def check_correctness(answer: str, ground_truth, calid, upper_limit,
function extract_answer (line 73) | def extract_answer(answer, calid):
function _parse (line 212) | def _parse(item, prompt_mode):
class MedCalc_BenchDataset (line 231) | class MedCalc_BenchDataset(BaseDataset):
method load (line 234) | def load(path: str, prompt_mode: str, **kwargs):
class MedCalcOfficial_Evaluator (line 249) | class MedCalcOfficial_Evaluator(BaseEvaluator):
method score (line 251) | def score(self, predictions, references, test_set):
FILE: opencompass/datasets/MedQA.py
class MedQADataset (line 9) | class MedQADataset(BaseDataset):
method load_single (line 12) | def load_single(path):
method load (line 27) | def load(path):
FILE: opencompass/datasets/MedXpertQA.py
function _parse (line 12) | def _parse(item, prompt_mode):
class MedXpertQADataset (line 20) | class MedXpertQADataset(BaseDataset):
method load (line 23) | def load(path: str, prompt_mode: str, **kwargs):
class MedXpertQAEvaluator (line 35) | class MedXpertQAEvaluator(BaseEvaluator):
method score (line 37) | def score(self, predictions, references, test_set):
function answer_cleansing (line 59) | def answer_cleansing(
function _generic_llmjudge_postprocess (line 106) | def _generic_llmjudge_postprocess(judgement: str):
function MedXpertQA_llmjudge_postprocess (line 113) | def MedXpertQA_llmjudge_postprocess(
FILE: opencompass/datasets/Medbullets.py
function _parse (line 13) | def _parse(item: dict, prompt_mode: str) -> dict:
class MedbulletsDataset (line 37) | class MedbulletsDataset(BaseDataset):
method load (line 40) | def load(path: str, prompt_mode: str = 'zero-shot', **kwargs):
class MedbulletsEvaluator (line 60) | class MedbulletsEvaluator(BaseEvaluator):
method score (line 62) | def score(self, predictions, references, test_set):
function answer_cleansing (line 89) | def answer_cleansing(
function _generic_llmjudge_postprocess (line 137) | def _generic_llmjudge_postprocess(judgement: str):
function medbullets_llmjudge_postprocess (line 144) | def medbullets_llmjudge_postprocess(
FILE: opencompass/datasets/NPHardEval/cmp_GCP_D.py
function q2text (line 18) | def q2text(q, p=gcp_dPrompts):
class CMP_GCP_D_Dataset (line 35) | class CMP_GCP_D_Dataset(BaseDataset):
method load (line 38) | def load(path: str):
class CMP_GCP_D_Evaluator (line 60) | class CMP_GCP_D_Evaluator(BaseEvaluator):
method score (line 62) | def score(self, predictions, references):
method parse_xml_to_dict (line 93) | def parse_xml_to_dict(self, xml_string):
method read_dimacs_format (line 115) | def read_dimacs_format(self, dimacs_str):
method gcp_greedy_solution (line 132) | def gcp_greedy_solution(self, adjacency_list):
method gcp_decision_check (line 147) | def gcp_decision_check(self, dimacs_str, answer, k_colors):
FILE: opencompass/datasets/NPHardEval/cmp_KSP.py
function q2text (line 14) | def q2text(q, p=kspPrompts):
class CMP_KSP_Dataset (line 29) | class CMP_KSP_Dataset(BaseDataset):
method load (line 32) | def load(path: str):
class CMP_KSP_Evaluator (line 54) | class CMP_KSP_Evaluator(BaseEvaluator):
method score (line 56) | def score(self, predictions, references):
method parse_xml_to_dict (line 91) | def parse_xml_to_dict(self, xml_string):
method ksp_optimal_solution (line 113) | def ksp_optimal_solution(self, knapsacks, capacity):
method kspCheck (line 133) | def kspCheck(self, instance, solution):
FILE: opencompass/datasets/NPHardEval/cmp_TSP_D.py
function q2text (line 20) | def q2text(adj_matrix, distance_limit, p=tsp_dPrompts):
class CMP_TSP_D_Dataset (line 37) | class CMP_TSP_D_Dataset(BaseDataset):
method load (line 40) | def load(path: str):
class CMP_TSP_D_Evaluator (line 67) | class CMP_TSP_D_Evaluator(BaseEvaluator):
method score (line 69) | def score(self, predictions, references):
method parse_xml_to_dict (line 106) | def parse_xml_to_dict(self, xml_string):
method tsp_approx (line 128) | def tsp_approx(self, distance_matrix):
method tsp_decision_check (line 137) | def tsp_decision_check(self, distance_matrix, threshold, tour):
FILE: opencompass/datasets/NPHardEval/hard_GCP.py
function q2text (line 14) | def q2text(q, p=gcpPrompts): # q is the data for the HP-hard question, ...
class HardGCPDataset (line 32) | class HardGCPDataset(BaseDataset):
method load (line 35) | def load(path: str):
class HardGCPEvaluator (line 57) | class HardGCPEvaluator(BaseEvaluator):
method score (line 59) | def score(self, predictions, references):
method parse_xml_to_dict (line 89) | def parse_xml_to_dict(self, xml_string):
method gcpCheck (line 117) | def gcpCheck(self, dimacs_str, answer_str):
method read_dimacs_format (line 137) | def read_dimacs_format(self, dimacs_str):
method parse_answer (line 158) | def parse_answer(self, llm_string):
FILE: opencompass/datasets/NPHardEval/hard_MSP.py
function q2text (line 15) | def q2text(q, p=mspPrompts): # q is the data for the HP-hard question, ...
class Hard_MSP_Dataset (line 35) | class Hard_MSP_Dataset(BaseDataset):
method load (line 38) | def load(path: str):
class Hard_MSP_Evaluator (line 59) | class Hard_MSP_Evaluator(BaseEvaluator):
method score (line 61) | def score(self, predictions, references):
method mspCheck (line 88) | def mspCheck(self, instance, llm_string):
method parse_xml_to_dict (line 179) | def parse_xml_to_dict(self, xml_string):
FILE: opencompass/datasets/NPHardEval/hard_TSP.py
function q2text (line 17) | def q2text(q, p=tspPrompts): # q is the data for the HP-hard question, ...
class Hard_TSP_Dataset (line 33) | class Hard_TSP_Dataset(BaseDataset):
method load (line 36) | def load(path: str):
class Hard_TSP_Evaluator (line 61) | class Hard_TSP_Evaluator(BaseEvaluator):
method score (line 63) | def score(self, predictions, references):
method parse_xml_to_dict (line 91) | def parse_xml_to_dict(self, xml_string):
method tspCheck (line 119) | def tspCheck(self, distance_matrix, llm_string):
method greedy_tsp (line 186) | def greedy_tsp(self, distance_matrix):
FILE: opencompass/datasets/NPHardEval/p_BSP.py
function q2text (line 14) | def q2text(q, p=bspPrompts):
class P_BSP_Dataset (line 28) | class P_BSP_Dataset(BaseDataset):
method load (line 31) | def load(path: str):
class P_BSP_Evaluator (line 54) | class P_BSP_Evaluator(BaseEvaluator):
method score (line 56) | def score(self, predictions, references):
method parse_xml_to_dict (line 84) | def parse_xml_to_dict(self, xml_string):
method bsp_check (line 106) | def bsp_check(self, instance, solution):
FILE: opencompass/datasets/NPHardEval/p_EDP.py
function q2text (line 14) | def q2text(q, p=edpPrompts):
class P_EDP_Dataset (line 25) | class P_EDP_Dataset(BaseDataset):
method load (line 28) | def load(path: str):
class P_EDP_Evaluator (line 51) | class P_EDP_Evaluator(BaseEvaluator):
method score (line 53) | def score(self, predictions, references):
method compute_min_edit_distance (line 81) | def compute_min_edit_distance(self, string_a, string_b):
method edp_check (line 99) | def edp_check(self, instance, solution):
method parse_xml_to_dict (line 121) | def parse_xml_to_dict(self, xml_string):
FILE: opencompass/datasets/NPHardEval/p_SPP.py
function q2text (line 19) | def q2text(q, p=sppPrompts):
class P_SPP_Dataset (line 38) | class P_SPP_Dataset(BaseDataset):
method load (line 41) | def load(path: str):
class P_SPP_Evaluator (line 61) | class P_SPP_Evaluator(BaseEvaluator):
method score (line 63) | def score(self, predictions, references):
method parse_xml_to_dict (line 91) | def parse_xml_to_dict(self, xml_string):
method ssp_optimal_solution (line 120) | def ssp_optimal_solution(self, instance, source, target):
method spp_check (line 140) | def spp_check(self, instance, solution, start_node=None, end_node=None):
FILE: opencompass/datasets/NPHardEval/utils.py
function append_root_tags (line 5) | def append_root_tags(string):
function parse_xml_to_dict (line 13) | def parse_xml_to_dict(xml_string):
FILE: opencompass/datasets/OlympiadBench.py
class OlympiadBenchDataset (line 25) | class OlympiadBenchDataset(BaseDataset):
method load (line 35) | def load(path: str, name: str = None, **kwargs):
function get_single_answer_type_text (line 121) | def get_single_answer_type_text(answer_type, is_chinese):
function get_answer_type_text (line 146) | def get_answer_type_text(answer_type, is_chinese, multiple_answer):
class OlympiadBenchPrompter (line 186) | class OlympiadBenchPrompter:
method __init__ (line 188) | def __init__(self):
method make_prompt (line 191) | def make_prompt(
class MathJudger (line 277) | class MathJudger:
method __init__ (line 279) | def __init__(self):
method split_by_comma (line 297) | def split_by_comma(self, expr: str):
method trans_plus_minus_sign (line 315) | def trans_plus_minus_sign(self, expr_list: list):
method judge (line 326) | def judge(self, expression1, expression2, precision=1e-8):
method is_interval (line 375) | def is_interval(self, epr):
method sympy_sub_pi (line 378) | def sympy_sub_pi(self, expression_sympy):
method is_equal (line 381) | def is_equal(self, expression1, expression2):
method numerical_equal (line 419) | def numerical_equal(
method expression_equal (line 442) | def expression_equal(self, exp1, exp2):
method equation_equal (line 494) | def equation_equal(self, expression1, expression2):
method interval_equal (line 535) | def interval_equal(self, expression1, expression2):
method preprocess (line 578) | def preprocess(self, expression1, expression2):
method can_compute_power (line 644) | def can_compute_power(self, expr):
function olympiadbench_postprocess_v2 (line 680) | def olympiadbench_postprocess_v2(text: str,
class OlympiadBenchEvaluator (line 702) | class OlympiadBenchEvaluator(BaseEvaluator):
method __init__ (line 705) | def __init__(self, version='v1'):
method score (line 710) | def score(self, predictions, references): # Remove questions parameter
class OlympiadBenchTemplate (line 765) | class OlympiadBenchTemplate(PromptTemplate):
method __init__ (line 768) | def __init__(self):
method generate_item (line 774) | def generate_item(self, entry: Dict, *args, **kwargs) -> str:
FILE: opencompass/datasets/OpenFinData.py
class OpenFinDataDataset (line 14) | class OpenFinDataDataset(BaseDataset):
method load (line 17) | def load(path: str, name: str):
class OpenFinDataKWEvaluator (line 25) | class OpenFinDataKWEvaluator(BaseEvaluator):
method __init__ (line 27) | def __init__(self, ):
method score (line 30) | def score(self, predictions, references):
FILE: opencompass/datasets/OpenSWI.py
class OpenSWIDataset (line 17) | class OpenSWIDataset(BaseDataset):
method load (line 20) | def load(path: str, name: str):
function extract_list (line 39) | def extract_list(text):
class OpenSWIMSEEvaluator (line 55) | class OpenSWIMSEEvaluator(BaseEvaluator):
method __init__ (line 58) | def __init__(self) -> None:
method score (line 61) | def score(self, predictions, references):
FILE: opencompass/datasets/PI_LLM.py
class PILLMDataset (line 11) | class PILLMDataset(BaseDataset):
method load (line 33) | def load(**kwargs) -> Dataset:
FILE: opencompass/datasets/PMMEval/flores.py
function wmt_postprocess (line 18) | def wmt_postprocess(text: str, lang: str) -> str:
function compute_maximum_bleu_value (line 27) | def compute_maximum_bleu_value(gen: str, ref: str, lang: str):
function trim_multiple_space (line 56) | def trim_multiple_space(tokes):
class SpaceTokenizer (line 60) | class SpaceTokenizer(object):
method __call__ (line 62) | def __call__(self, sent):
class NonASCIITokenizer (line 69) | class NonASCIITokenizer(object):
method __init__ (line 71) | def __init__(self):
method __call__ (line 78) | def __call__(self, sent):
function build_tokenizer (line 93) | def build_tokenizer(lang: str):
function tokenize (line 102) | def tokenize(sent, lang):
function pmmeval_flores_postprocess (line 109) | def pmmeval_flores_postprocess(text: str, lang_fullname: str) -> Tuple[s...
class PMMEvalFloresDataset (line 114) | class PMMEvalFloresDataset(BaseDataset):
method load (line 117) | def load(path: str, lang_fullname: str):
class PMMEvalFloresEvaluator (line 138) | class PMMEvalFloresEvaluator(BaseEvaluator):
method score (line 140) | def score(self, predictions, references):
FILE: opencompass/datasets/PMMEval/humanevalxl.py
class PMMEvalHumanEvalXLDataset (line 27) | class PMMEvalHumanEvalXLDataset(BaseDataset):
method load (line 30) | def load(path: str, lang: str, program_lang: str):
class PMMEvalHumanEvalXLEvaluator (line 51) | class PMMEvalHumanEvalXLEvaluator(BaseEvaluator):
method __init__ (line 53) | def __init__(self,
method score (line 72) | def score(self, predictions, references):
method _code_eval_service (line 123) | def _code_eval_service(self, file_path):
function _clean_up_code (line 151) | def _clean_up_code(text: str, language_type: str, reference) -> str:
FILE: opencompass/datasets/PMMEval/mgsm.py
function _get_last_digit (line 13) | def _get_last_digit(s):
class PMMEvalMGSMDataset (line 29) | class PMMEvalMGSMDataset(BaseDataset):
method load (line 32) | def load(path: str, lang: str):
class PMMEvalMGSMEvaluator (line 52) | class PMMEvalMGSMEvaluator(BaseEvaluator):
method score (line 54) | def score(self, predictions, references):
FILE: opencompass/datasets/PMMEval/mhellaswag.py
function extract_choice (line 27) | def extract_choice(gen, lang):
function extract_choice_fuzzy (line 59) | def extract_choice_fuzzy(gen, lang):
function pmmeval_mhellaswag_postprocess (line 68) | def pmmeval_mhellaswag_postprocess(text: str, lang_code: str) -> Tuple[s...
class PMMEvalMHellaswagDataset (line 73) | class PMMEvalMHellaswagDataset(BaseDataset):
method load (line 76) | def load(path: str, lang: str):
class PMMEvalMHellaswagEvaluator (line 96) | class PMMEvalMHellaswagEvaluator(BaseEvaluator):
method score (line 98) | def score(self, predictions, references):
FILE: opencompass/datasets/PMMEval/mifeval.py
function test_instruction_following_strict (line 14) | def test_instruction_following_strict(inp, response, lang_code):
function test_instruction_following_loose (line 45) | def test_instruction_following_loose(inp, response, lang_code):
function pmmeval_mifeval_postprocess (line 95) | def pmmeval_mifeval_postprocess(text: str, lang_code: str) -> Tuple[str]:
class PMMEvalMIFEvalDataset (line 100) | class PMMEvalMIFEvalDataset(BaseDataset):
method load (line 103) | def load(path: str, lang: str):
class PMMEvalMIFEvalEvaluator (line 123) | class PMMEvalMIFEvalEvaluator(BaseEvaluator):
method score (line 125) | def score(self, predictions, references, test_set):
FILE: opencompass/datasets/PMMEval/mifeval_utils/combination_checker.py
function repeat_prompt_checker (line 1) | def repeat_prompt_checker(input_string: str, prompt_to_repeat: str, **kw...
function two_responses_checker (line 8) | def two_responses_checker(input_string: str, **kwargs):
FILE: opencompass/datasets/PMMEval/mifeval_utils/detectable_content_checker.py
function number_placeholders_checker (line 4) | def number_placeholders_checker(input_string: str, num_placeholders: int,
function postscript_checker (line 10) | def postscript_checker(input_string: str, postscript_marker: str, **kwar...
FILE: opencompass/datasets/PMMEval/mifeval_utils/detectable_format_checker.py
function removeprefix (line 5) | def removeprefix(s, prefix):
function removesuffix (line 12) | def removesuffix(s, suffix):
function constrained_response_checker (line 43) | def constrained_response_checker(input_string: str, lang_code: str, **kw...
function number_bullet_lists_checker (line 48) | def number_bullet_lists_checker(input_string: str, num_bullets: int, **k...
function number_highlighted_sections_checker (line 57) | def number_highlighted_sections_checker(input_string: str, num_highlight...
function title_checker (line 72) | def title_checker(input_string: str, **kwargs):
function json_format_checker (line 83) | def json_format_checker(input_string: str, **kwargs):
FILE: opencompass/datasets/PMMEval/mifeval_utils/keywords_checker.py
function forbidden_words_checker (line 1) | def forbidden_words_checker(input_string: str, forbidden_words: list,
FILE: opencompass/datasets/PMMEval/mifeval_utils/length_constraints_checker.py
function nth_paragraph_first_word_checker (line 4) | def nth_paragraph_first_word_checker(input_string: str, num_paragraphs: ...
function number_paragraphs_checker (line 27) | def number_paragraphs_checker(input_string: str, num_paragraphs: int,
function number_sentences_checker (line 35) | def number_sentences_checker(input_string: str, relation: str,
function number_words_checker (line 52) | def number_words_checker(input_string: str, relation: str, num_words: int,
FILE: opencompass/datasets/PMMEval/mifeval_utils/punctuation_checker.py
function no_comma_checker (line 17) | def no_comma_checker(input_string: str, lang_code: str, **kwargs):
FILE: opencompass/datasets/PMMEval/mifeval_utils/startend_checker.py
function end_checker_checker (line 1) | def end_checker_checker(input_string: str, end_phrase: str, **kwargs):
function quotation_checker (line 8) | def quotation_checker(input_string: str, lang_code: str, **kwargs):
FILE: opencompass/datasets/PMMEval/mlogiqa.py
function extract_choice (line 27) | def extract_choice(gen, lang):
function extract_choice_fuzzy (line 60) | def extract_choice_fuzzy(gen):
function pmmeval_mlogiqa_postprocess (line 69) | def pmmeval_mlogiqa_postprocess(text: str, lang_code: str) -> Tuple[str]:
class PMMEvalMLogiQADataset (line 74) | class PMMEvalMLogiQADataset(BaseDataset):
method load (line 77) | def load(path: str, lang: str):
class PMMEvalMLogiQAEvaluator (line 97) | class PMMEvalMLogiQAEvaluator(BaseEvaluator):
method score (line 99) | def score(self, predictions, references):
FILE: opencompass/datasets/PMMEval/mmmlu.py
function extract_choice (line 27) | def extract_choice(gen, lang):
function extract_choice_fuzzy (line 59) | def extract_choice_fuzzy(gen):
function pmmeval_mmmlu_postprocess (line 68) | def pmmeval_mmmlu_postprocess(text: str, lang_code: str) -> Tuple[str]:
class PMMEvalMMMLUDataset (line 73) | class PMMEvalMMMLUDataset(BaseDataset):
method load (line 76) | def load(path: str, lang: str, difficulty: str):
class PMMEvalMMMLUEvaluator (line 120) | class PMMEvalMMMLUEvaluator(BaseEvaluator):
method score (line 122) | def score(self, predictions, references):
FILE: opencompass/datasets/PMMEval/xnli.py
function extract_choice (line 28) | def extract_choice(gen, lang):
function extract_choice_fuzzy (line 60) | def extract_choice_fuzzy(gen, lang):
function pmmeval_xnli_postprocess (line 69) | def pmmeval_xnli_postprocess(text: str, lang_code: str) -> Tuple[str]:
class PMMEvalXNLIDataset (line 74) | class PMMEvalXNLIDataset(BaseDataset):
method load (line 77) | def load(path: str, lang: str):
class PMMEvalXNLIEvaluator (line 96) | class PMMEvalXNLIEvaluator(BaseEvaluator):
method score (line 98) | def score(self, predictions, references):
FILE: opencompass/datasets/ProcessBench.py
function extract_answer (line 11) | def extract_answer(solution_text: str):
class ProcessBenchEvalDataset (line 23) | class ProcessBenchEvalDataset(BaseDataset):
method load (line 26) | def load(path: str, subset: str, **kwargs):
class ProcessBenchEvaluator (line 52) | class ProcessBenchEvaluator(BaseEvaluator):
method score (line 54) | def score(self, predictions, references):
FILE: opencompass/datasets/ProteinLMBench.py
function _parse (line 10) | def _parse(item):
class ProteinLMBenchDataset (line 28) | class ProteinLMBenchDataset(BaseDataset):
method load (line 31) | def load(path: str, **kwargs):
class ProteinLMBenchEvaluator (line 38) | class ProteinLMBenchEvaluator(BaseEvaluator):
method score (line 40) | def score(self, predictions, references, test_set):
FILE: opencompass/datasets/PubMedQA.py
class PubMedQADataset (line 9) | class PubMedQADataset(BaseDataset):
method load_single (line 12) | def load_single(path):
method load (line 32) | def load(path):
FILE: opencompass/datasets/QuALITY.py
class QuALITYDataset (line 13) | class QuALITYDataset(BaseDataset):
method load (line 16) | def load(path: str):
class QuALITYEvaluator (line 44) | class QuALITYEvaluator(BaseEvaluator):
method score (line 46) | def score(self, predictions, references, test_set):
FILE: opencompass/datasets/SciEval.py
class SciEvalDataset (line 21) | class SciEvalDataset(BaseDataset):
method load (line 25) | def load(path: str, name: str, **kwargs) -> DatasetDict:
FILE: opencompass/datasets/SciKnowEval.py
function _parse (line 11) | def _parse(item, prompt_mode, discipline):
class SciKnowEvalDataset (line 25) | class SciKnowEvalDataset(BaseDataset):
method load (line 28) | def load(path: str, prompt_mode: str, **kwargs):
class SciKnowEvalEvaluator (line 51) | class SciKnowEvalEvaluator(BaseEvaluator):
method score (line 53) | def score(self, predictions, references, test_set):
function answer_cleansing (line 75) | def answer_cleansing(
FILE: opencompass/datasets/SciReasoner/GUE.py
class GUE_Dataset (line 19) | class GUE_Dataset(BaseDataset):
method load (line 22) | def load(path, task, mini_set=False):
function remove_think_tags (line 66) | def remove_think_tags(text: str) -> str:
function GUE_postprocessor (line 75) | def GUE_postprocessor(text: Union[str, None]) -> str:
class GUE_Evaluator (line 161) | class GUE_Evaluator(BaseEvaluator):
method score (line 163) | def score(self, predictions, references):
FILE: opencompass/datasets/SciReasoner/LLM4Chem/evaluator.py
class LLM4ChemDataset (line 25) | class LLM4ChemDataset(BaseDataset):
method load (line 28) | def load(path, task, max_cut=-1, mini_set=False, hf_hub=False):
function extract_answer_part (line 71) | def extract_answer_part(outputs, left_tag, right_tag, mode='tag'):
function LLM4Chem_postprocess (line 96) | def LLM4Chem_postprocess(text, task, *args, **kwargs):
class LLM4Chem_Evaluator (line 146) | class LLM4Chem_Evaluator(BaseEvaluator):
method __init__ (line 148) | def __init__(self, task, *args, **kwargs):
method score (line 152) | def score(self, predictions, references):
FILE: opencompass/datasets/SciReasoner/LLM4Chem/retrosynthesis_evaluator.py
function smi_tokenizer (line 29) | def smi_tokenizer(smi):
function canonicalize_smiles_clear_map (line 43) | def canonicalize_smiles_clear_map(smiles, synthon=False, return_max_frag...
function compute_rank (line 104) | def compute_rank(prediction_group,
function Retrosynthesis_postprocess (line 188) | def Retrosynthesis_postprocess(text: Union[str, None]) -> str:
class RetrosynthesisEvaluator (line 221) | class RetrosynthesisEvaluator(BaseEvaluator):
method __init__ (line 227) | def __init__(self,
method score (line 246) | def score(self, predictions, references):
FILE: opencompass/datasets/SciReasoner/LLM4Chem/utils/chat_generation.py
function generate_chat (line 1) | def generate_chat(input_text, output_text=None, prefix_chat=None):
FILE: opencompass/datasets/SciReasoner/LLM4Chem/utils/core_tagger.py
function find_sub_sequence (line 1) | def find_sub_sequence(whole, sub):
class CoreTagger (line 32) | class CoreTagger(object):
method __init__ (line 34) | def __init__(self,
method generate_mask (line 49) | def generate_mask(self, token_ids, output_begin, sample):
class CoreTaggerGeneral (line 111) | class CoreTaggerGeneral(object):
method __init__ (line 113) | def __init__(self,
method generate_mask (line 128) | def generate_mask(self, token_ids, prompt_mask, sample):
FILE: opencompass/datasets/SciReasoner/LLM4Chem/utils/general_prompter.py
function get_chat_content (line 1) | def get_chat_content(conversation, tokenize=False):
class GeneralPrompter (line 21) | class GeneralPrompter(object):
method __init__ (line 23) | def __init__(self, apply_chat_template_func, response_split='[/INST]'):
method generate_prompt (line 27) | def generate_prompt(self, chat, tokenize=False, *args, **kargs) -> str:
method get_response (line 34) | def get_response(self, output: str) -> str:
FILE: opencompass/datasets/SciReasoner/LLM4Chem/utils/metrics.py
function convert_smiles_list_into_mol_list (line 28) | def convert_smiles_list_into_mol_list(smiles_list,
function judge_exact_match (line 54) | def judge_exact_match(pred_can_smiles_list, gold_can_smiles_list):
function calculate_fingerprint_similarity (line 74) | def calculate_fingerprint_similarity(pred_mol_list,
function judge_multiple_match (line 112) | def judge_multiple_match(pred_can_smiles_list, golds_can_smiles_list):
function calculate_smiles_metrics (line 148) | def calculate_smiles_metrics(preds_smiles_list,
function judge_string_exact_match (line 283) | def judge_string_exact_match(pred_string_list, golds_string_list):
function judge_string_split_match (line 296) | def judge_string_split_match(pred_string_list,
function parse_molecule (line 313) | def parse_molecule(molecular_formula):
function count_element_match (line 377) | def count_element_match(pred_formula_list, golds_formula_list):
function calculate_formula_metrics (line 408) | def calculate_formula_metrics(preds_formula_list,
function calculate_text_metrics (line 508) | def calculate_text_metrics(pred_text_list,
function calculate_number_metrics (line 592) | def calculate_number_metrics(pred_text_list, gold_text_list):
function calculate_boolean_metrics (line 632) | def calculate_boolean_metrics(pred_text_list, gold_text_list):
FILE: opencompass/datasets/SciReasoner/LLM4Chem/utils/smiles_canonicalization.py
function canonicalize (line 10) | def canonicalize(smiles, isomeric=False, canonical=True, kekulize=False):
function canonicalize_molecule_smiles (line 82) | def canonicalize_molecule_smiles(smiles,
function canonicalize_reaction_smiles (line 144) | def canonicalize_reaction_smiles(smiles,
function get_molecule_id (line 176) | def get_molecule_id(smiles, remove_duplicate=True):
FILE: opencompass/datasets/SciReasoner/LLM4Mat.py
class LLM4MatDataset (line 21) | class LLM4MatDataset(BaseDataset):
method load (line 24) | def load(path,
function remove_think_tags (line 85) | def remove_think_tags(text: str) -> str:
function extract_strict_value (line 93) | def extract_strict_value(text: str, property: str) -> str:
function LLM4Mat_postprocessor (line 139) | def LLM4Mat_postprocessor(text: Union[str, None], property):
class LLM4Mat_Evaluator (line 150) | class LLM4Mat_Evaluator(BaseEvaluator):
method score (line 152) | def score(self, predictions, references):
FILE: opencompass/datasets/SciReasoner/Mol_Instructions/biotext.py
function CER_calculate_f1_score (line 20) | def CER_calculate_f1_score(true_entities, predicted_entities):
function calculate_f1_score (line 35) | def calculate_f1_score(true_entities, predicted_entities):
function calculate_accuracy_ (line 60) | def calculate_accuracy_(predictions, references):
function CER_calculate_accuracy_ (line 72) | def CER_calculate_accuracy_(predictions, references):
function ture_or_false_calculate_accuracy_ (line 85) | def ture_or_false_calculate_accuracy_(predictions, references):
function calculate_macro_f1_ (line 119) | def calculate_macro_f1_(predictions, references):
function multi_choice_question_calculate_accuracy (line 138) | def multi_choice_question_calculate_accuracy(question_data):
function multi_choice_question_calculate_accuracy_ (line 163) | def multi_choice_question_calculate_accuracy_(predictions, references):
class Mol_Instructions_Dataset_BioText (line 185) | class Mol_Instructions_Dataset_BioText(BaseDataset):
method load (line 188) | def load(path, task, max_cut=-1, mini_set=False, hf_hub=False):
function Mol_Instructions_postprocess_BioText (line 232) | def Mol_Instructions_postprocess_BioText(text, task, *args, **kwargs):
class Mol_Instructions_Evaluator_BioText (line 271) | class Mol_Instructions_Evaluator_BioText(BaseEvaluator):
method __init__ (line 273) | def __init__(self, task='protein_design', *args, **kwargs):
method score (line 277) | def score(self, predictions: List[str], references: List[str]):
FILE: opencompass/datasets/SciReasoner/Mol_Instructions/molecule.py
class Mol_Instructions_Dataset (line 41) | class Mol_Instructions_Dataset(BaseDataset):
method load (line 44) | def load(path, task, max_cut=-1, mini_set=False, hf_hub=False):
function convert_to_canonical_smiles (line 87) | def convert_to_canonical_smiles(smiles):
function Mol_Instructions_postprocess_Mol (line 99) | def Mol_Instructions_postprocess_Mol(text, task, *args, **kwargs):
function compute_MAE_property_prediction_str (line 149) | def compute_MAE_property_prediction_str(predictions, references):
function compute_fingerprint_metricts (line 158) | def compute_fingerprint_metricts(
function compute_mol_translation_selfies (line 221) | def compute_mol_translation_selfies(predictions, references):
function fix_smiles_brackets (line 319) | def fix_smiles_brackets(smiles):
class Mol_Instructions_Evaluator_Mol (line 333) | class Mol_Instructions_Evaluator_Mol(BaseEvaluator):
method __init__ (line 335) | def __init__(self, task, *args, **kwargs):
method score (line 339) | def score(self, predictions, references):
function compute_text_translation_metrics (line 385) | def compute_text_translation_metrics(
FILE: opencompass/datasets/SciReasoner/Mol_Instructions/normalized_SW_score.py
function normalized_smith_waterman (line 4) | def normalized_smith_waterman(seq1,
function Mol_Instructions_postprocess_Protein_Design (line 102) | def Mol_Instructions_postprocess_Protein_Design(text, *args, **kwargs):
FILE: opencompass/datasets/SciReasoner/Mol_Instructions/protein.py
class Mol_Instructions_Dataset_Protein_Design (line 23) | class Mol_Instructions_Dataset_Protein_Design(BaseDataset):
method load (line 26) | def load(path, task, max_cut=-1, mini_set=False, hf_hub=False):
function Mol_Instructions_postprocess_Protein (line 70) | def Mol_Instructions_postprocess_Protein(text, *args, **kwargs):
class Mol_Instructions_Evaluator_Protein (line 84) | class Mol_Instructions_Evaluator_Protein(RougeEvaluator):
method __init__ (line 86) | def __init__(self,
function Mol_Instructions_postprocess_Protein_Design (line 95) | def Mol_Instructions_postprocess_Protein_Design(text, *args, **kwargs):
class Mol_Instructions_Evaluator_Protein_Design (line 113) | class Mol_Instructions_Evaluator_Protein_Design(BaseEvaluator):
method __init__ (line 115) | def __init__(self, task='protein_design', *args, **kwargs):
method score (line 119) | def score(self, predictions: List[str], references: List[str]):
FILE: opencompass/datasets/SciReasoner/PEER.py
class PEER_Dataset (line 25) | class PEER_Dataset(BaseDataset):
method load (line 28) | def load(path, task, max_cut=-1, mini_set=False, hf_hub=False):
function PEER_postprocess_default (line 72) | def PEER_postprocess_default(text: Union[str, None]) -> str:
function PEER_postprocess (line 81) | def PEER_postprocess(text: Union[str, None]) -> str:
function PEER_postprocess_float_compare (line 155) | def PEER_postprocess_float_compare(text: Union[str, None],
function calculate_accuracy (line 180) | def calculate_accuracy(pred_text_list, gold_text_list):
class PEER_Evaluator (line 244) | class PEER_Evaluator(BaseEvaluator):
method __init__ (line 246) | def __init__(self,
method _retry_api (line 270) | def _retry_api(self, fn, *args, **kwargs):
method ask_gpt25 (line 286) | def ask_gpt25(self, question, answer, prediction):
method ask_gpt25_batch (line 321) | def ask_gpt25_batch(self, questions, answers, predictions):
method score (line 343) | def score(self, predictions, references):
class PEERRuleEvaluator (line 474) | class PEERRuleEvaluator(BaseEvaluator):
method score (line 476) | def score(self,
function peer_llm_judge_postprocess (line 525) | def peer_llm_judge_postprocess(output: Dict, output_path: str) -> Dict:
FILE: opencompass/datasets/SciReasoner/bio_instruction.py
class Bioinstruction_Dataset (line 35) | class Bioinstruction_Dataset(BaseDataset):
method load (line 38) | def load(path, task, mini_set=False, hf_hub=False):
function extract_answer_part (line 83) | def extract_answer_part(outputs, left_tag, right_tag, mode='tag'):
function extract_numeric_values (line 107) | def extract_numeric_values(text):
function generic_replace (line 149) | def generic_replace(m):
function classify_by_sentiment_model (line 161) | def classify_by_sentiment_model(text):
function classify_by_keywords (line 193) | def classify_by_keywords(text):
function process_regression_task (line 259) | def process_regression_task(task_name, task_entries, model_name):
function compute_spearman (line 322) | def compute_spearman(label_values, result_values):
function compute_R2 (line 378) | def compute_R2(label_values, result_values):
function compute_mixed_score (line 435) | def compute_mixed_score(label_values,
function compute_R2_for_ProgrammableRNASwitches_task (line 530) | def compute_R2_for_ProgrammableRNASwitches_task(task_name, task_entries,
function compute_PCC_for_enhancer_activity_task (line 670) | def compute_PCC_for_enhancer_activity_task(task_name, task_entries,
function process_binary_classification_task (line 794) | def process_binary_classification_task(task_name, task_entries, model_na...
function compute_MCC (line 880) | def compute_MCC(label_classes, result_classes):
function compute_Acc (line 893) | def compute_Acc(label_classes, result_classes):
function extract_rna_family (line 912) | def extract_rna_family(text):
function compute_Acc_for_NoncodingRNAFamily_task (line 920) | def compute_Acc_for_NoncodingRNAFamily_task(task_name, task_entries,
function extract_modifications (line 971) | def extract_modifications(text):
function convert_to_binary_vector (line 981) | def convert_to_binary_vector(modifications, classes=modification_classes):
function compute_AUC_for_Modification_task (line 997) | def compute_AUC_for_Modification_task(task_name, task_entries, model_name):
function count_f1_max (line 1081) | def count_f1_max(pred, target):
function round_and_scale_results (line 1138) | def round_and_scale_results(data, decimal_places=3, scale_factor=100):
function ec_to_multihot (line 1149) | def ec_to_multihot(ec_list, ec_labels):
function compute_Fmax_for_FunctionEC_task (line 1162) | def compute_Fmax_for_FunctionEC_task(task_name, task_entries, ec_labels,
function preprocess_input_data (line 1225) | def preprocess_input_data(input_file_path, prediction, mini_set=False):
class bio_instruction_Evaluator (line 1294) | class bio_instruction_Evaluator(BaseEvaluator):
method __init__ (line 1296) | def __init__(self,
method score (line 1310) | def score(self, predictions):
FILE: opencompass/datasets/SciReasoner/bulk_modulus_material.py
class Bulk_modulus_material_Dataset (line 19) | class Bulk_modulus_material_Dataset(BaseDataset):
method load (line 22) | def load(path, mini_set=False):
function material_postprocessor (line 63) | def material_postprocessor(text: Union[str, None]) -> str:
class material_Evaluator (line 74) | class material_Evaluator(BaseEvaluator):
method __init__ (line 82) | def __init__(self, data_path=None, **kwargs):
method _load_ground_truths (line 92) | def _load_ground_truths(self):
method _normalize (line 106) | def _normalize(self, formula: str) -> str:
method score (line 112) | def score(self, predictions: List[dict]):
FILE: opencompass/datasets/SciReasoner/composition_material.py
function extract_elements_from_prompt (line 18) | def extract_elements_from_prompt(prompt: str) -> list:
function composition_precision (line 52) | def composition_precision(elements: list[str], prediction: str) -> float:
class Composition_material_Dataset (line 63) | class Composition_material_Dataset(BaseDataset):
method load (line 66) | def load(path, mini_set=False):
function material_postprocessor (line 111) | def material_postprocessor(text: Union[str, None]) -> str:
class composition_Evaluator (line 122) | class composition_Evaluator(BaseEvaluator):
method __init__ (line 124) | def __init__(self, data_path, tuning_data=None, **kwargs):
method _load_original_inputs (line 135) | def _load_original_inputs(self):
method _normalize (line 147) | def _normalize(self, formula):
method score (line 152) | def score(self, predictions):
FILE: opencompass/datasets/SciReasoner/opi/evaluator.py
class OpiDataset (line 20) | class OpiDataset(BaseDataset):
method load (line 23) | def load(path, task, max_cut=-1, mini_set=False, hf_hub=False):
function extract_answer_part (line 64) | def extract_answer_part(outputs, left_tag, right_tag, mode='tag'):
function opi_postprocess (line 89) | def opi_postprocess(text, task, *args, **kwargs):
class Opi_Evaluator (line 97) | class Opi_Evaluator(BaseEvaluator):
method __init__ (line 99) | def __init__(self, task, *args, **kwargs):
method score (line 103) | def score(self, predictions, references):
method _evaluate_function (line 127) | def _evaluate_function(self, predictions, references):
method _evaluate_subcellular_localization (line 152) | def _evaluate_subcellular_localization(self, predictions, references):
method _evaluate_fold_type (line 177) | def _evaluate_fold_type(self, predictions, references):
method _evaluate_multilabel (line 203) | def _evaluate_multilabel(self, predictions, references):
method _evaluate_text_similarity (line 252) | def _evaluate_text_similarity(self, predictions, references):
method _evaluate_general (line 272) | def _evaluate_general(self, predictions, references):
FILE: opencompass/datasets/SciReasoner/opi/process_ec_numbers.py
function add_spaces_to_ec_number (line 6) | def add_spaces_to_ec_number(text: str) -> str:
function process_json_value (line 20) | def process_json_value(value: Any) -> Any:
function process_ec_json_file (line 34) | def process_ec_json_file(input_file: str, output_file: str) -> None:
FILE: opencompass/datasets/SciReasoner/opi/utils/accuracy4fold_type.py
function load_json (line 7) | def load_json(file_path):
function compute_accuracy4fold_type (line 13) | def compute_accuracy4fold_type(eval_file, test_files):
FILE: opencompass/datasets/SciReasoner/opi/utils/metrics4all.py
function calculate_metrics (line 14) | def calculate_metrics(output, target):
function calculate_rouge_l (line 32) | def calculate_rouge_l(output, target):
function process_json_file (line 38) | def process_json_file(json_file_path):
function main (line 98) | def main(eval_res_path):
FILE: opencompass/datasets/SciReasoner/uncond_RNA.py
class Uncond_RNA_Dataset (line 17) | class Uncond_RNA_Dataset(BaseDataset):
method load (line 20) | def load(num, prompt):
function RNA_postprocessor (line 26) | def RNA_postprocessor(text: Union[str, None]) -> str:
class RNA_Evaluator (line 46) | class RNA_Evaluator(BaseEvaluator):
method score (line 48) | def score(self, predictions, references):
method run_rnafold (line 101) | def run_rnafold(self, input_fasta, output_dir):
method parse_mfe (line 113) | def parse_mfe(self, output_file):
method run_cmscan (line 123) | def run_cmscan(self, fasta_file, output_dir, rfam_cm, rfam_clanin):
method parse_unique_families (line 137) | def parse_unique_families(self, tblout_file):
FILE: opencompass/datasets/SciReasoner/uncond_material.py
class Uncond_material_Dataset (line 12) | class Uncond_material_Dataset(BaseDataset):
method load (line 15) | def load(num, prompt):
function material_postprocessor (line 21) | def material_postprocessor(text: Union[str, None]) -> str:
class uncond_material_Evaluator (line 33) | class uncond_material_Evaluator(BaseEvaluator):
method score (line 35) | def score(self, predictions):
FILE: opencompass/datasets/SciReasoner/unconditional_molecule_generation/UMG.py
class UMG_Dataset (line 16) | class UMG_Dataset(BaseDataset):
method load (line 19) | def load(max_cut=-1):
class UMG_Evaluator (line 50) | class UMG_Evaluator(BaseEvaluator):
method __init__ (line 52) | def __init__(self, *args, **kwargs):
method is_valid_smiles_rdkit (line 55) | def is_valid_smiles_rdkit(self, s):
method extract_smiles_simple (line 67) | def extract_smiles_simple(self, text: str) -> str | None:
method score (line 93) | def score(self, predictions):
FILE: opencompass/datasets/SciReasoner/unconditional_protein_generation/UPG.py
class UPGDataset (line 11) | class UPGDataset(BaseDataset):
method load (line 14) | def load(tag_bool=True, max_cut=-1):
function UPG_postprocess (line 61) | def UPG_postprocess(text):
class UPG_Evaluator (line 94) | class UPG_Evaluator(BaseEvaluator):
method __init__ (line 96) | def __init__(self, *args, **kwargs):
method _calculate_sequence_identity (line 99) | def _calculate_sequence_identity(self, seq1, seq2):
method score (line 115) | def score(self, predictions, references=None):
FILE: opencompass/datasets/SciReasoner/unconditional_protein_generation/omegafold/__main__.py
function main (line 37) | def main(protein_list):
FILE: opencompass/datasets/SciReasoner/unconditional_protein_generation/omegafold/confidence.py
function get_all_confidence (line 38) | def get_all_confidence(
function _compute_confidence (line 93) | def _compute_confidence(logits: torch.Tensor) -> torch.Tensor:
class ConfidenceHead (line 123) | class ConfidenceHead(modules.OFModule):
method __init__ (line 130) | def __init__(self, cfg: argparse.Namespace):
method forward (line 140) | def forward(self, node_repr: torch.Tensor) -> torch.Tensor:
FILE: opencompass/datasets/SciReasoner/unconditional_protein_generation/omegafold/config.py
function _make_config (line 31) | def _make_config(input_dict: dict) -> argparse.Namespace:
function make_config (line 42) | def make_config(model_idx: int = 1) -> argparse.Namespace:
FILE: opencompass/datasets/SciReasoner/unconditional_protein_generation/omegafold/decode.py
class InvariantPointAttention (line 42) | class InvariantPointAttention(modules.OFModule):
method __init__ (line 49) | def __init__(self, cfg: argparse.Namespace) -> None:
method forward (line 89) | def forward(self, node_repr: torch.Tensor, edge_repr: torch.Tensor,
method _get_scalar (line 148) | def _get_scalar(linear: nn.Linear, inputs: torch.Tensor,
method _get_point (line 167) | def _get_point(linear: nn.Linear, inputs: torch.Tensor, n_head: int,
class TorsionAngleHead (line 188) | class TorsionAngleHead(modules.OFModule):
method __init__ (line 194) | def __init__(self, cfg: argparse.Namespace):
method forward (line 211) | def forward(
class StructureCycle (line 241) | class StructureCycle(modules.OFModule):
method __init__ (line 248) | def __init__(self, cfg: argparse.Namespace) -> None:
method forward (line 260) | def forward(
class StructureModule (line 296) | class StructureModule(modules.OFModule):
method __init__ (line 299) | def __init__(self, cfg: argparse.Namespace):
method forward (line 310) | def forward(
FILE: opencompass/datasets/SciReasoner/unconditional_protein_generation/omegafold/embedders.py
function _get_pos (line 38) | def _get_pos(shape: torch.Size, device: torch.device, dtype: torch.dtype,
function _apply_embed (line 62) | def _apply_embed(inputs: torch.Tensor, sin: torch.Tensor, cos: torch.Ten...
class EdgeEmbedder (line 106) | class EdgeEmbedder(modules.OFModule):
method __init__ (line 112) | def __init__(self, cfg: argparse.Namespace) -> None:
method forward (line 119) | def forward(self, fasta_sequence: torch.Tensor,
class RoPE (line 128) | class RoPE(nn.Module):
method __init__ (line 136) | def __init__(self, input_dim: int) -> None:
method forward (line 151) | def forward(self, tensor: torch.Tensor,
method _compute_sin_cos (line 170) | def _compute_sin_cos(
class RelPosEmbedder (line 190) | class RelPosEmbedder(nn.Embedding):
method forward (line 197) | def forward(self, num_res: int) -> torch.Tensor:
class StructEmbedder (line 213) | class StructEmbedder(modules.OFModule):
method __init__ (line 219) | def __init__(self, cfg: argparse.Namespace):
method forward (line 245) | def forward(
method _sharded_compute (line 272) | def _sharded_compute(self, pairwise_fasta: torch.Tensor, d: torch.Tensor,
class PairStructEmbedder (line 299) | class PairStructEmbedder(StructEmbedder):
method forward (line 301) | def forward(
class RecycleEmbedder (line 317) | class RecycleEmbedder(modules.OFModule):
method __init__ (line 323) | def __init__(self, cfg: argparse.Namespace):
method forward (line 336) | def forward(
FILE: opencompass/datasets/SciReasoner/unconditional_protein_generation/omegafold/geoformer.py
class GeoFormerBlock (line 41) | class GeoFormerBlock(modules.OFModule):
method __init__ (line 47) | def __init__(self, cfg: argparse.Namespace) -> None:
method forward (line 79) | def forward(
method _column_attention (line 114) | def _column_attention(self, node_repr, mask, fwd_cfg):
class GeoFormer (line 126) | class GeoFormer(modules.OFModule):
method __init__ (line 128) | def __init__(self, cfg: argparse.Namespace):
method forward (line 134) | def forward(
FILE: opencompass/datasets/SciReasoner/unconditional_protein_generation/omegafold/model.py
class OmegaFoldCycle (line 43) | class OmegaFoldCycle(modules.OFModule):
method __init__ (line 45) | def __init__(self, cfg: argparse.Namespace) -> None:
method forward (line 52) | def forward(
class OmegaFold (line 107) | class OmegaFold(modules.OFModule):
method __init__ (line 115) | def __init__(self, cfg: argparse.Namespace) -> None:
method forward (line 124) | def forward(
method deep_sequence_embed (line 185) | def deep_sequence_embed(
method create_initial_prev_dict (line 212) | def create_initial_prev_dict(
FILE: opencompass/datasets/SciReasoner/unconditional_protein_generation/omegafold/modules.py
function softmax (line 38) | def softmax(x: torch.Tensor,
function _attention (line 66) | def _attention(
function attention (line 96) | def attention(
class OFModule (line 161) | class OFModule(nn.Module):
method __init__ (line 167) | def __init__(self, cfg: typing.Optional[argparse.Namespace]) -> None:
method device (line 172) | def device(self) -> torch.device:
method dtype (line 176) | def dtype(self) -> torch.dtype:
class Transition (line 180) | class Transition(OFModule):
method __init__ (line 182) | def __init__(self, d: int, n: int, activation: str) -> None:
method forward (line 192) | def forward(self, x: torch.Tensor,
class MultiHeadedScaling (line 204) | class MultiHeadedScaling(OFModule):
method __init__ (line 210) | def __init__(
method forward (line 241) | def forward(self, x: torch.Tensor) -> typing.List[torch.Tensor]:
method reset_parameters (line 262) | def reset_parameters(self):
class Val2ContBins (line 267) | class Val2ContBins(OFModule):
method __init__ (line 269) | def __init__(
method forward (line 285) | def forward(self, dist_x): # (*)
class Val2Bins (line 295) | class Val2Bins(OFModule):
method __init__ (line 303) | def __init__(self, cfg: argparse.Namespace) -> None:
method forward (line 310) | def forward(self, dist: torch.Tensor) -> torch.Tensor:
class Node2Edge (line 326) | class Node2Edge(OFModule):
method __init__ (line 332) | def __init__(self, in_dim: int, proj_dim: int, out_dim: int) -> None:
method forward (line 340) | def forward(self, node_repr: torch.Tensor,
class Attention (line 357) | class Attention(OFModule):
method __init__ (line 371) | def __init__(self, q_dim: int, kv_dim: int, n_head: int, gating: bool,
method forward (line 391) | def forward(
method _get_attn_out (line 435) | def _get_attn_out(self, q_inputs, kv_inputs, fwd_cfg, bias):
class AttentionWEdgeBias (line 463) | class AttentionWEdgeBias(OFModule):
method __init__ (line 465) | def __init__(self, d_node: int, d_edge: int, n_head: int,
method forward (line 480) | def forward(
function _get_sharded_stacked (line 512) | def _get_sharded_stacked(edge_repr: torch.Tensor, subbatch_size: int):
class GeometricAttention (line 525) | class GeometricAttention(OFModule):
method __init__ (line 530) | def __init__(self, d_edge: int, c: int, n_head: int, n_axis: int) -> N...
method _get_attended (line 554) | def _get_attended(self, edge_repr: torch.Tensor, mask: torch.Tensor,
method _get_gated (line 576) | def _get_gated(self, edge_repr: torch.Tensor, mask: torch.Tensor, fwd_...
method _get_sliced_weight (line 601) | def _get_sliced_weight(self, weight: torch.Tensor, shift=0):
method _get_act_row (line 607) | def _get_act_row(self, edge_row: torch.Tensor,
method _get_act_col (line 615) | def _get_act_col(self, edge_row: torch.Tensor,
method forward (line 623) | def forward(self, edge_repr: torch.Tensor, mask: torch.Tensor,
FILE: opencompass/datasets/SciReasoner/unconditional_protein_generation/omegafold/omegaplm.py
function _get_qk_scaling (line 38) | def _get_qk_scaling(num_res: torch.Tensor, attn_dim: int) -> torch.Tensor:
class GatedAttentionUnit (line 55) | class GatedAttentionUnit(modules.OFModule):
method __init__ (line 60) | def __init__(self, cfg: argparse.Namespace):
method forward (line 73) | def forward(
class OmegaPLMLayer (line 113) | class OmegaPLMLayer(modules.OFModule):
method __init__ (line 123) | def __init__(self, cfg: argparse.Namespace) -> None:
method forward (line 127) | def forward(
class OmegaPLM (line 151) | class OmegaPLM(modules.OFModule):
method __init__ (line 163) | def __init__(self, cfg: argparse.Namespace) -> None:
method forward (line 172) | def forward(
method _get_finetuning_scale (line 208) | def _get_finetuning_scale(self, mask: torch.Tensor,
FILE: opencompass/datasets/SciReasoner/unconditional_protein_generation/omegafold/pipeline.py
function _mps_is_available (line 49) | def _mps_is_available():
function _set_precision (line 59) | def _set_precision(allow_tf32: bool) -> None:
function path_leaf (line 78) | def path_leaf(path: str) -> str:
function fasta2inputs (line 93) | def fasta2inputs(
function list2inputs (line 181) | def list2inputs(
function save_pdb (line 247) | def save_pdb(pos14: torch.Tensor,
function _load_weights (line 305) | def _load_weights(
function _get_device (line 334) | def _get_device(device) -> str:
function get_args (line 367) | def get_args() -> typing.Tuple[types.SimpleNamespace, collections.Ordere...
FILE: opencompass/datasets/SciReasoner/unconditional_protein_generation/omegafold/utils/protein_utils/aaframe.py
class AAFrame (line 52) | class AAFrame(object):
method __init__ (line 57) | def __init__(self,
method unit (line 91) | def unit(self) -> str:
method _assign (line 101) | def _assign(self, translation: torch.Tensor, rotation: torch.Tensor,
method to_nanometers (line 132) | def to_nanometers(self, in_place: bool = True) -> 'AAFrame':
method to_angstrom (line 156) | def to_angstrom(self, in_place: bool) -> 'AAFrame':
method translation (line 181) | def translation(self) -> torch.Tensor:
method translation (line 191) | def translation(self, value: torch.Tensor) -> None:
method rotation (line 203) | def rotation(self) -> torch.Tensor:
method rotation (line 213) | def rotation(self, value: torch.Tensor) -> None:
method mask (line 229) | def mask(self) -> torch.Tensor:
method mask (line 239) | def mask(self, value: torch.Tensor):
method default_init (line 243) | def default_init(
method _neg_dim (line 282) | def _neg_dim(cls, dim: int) -> Tuple[int, int, int]:
method unsqueeze (line 288) | def unsqueeze(self, dim: int) -> 'AAFrame':
method sum (line 300) | def sum(self, dim: int, keepdim: bool = False) -> 'AAFrame':
method dim_apply (line 322) | def dim_apply(self, func: callable, dim: int) -> 'AAFrame':
method _construct_frame (line 346) | def _construct_frame(
method from_4x4 (line 380) | def from_4x4(cls, m: torch.Tensor, mask: torch.Tensor,
method transform (line 403) | def transform(self, pos: torch.Tensor) -> torch.Tensor:
method from_torsion (line 470) | def from_torsion(
method __getitem__ (line 517) | def __getitem__(self, idx: Union[slice, int, torch.Tensor]) -> 'AAFrame':
method __setitem__ (line 545) | def __setitem__(self, key: Union[int, torch.Tensor, List[int]],
method device (line 573) | def device(self) -> torch.device:
method shape (line 584) | def shape(self) -> torch.Size:
method __mul__ (line 592) | def __mul__(self, other) -> 'AAFrame':
method _tensor_multiplication (line 598) | def _tensor_multiplication(self, other: torch.Tensor) -> 'AAFrame':
method _combine_transformation (line 624) | def _combine_transformation(self, other: 'AAFrame') -> 'AAFrame':
method __repr__ (line 669) | def __repr__(self) -> str:
method view (line 672) | def view(self, *args) -> 'AAFrame':
method dtype (line 693) | def dtype(self):
method expand_w_torsion (line 696) | def expand_w_torsion(self, torsion_angles: torch.Tensor,
method rotate (line 778) | def rotate(self, rotation: torch.Tensor):
method expanded_to_pos (line 805) | def expanded_to_pos(
method __len__ (line 854) | def __len__(self):
method inverse (line 858) | def inverse(self) -> 'AAFrame':
method position_in_frame (line 874) | def position_in_frame(self, pos: torch.Tensor) -> torch.Tensor:
method from_tensor (line 888) | def from_tensor(cls, tensor, unit: str) -> 'AAFrame':
function torsion_mask_to_atom14_mask (line 907) | def torsion_mask_to_atom14_mask(torsion_mask: torch.Tensor,
FILE: opencompass/datasets/SciReasoner/unconditional_protein_generation/omegafold/utils/protein_utils/functions.py
function get_norm (line 33) | def get_norm(x: torch.Tensor) -> torch.Tensor:
function robust_normalize (line 46) | def robust_normalize(x: torch.Tensor,
function quaternion_to_matrix (line 64) | def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor:
function batch_matrix_vector (line 99) | def batch_matrix_vector(matrix: torch.Tensor,
function create_pseudo_beta (line 117) | def create_pseudo_beta(atom_pos: torch.Tensor,
function bit_wise_not (line 139) | def bit_wise_not(boolean_tensor: torch.Tensor) -> torch.Tensor:
FILE: opencompass/datasets/SciReasoner/unconditional_protein_generation/omegafold/utils/protein_utils/residue_constants.py
function residx_to_3 (line 437) | def residx_to_3(idx):
function get_chi_angle_atom_indices (line 448) | def get_chi_angle_atom_indices():
function _make_rigid_transformation_4x4 (line 476) | def _make_rigid_transformation_4x4(ex: torch.Tensor, ey: torch.Tensor,
function _make_aa_constants (line 506) | def _make_aa_constants():
function substitute (line 674) | def substitute(res: str):
FILE: opencompass/datasets/SciReasoner/unconditional_protein_generation/omegafold/utils/torch_utils.py
function mask2bias (line 38) | def mask2bias(mask: torch.Tensor, *, inf: float = 1e9) -> torch.Tensor:
function normalize (line 52) | def normalize(inputs: torch.Tensor,
function masked_mean (line 83) | def masked_mean(values: torch.Tensor,
function recursive_to (line 106) | def recursive_to(obj: typing.Any, **kwargs) -> typing.Any:
FILE: opencompass/datasets/ScienceQA.py
class ScienceQADataset (line 9) | class ScienceQADataset(BaseDataset):
method load_single (line 12) | def load_single(path):
method load (line 30) | def load(path):
FILE: opencompass/datasets/SeedBench.py
class SeedBenchDataset (line 20) | class SeedBenchDataset(BaseDataset):
method load (line 23) | def load(data_files: str,
class F1Evaluator (line 51) | class F1Evaluator(BaseEvaluator):
method __init__ (line 59) | def __init__(self, seed: int = 0) -> None:
method _preprocess (line 63) | def _preprocess(self, predictions: List, references: List) -> dict:
method _postprocess (line 69) | def _postprocess(self, scores: dict) -> dict:
method score (line 72) | def score(self, predictions: List, references: List) -> dict:
class F1ScoreEvaluator (line 136) | class F1ScoreEvaluator(F1Evaluator):
method __init__ (line 139) | def __init__(self) -> None:
function my_multiple_select_postprocess (line 145) | def my_multiple_select_postprocess(text: str) -> str:
class AverageRougeEvaluator (line 152) | class AverageRougeEvaluator(BaseEvaluator):
method __init__ (line 160) | def __init__(self, seed: int = 0) -> None:
method _preprocess (line 164) | def _preprocess(self, predictions: List, references: List) -> dict:
method _postprocess (line 176) | def _postprocess(self, scores: dict) -> dict:
method score (line 179) | def score(self, predictions: List, references: List) -> dict:
class AverageRougeScoreEvaluator (line 244) | class AverageRougeScoreEvaluator(AverageRougeEvaluator):
method __init__ (line 247) | def __init__(self) -> None:
class AccScoreStrEvaluator (line 251) | class AccScoreStrEvaluator(BaseEvaluator):
method __init__ (line 259) | def __init__(self, seed: int = 0) -> None:
method _preprocess (line 263) | def _preprocess(self, predictions: List, references: List) -> dict:
method _postprocess (line 269) | def _postprocess(self, scores: dict) -> dict:
method score (line 272) | def score(self, predictions: List, references: List) -> dict:
class AccScoreStr_Evaluator (line 305) | class AccScoreStr_Evaluator(AccScoreStrEvaluator):
method __init__ (line 308) | def __init__(self) -> None:
FILE: opencompass/datasets/TheoremQA/legacy.py
class TheoremQADataset (line 12) | class TheoremQADataset(BaseDataset):
method load (line 15) | def load(path: str):
function TheoremQA_postprocess (line 21) | def TheoremQA_postprocess(text: str) -> str:
function TheoremQA_postprocess_v2 (line 31) | def TheoremQA_postprocess_v2(text: str) -> str:
FILE: opencompass/datasets/TheoremQA/main.py
class TheoremQADatasetV3 (line 16) | class TheoremQADatasetV3(BaseDataset):
method load (line 19) | def load(path: str):
function TheoremQA_postprocess_v3 (line 29) | def TheoremQA_postprocess_v3(text: str) -> str:
function TheoremQA_postprocess_v4 (line 33) | def TheoremQA_postprocess_v4(text: str) -> str:
class TheoremQAEvaluatorV3 (line 42) | class TheoremQAEvaluatorV3(BaseEvaluator):
method score (line 43) | def score(self, predictions, references, test_set):
FILE: opencompass/datasets/TheoremQA/number_utils.py
function floatify (line 7) | def floatify(num: str):
function within_eps (line 18) | def within_eps(pred: float, gt: float):
function clean_units (line 26) | def clean_units(pred_str: str):
function number_it (line 50) | def number_it(num):
function compare_two_numbers (line 76) | def compare_two_numbers(p, gt):
function compare_two_list (line 88) | def compare_two_list(pred, gt):
FILE: opencompass/datasets/TheoremQA/utils.py
function time_limit (line 7) | def time_limit(seconds: float):
function extract_theoremqa_answer (line 19) | def extract_theoremqa_answer(pred: str, answer_flag: bool = True):
function answer_clean (line 56) | def answer_clean(direct_answer_trigger_for_fewshot: tuple, pred: str):
function compare_answer_with_groundtruth (line 96) | def compare_answer_with_groundtruth(answer: str, groundtruth_str: str, g...
FILE: opencompass/datasets/advglue.py
class AdvDataset (line 12) | class AdvDataset(BaseDataset):
method __init__ (line 22) | def __init__(
method aug_with_original_data (line 34) | def aug_with_original_data(self, dataset):
method load (line 52) | def load(self, path):
class AdvSst2Dataset (line 80) | class AdvSst2Dataset(AdvDataset):
method __init__ (line 83) | def __init__(self, **kwargs):
class AdvQqpDataset (line 91) | class AdvQqpDataset(AdvDataset):
method __init__ (line 94) | def __init__(self, **kwargs):
class AdvMnliDataset (line 104) | class AdvMnliDataset(AdvDataset):
method __init__ (line 107) | def __init__(self, **kwargs):
class AdvMnliMMDataset (line 117) | class AdvMnliMMDataset(AdvDataset):
method __init__ (line 120) | def __init__(self, **kwargs):
class AdvQnliDataset (line 129) | class AdvQnliDataset(AdvDataset):
method __init__ (line 132) | def __init__(self, **kwargs):
class AdvRteDataset (line 141) | class AdvRteDataset(AdvDataset):
method __init__ (line 144) | def __init__(self, **kwargs):
class AccDropEvaluator (line 151) | class AccDropEvaluator(AccEvaluator):
method __init__ (line 154) | def __init__(self) -> None:
method score (line 157) | def score(self, predictions: List, references: List) -> dict:
FILE: opencompass/datasets/afqmcd.py
class AFQMCDatasetV2 (line 13) | class AFQMCDatasetV2(BaseDataset):
method load (line 16) | def load(path, local_mode=False):
FILE: opencompass/datasets/agieval/agieval.py
class AGIEvalDataset (line 17) | class AGIEvalDataset(BaseDataset):
method load (line 20) | def load(path: str, name: str, setting_name: str):
class AGIEvalDataset_v2 (line 39) | class AGIEvalDataset_v2(BaseDataset):
method load (line 42) | def load(path: str, name: str, setting_name: str):
class AGIEvalEvaluator (line 89) | class AGIEvalEvaluator(BaseEvaluator):
method score (line 91) | def score(self, predictions, references):
class AGIEvalEvaluator_mcq (line 106) | class AGIEvalEvaluator_mcq(BaseEvaluator):
method score (line 108) | def score(self, predictions, references):
FILE: opencompass/datasets/agieval/constructions.py
class TaskSchema (line 5) | class TaskSchema(object):
method __init__ (line 7) | def __init__(self,
method to_dict (line 21) | def to_dict(self):
class AgiInstance (line 33) | class AgiInstance(object):
method __init__ (line 35) | def __init__(self, task_description, data_source, task_schema, output,
method to_dict (line 44) | def to_dict(self):
class ChatGPTSchema (line 55) | class ChatGPTSchema(object):
method __init__ (line 57) | def __init__(self, context=None, metadata=''):
method to_dict (line 61) | def to_dict(self):
class ResultsForHumanSchema (line 65) | class ResultsForHumanSchema(object):
method __init__ (line 67) | def __init__(self,
method to_dict (line 87) | def to_dict(self):
method to_tsv (line 101) | def to_tsv(result_list, path):
FILE: opencompass/datasets/agieval/dataset_loader.py
function convert_zero_shot (line 31) | def convert_zero_shot(line, dataset_name):
function convert_zero_shot_CoT_stage1 (line 66) | def convert_zero_shot_CoT_stage1(line, dataset_name):
function combine_prompt (line 95) | def combine_prompt(prompt_path,
function _lazy_load_enc (line 170) | def _lazy_load_enc():
function concat_prompt (line 177) | def concat_prompt(demos,
function concat_prompt_chat_mode (line 210) | def concat_prompt_chat_mode(demos,
function convert_few_shot (line 242) | def convert_few_shot(line, dataset_name, demo, n_shot, chat_mode=False):
function load_dataset (line 275) | def load_dataset(dataset_name,
function generate_second_stage_input (line 330) | def generate_second_stage_input(dataset_name,
function load_dataset_as_result_schema (line 366) | def load_dataset_as_result_schema(dataset_name, parent_path):
FILE: opencompass/datasets/agieval/evaluation.py
function convert_to_set (line 6) | def convert_to_set(item):
function evaluate_single_sample (line 16) | def evaluate_single_sample(dataset_name, prediction, label):
FILE: opencompass/datasets/agieval/math_equivalence.py
function _fix_fracs (line 5) | def _fix_fracs(string):
function _fix_a_slash_b (line 37) | def _fix_a_slash_b(string):
function _remove_right_units (line 52) | def _remove_right_units(string):
function _fix_sqrt (line 62) | def _fix_sqrt(string):
function _strip_string (line 77) | def _strip_string(string):
function is_equiv (line 147) | def is_equiv(str1, str2, verbose=False):
FILE: opencompass/datasets/agieval/post_process.py
function extract_last_line (line 8) | def extract_last_line(string):
function remove_few_shot_prefix (line 17) | def remove_few_shot_prefix(string: str):
function try_parse_few_shot_qa_single_answer (line 29) | def try_parse_few_shot_qa_single_answer(string, setting_name, language='...
function try_parse_few_shot_pattern (line 46) | def try_parse_few_shot_pattern(string: str, dataset_name, setting_name):
function parse_few_shot_qa_single_answer (line 64) | def parse_few_shot_qa_single_answer(string, setting_name, language='en'):
function find_first_capital_letter (line 73) | def find_first_capital_letter(answer):
function extract_answer_in_bracket (line 82) | def extract_answer_in_bracket(answer, prefix='【', suffix='】'):
function parse_math_answer (line 92) | def parse_math_answer(setting_name, raw_string):
function parse_qa_multiple_answer (line 170) | def parse_qa_multiple_answer(string, setting_name):
function post_process (line 180) | def post_process(dataset_name, setting_name, prediction):
FILE: opencompass/datasets/agieval/utils.py
function read_jsonl (line 5) | def read_jsonl(path):
function save_jsonl (line 21) | def save_jsonl(lines, directory):
function extract_answer (line 27) | def extract_answer(js):
FILE: opencompass/datasets/aime2024.py
class Aime2024Dataset (line 12) | class Aime2024Dataset(BaseDataset):
method load (line 15) | def load(path, **kwargs):
FILE: opencompass/datasets/anli.py
class AnliDataset (line 8) | class AnliDataset(BaseDataset):
method load (line 11) | def load(path: str):
FILE: opencompass/datasets/anthropics_evals.py
class AiRiskDataset (line 6) | class AiRiskDataset(BaseDataset):
method load (line 9) | def load(path: str):
class PersonaDataset (line 26) | class PersonaDataset(BaseDataset):
method load (line 29) | def load(path: str):
class SycophancyDataset (line 46) | class SycophancyDataset(BaseDataset):
method load (line 49) | def load(path: str):
FILE: opencompass/datasets/apps.py
class APPSDataset (line 37) | class APPSDataset(BaseDataset):
method load (line 40) | def load(path: str, num_repeats: int = 1):
class APPS_miniDataset (line 93) | class APPS_miniDataset(BaseDataset):
method load (line 96) | def load(path: str, num_repeats: int = 1):
class APPSEvaluator (line 152) | class APPSEvaluator(BaseEvaluator):
method post_process (line 154) | def post_process(self, text):
method check_correctness (line 167) | def check_correctness(self, sample, generation, timeout, debug=True):
method evaluate_generations (line 193) | def evaluate_generations(self,
method estimate_pass_at_k (line 238) | def estimate_pass_at_k(self, num_samples, num_correct, k):
method compute_metrics (line 258) | def compute_metrics(self, results, k_list=[1, 10, 100]):
method score (line 289) | def score(self, predictions, references, test_set):
class CODE_TYPE (line 303) | class CODE_TYPE(Enum):
class TimeoutException (line 309) | class TimeoutException(Exception):
function timeout_handler (line 313) | def timeout_handler(signum, frame):
class Capturing (line 329) | class Capturing(list):
method __enter__ (line 331) | def __enter__(self):
method __exit__ (line 338) | def __exit__(self, *args):
function run_test (line 344) | def run_test(sample, test=None, debug=False):
function custom_compare_ (line 751) | def custom_compare_(output, ground_truth):
function stripped_string_compare (line 767) | def stripped_string_compare(s1, s2):
function call_method (line 773) | def call_method(method, inputs):
function reliability_guard (line 800) | def reliability_guard(maximum_memory_bytes=None):
FILE: opencompass/datasets/arc.py
class ARCDataset (line 14) | class ARCDataset(BaseDataset):
method load (line 17) | def load(path: str, name: str):
class ARCDatasetClean (line 64) | class ARCDatasetClean(BaseDataset):
method load_contamination_annotations (line 69) | def load_contamination_annotations(path, split='val'):
method load (line 95) | def load(path: str, name: str):
FILE: opencompass/datasets/arc_prize_public_evaluation.py
class ARCPrizeDataset (line 17) | class ARCPrizeDataset(BaseDataset):
method load (line 156) | def load(path: str, version: str):
class ARCPrizeEvaluator (line 178) | class ARCPrizeEvaluator(BaseEvaluator):
method score (line 180) | def score(self, predictions: List[str],
function extract_solution (line 197) | def extract_solution(text):
function pad_array_with_value (line 219) | def pad_array_with_value(array, target_shape, pad_value):
function compare_solutions_with_padding (line 226) | def compare_solutions_with_padding(generated_output: List[int],
FILE: opencompass/datasets/atlas/dataset_loader.py
class ATLASDataset (line 8) | class ATLASDataset(BaseDataset):
method load (line 11) | def load(split: str = 'val'):
FILE: opencompass/datasets/atlas/evaluation.py
function fix_json_slash (line 23) | def fix_json_slash(s: str) -> str:
function atlas_pred_postprocess (line 27) | def atlas_pred_postprocess(
function get_final_results (line 58) | def get_final_results(parsed_judges: List[List[Dict]],
function process_judge_output (line 88) | def process_judge_output(
function atlas_judge_postprocess (line 164) | def atlas_judge_postprocess(
class ATLASLLMEvaluator (line 188) | class ATLASLLMEvaluator(BaseEvaluator):
method __init__ (line 201) | def __init__(
method build_inferencer (line 228) | def build_inferencer(self):
method score (line 256) | def score(
method output_postprocess (line 362) | def output_postprocess(self, output: Dict, dataset=None) -> Dict:
method default_judge_cfg (line 382) | def default_judge_cfg(self):
FILE: opencompass/datasets/ax.py
class AXDatasetV2 (line 13) | class AXDatasetV2(BaseDataset):
method load (line 16) | def load(path: str):
FILE: opencompass/datasets/babilong/babilong.py
class BabiLongDataset (line 18) | class BabiLongDataset(BaseDataset):
method load (line 21) | def load(
class BabiLongEvaluator (line 97) | class BabiLongEvaluator(BaseEvaluator):
method score (line 99) | def score(self, predictions, gold):
FILE: opencompass/datasets/babilong/babilong_utils.py
function compare_answers (line 11) | def compare_answers(target, output):
function get_dataset_df (line 33) | def get_dataset_df(dataset_path, max_n_facts=None):
class TaskDataset (line 88) | class TaskDataset(Dataset):
method __init__ (line 91) | def __init__(self, dataset_path, max_n_facts=None):
method __getitem__ (line 95) | def __getitem__(self, ind):
method __len__ (line 107) | def __len__(self):
function sum_lengths (line 111) | def sum_lengths(sentences):
class SentenceSampler (line 115) | class SentenceSampler:
method __init__ (line 118) | def __init__(
method get_sample (line 137) | def get_sample(self, sample_size):
method sample_sentences_ (line 163) | def sample_sentences_(self, sample_size):
method next_sample_ (line 179) | def next_sample_(self):
method length_is_ok (line 190) | def length_is_ok(self, tokenized):
class NoiseInjectionDataset (line 200) | class NoiseInjectionDataset(Dataset):
method __init__ (line 206) | def __init__(
method __getitem__ (line 227) | def __getitem__(self, ind):
method __len__ (line 284) | def __len__(self):
method get_sample_size (line 287) | def get_sample_size(self):
FILE: opencompass/datasets/babilong/prompts.py
function get_formatted_input (line 17) | def get_formatted_input(
FILE: opencompass/datasets/base.py
class BaseDataset (line 11) | class BaseDataset:
method __init__ (line 13) | def __init__(self,
method _init_reader (line 47) | def _init_reader(self, **kwargs):
method train (line 51) | def train(self):
method test (line 55) | def test(self):
method load (line 59) | def load(**kwargs) -> Union[Dataset, DatasetDict]:
FILE: opencompass/datasets/bbeh.py
class BBEHDataset (line 17) | class BBEHDataset(BaseDataset):
method load (line 20) | def load(path: str, name: str):
function bbeh_freeform_postprocess (line 33) | def bbeh_freeform_postprocess(text: str) -> str:
function bbeh_mcq_postprocess (line 61) | def bbeh_mcq_postprocess(text: str) -> str:
class BBEHEvaluator (line 84) | class BBEHEvaluator(BaseEvaluator):
method score (line 86) | def score(self, predictions, references):
class BBEHEvaluator_mcq (line 123) | class BBEHEvaluator_mcq(BaseEvaluator):
method score (line 125) | def score(self, predictions, references):
FILE: opencompass/datasets/bbh.py
class BBHDataset (line 17) | class BBHDataset(BaseDataset):
method load (line 20) | def load(path: str, name: str):
function bbh_mcq_postprocess (line 33) | def bbh_mcq_postprocess(text: str) -> str:
function bbh_freeform_postprocess (line 48) | def bbh_freeform_postprocess(text: str) -> str:
class BBHEvaluator (line 66) | class BBHEvaluator(BaseEvaluator):
method score (line 68) | def score(self, predictions, references):
class BBHEvaluator_mcq (line 92) | class BBHEvaluator_mcq(BaseEvaluator):
method score (line 94) | def score(self, predictions, references):
FILE: opencompass/datasets/benbench.py
class BenBenchDataset (line 15) | class BenBenchDataset(BaseDataset):
method load (line 18) | def load(path: str, tokenizer_path: str, tokenizer_kwargs: Optional[Di...
function exact_match_score (line 46) | def exact_match_score(predicted_text, original_text):
function edit_similarity_score (line 49) | def edit_similarity_score(predicted_text, original_text):
function rouge_l_score (line 58) | def rouge_l_score(predicted_text, original_text):
class BenbenEvaluator (line 67) | class BenbenEvaluator(BaseEvaluator):
method score (line 69) | def score(self, predictions, references):
FILE: opencompass/datasets/beyondaime.py
class BeyondAIMEDataset (line 9) | class BeyondAIMEDataset(BaseDataset):
method load (line 12) | def load(path, **kwargs):
FILE: opencompass/datasets/bigcodebench/bigcodebench.py
class BigCodeBenchDataset (line 21) | class BigCodeBenchDataset(BaseDataset):
method load (line 24) | def load(path: str = 'opencompass/bigcodebench',
class BigCodeBenchEvaluator (line 60) | class BigCodeBenchEvaluator(BaseEvaluator):
method __init__ (line 70) | def __init__(
method score (line 106) | def score(self, predictions, references):
method _results_processor (line 213) | def _results_processor(self, results):
FILE: opencompass/datasets/bigcodebench/extractor.py
function syntax_check (line 21) | def syntax_check(code, verbose=False):
function code_extract (line 31) | def code_extract(text: str) -> str:
function get_deps (line 49) | def get_deps(nodes: List[Tuple[str, Node]]) -> Dict[str, Set[str]]:
function get_function_dependency (line 66) | def get_function_dependency(entrypoint: str,
function get_definition_name (line 81) | def get_definition_name(node: Node) -> str:
function traverse_tree (line 87) | def traverse_tree(node: Node) -> Generator[Node, None, None]:
function has_return_statement (line 106) | def has_return_statement(node: Node) -> bool:
function extract_target_code_or_empty (line 114) | def extract_target_code_or_empty(code: str,
function extract_code_generation (line 182) | def extract_code_generation(model_output: str,
FILE: opencompass/datasets/biodata.py
class BiodataDataset (line 23) | class BiodataDataset(BaseDataset):
method load (line 26) | def load(path: str, name: str):
function extract_boxed_text (line 49) | def extract_boxed_text(text):
class BiodataClsEvaluator (line 71) | class BiodataClsEvaluator(BaseEvaluator):
method __init__ (line 74) | def __init__(self) -> None:
method score (line 77) | def score(self, predictions, references):
function extract_number (line 114) | def extract_number(text):
class BiodataRMSEEvaluator (line 124) | class BiodataRMSEEvaluator(BaseEvaluator):
method __init__ (line 127) | def __init__(self) -> None:
method score (line 130) | def score(self, predictions, references):
function extract_dict_text (line 155) | def extract_dict_text(text):
class BiodataDictEvaluator (line 166) | class BiodataDictEvaluator(BaseEvaluator):
method __init__ (line 169) | def __init__(self) -> None:
method score (line 172) | def score(self, predictions, references):
class BiodataStringEvaluator (line 220) | class BiodataStringEvaluator(BaseEvaluator):
method __init__ (line 223) | def __init__(self) -> None:
method score (line 226) | def score(self, predictions, references):
function dedup_ec_codes (line 266) | def dedup_ec_codes(ec_numer_list):
function count_f1_max (line 313) | def count_f1_max(pred, target):
class BiodataECNumberEvaluator (line 371) | class BiodataECNumberEvaluator(BaseEvaluator):
method __init__ (line 374) | def __init__(self) -> None:
method ec_to_multihot (line 483) | def ec_to_multihot(self, ec_list, ec_labels):
method score (line 494) | def score(self, predictions, references):
class BiodataTaskDataset (line 540) | class BiodataTaskDataset(BaseDataset):
method load (line 543) | def load(path: str, task: str):
function pearson_correlation_coefficient (line 597) | def pearson_correlation_coefficient(y_true, y_pred):
function spearman_correlation_coefficient (line 639) | def spearman_correlation_coefficient(y_true, y_pred):
function r_squared (line 681) | def r_squared(y_true, y_pred):
function multiple_label_auc (line 724) | def multiple_label_auc(y_true, y_pred):
function mixed_score (line 752) | def mixed_score(y_true, y_pred, low_range=(30, 1e3)):
class BiodataMCCEvaluator (line 854) | class BiodataMCCEvaluator(BaseEvaluator):
method __init__ (line 857) | def __init__(self) -> None:
method score (line 860) | def score(self, predictions, references):
class BiodataPCCEvaluator (line 891) | class BiodataPCCEvaluator(BaseEvaluator):
method __init__ (line 894) | def __init__(self) -> None:
method score (line 897) | def score(self, predictions, references):
class BiodataSpearmanEvaluator (line 944) | class BiodataSpearmanEvaluator(BaseEvaluator):
method __init__ (line 947) | def __init__(self) -> None:
method score (line 950) | def score(self, predictions, references):
class BiodataMixedScoreEvaluator (line 977) | class BiodataMixedScoreEvaluator(BaseEvaluator):
method __init__ (line 980) | def __init__(self) -> None:
method score (line 983) | def score(self, predictions, references):
class BiodataR2Evaluator (line 1011) | class BiodataR2Evaluator(BaseEvaluator):
method __init__ (line 1014) | def __init__(self) -> None:
method score (line 1017) | def score(self, predictions, references):
class BiodataAucEvaluator (line 1088) | class BiodataAucEvaluator(BaseEvaluator):
method __init__ (line 1092) | def __init__(self, predefined_labels=None) -> None:
method score (line 1098) | def score(self, predictions, references):
class BiodataAccEvaluator (line 1139) | class BiodataAccEvaluator(BaseEvaluator):
method __init__ (line 1142) | def __init__(self) -> None:
method score (line 1145) | def score(self, predictions, references):
FILE: opencompass/datasets/boolq.py
class BoolQDataset (line 12) | class BoolQDataset(BaseDataset):
method load (line 15) | def load(**kwargs):
class BoolQDatasetV2 (line 30) | class BoolQDatasetV2(BaseDataset):
method load (line 33) | def load(path):
class BoolQDatasetV3 (line 45) | class BoolQDatasetV3(BaseDataset):
method load (line 48) | def load(path):
FILE: opencompass/datasets/bustum.py
class bustumDataset_V2 (line 11) | class bustumDataset_V2(BaseDataset):
method load (line 14) | def load(path):
FILE: opencompass/datasets/c3.py
class C3Dataset (line 11) | class C3Dataset(BaseDataset):
method load (line 14) | def load(path: str):
class C3Dataset_V2 (line 57) | class C3Dataset_V2(BaseDataset):
method load (line 60) | def load(path: str):
FILE: opencompass/datasets/calm/calm.py
class CaLMDataset (line 16) | class CaLMDataset(BaseDataset):
method load (line 19) | def load(path: str, prompt_style: str) -> datasets.Dataset:
class CaLMEvaluator (line 26) | class CaLMEvaluator(BaseEvaluator):
method __init__ (line 28) | def __init__(self, core_metrics, error_analysis, prompt_style,
method score (line 36) | def score(
FILE: opencompass/datasets/calm/data_processing/generate_questions.py
function get_get_prompt_func (line 8) | def get_get_prompt_func(task):
function generate_question_list (line 152) | def generate_question_list(dataset_path, prompt_style):
FILE: opencompass/datasets/calm/data_processing/prompt/AC-B_causal_judgement.py
function get_prompt (line 144) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/AR-B_CaLM-AR.py
function get_prompt (line 148) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/ATE.py
function get_prompt (line 176) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/BAS-B_backadj.py
function get_prompt (line 132) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/BAS-C_max-BAS.py
function get_prompt (line 317) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/BAS-C_min-BAS.py
function get_prompt (line 348) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/BAS-C_mix-BAS.py
function get_prompt (line 352) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/CA-B_FA.py
function get_prompt (line 167) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/CA-B_FP.py
function get_prompt (line 167) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/CB-B_collider-bias.py
function get_prompt (line 150) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/CDE.py
function get_prompt (line 176) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/CEG-O_E-CARE.py
function get_prompt (line 203) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/CEI-B.py
function get_prompt (line 175) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/CORR-B_correlation.py
function get_prompt (line 130) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/CR-B_det-counterfactual.py
function get_prompt (line 131) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/CR-C_CRASS.py
function get_prompt (line 322) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/EAE-B_exp-away.py
function get_prompt (line 122) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/ECI-B_CTB.py
function get_prompt (line 166) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/ECI-B_ESC.py
function get_prompt (line 166) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/ECI-B_MAVEN-ERE.py
function get_prompt (line 166) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/ETT.py
function get_prompt (line 176) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/FAS-C_FAS.py
function get_prompt (line 369) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/IV-C_CaLM-IV.py
function get_prompt (line 321) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/NDE.py
function get_prompt (line 171) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/NIE.py
function get_prompt (line 170) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/PCD-B_COPA.py
function get_prompt (line 201) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/PCD-B_E-CARE.py
function get_prompt (line 201) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/PCD-C_COPA.py
function get_prompt (line 248) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/PCD-C_E-CARE.py
function get_prompt (line 246) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/PN.py
function get_prompt (line 166) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/data_processing/prompt/PS.py
function get_prompt (line 165) | def get_prompt(task_name, prompt_style, item, prompt_style_str=''):
FILE: opencompass/datasets/calm/evaluation/accuracy/choice.py
function compute_acc (line 1) | def compute_acc(gt_list, pred_list):
FILE: opencompass/datasets/calm/evaluation/accuracy/open-ended.py
function is_chinese (line 5) | def is_chinese(text):
function compute_acc (line 12) | def compute_acc(gt_list, pred_list):
FILE: opencompass/datasets/calm/evaluation/accuracy/prob.py
function compute_acc (line 1) | def compute_acc(gt_list, pred_list):
FILE: opencompass/datasets/calm/evaluation/core_metrics.py
function initialize_core_metric_evaluation_components (line 127) | def initialize_core_metric_evaluation_components(task):
function compute_core_metrics (line 285) | def compute_core_metrics(items, task, prompt_style, gt_items):
FILE: opencompass/datasets/calm/evaluation/error/basic_adversarial/AC-B_causal_judgement.py
function check_standalization (line 5) | def check_standalization(model_response, prompt_style, type):
function check_empty (line 13) | def check_empty(model_response):
function check_repetition (line 20) | def check_repetition(model_response):
function contains_chinese (line 31) | def contains_chinese(text):
function contains_english (line 37) | def contains_english(text):
function check_abnormality (line 44) | def check_abnormality(preds):
FILE: opencompass/datasets/calm/evaluation/error/basic_adversarial/AR-B_CaLM-AR.py
function check_standalization (line 4) | def check_standalization(model_response, prompt_style, type):
function check_empty (line 11) | def check_empty(model_response):
function check_repetition (line 18) | def check_repetition(model_response):
function contains_chinese (line 26) | def contains_chinese(text):
function contains_english (line 32) | def contains_english(text):
function check_abnormality (line 39) | def check_abnormality(preds):
FILE: opencompass/datasets/calm/evaluation/error/basic_adversarial/AS.py
function check_standalization (line 5) | def check_standalization(model_response, prompt_style, type):
function check_empty (line 13) | def check_empty(model_response):
function check_repetition (line 20) | def check_repetition(model_response):
function contains_chinese (line 31) | def contains_chinese(text):
function contains_english (line 38) | def contains_english(text):
function check_abnormality (line 45) | def check_abnormality(preds):
FILE: opencompass/datasets/calm/evaluation/error/basic_adversarial/CA-B.py
function check_standalization (line 4) | def check_standalization(model_response, prompt_style, type):
function check_empty (line 11) | def check_empty(model_response):
function check_repetition (line 18) | def check_repetition(model_response):
function contains_chinese (line 29) | def contains_chinese(text):
function contains_english (line 35) | def contains_english(text):
function check_abnormality (line 42) | def check_abnormality(preds):
FILE: opencompass/datasets/calm/evaluation/error/basic_adversarial/CEI-B.py
function check_standalization (line 4) | def check_standalization(model_response, prompt_style, type):
function check_empty (line 11) | def check_empty(model_response):
function check_repetition (line 18) | def check_repetition(model_response):
function contains_chinese (line 29) | def contains_chinese(text):
function contains_english (line 36) | def contains_english(text):
function check_abnormality (line 43) | def check_abnormality(preds):
FILE: opencompass/datasets/calm/evaluation/error/basic_adversarial/CLADDER.py
function check_standalization (line 4) | def check_standalization(model_response, prompt_style, type):
function check_empty (line 11) | def check_empty(model_response):
function check_repetition (line 18) | def check_repetition(model_response):
function contains_chinese (line 26) | def contains_chinese(text):
function contains_english (line 33) | def contains_english(text):
function check_abnormality (line 40) | def check_abnormality(preds):
FILE: opencompass/datasets/calm/evaluation/error/basic_adversarial/CR-C_CRASS.py
function check_standalization (line 4) | def check_standalization(model_response, prompt_style, type):
function check_empty (line 13) | def check_empty(model_response):
function check_repetition (line 20) | def check_repetition(model_response):
function contains_chinese (line 30) | def contains_chinese(text):
function contains_english (line 37) | def contains_english(text):
function check_abnormality (line 44) | def check_abnormality(preds):
FILE: opencompass/datasets/calm/evaluation/error/basic_adversarial/ECI.py
function check_standalization (line 4) | def check_standalization(model_response, prompt_style, type):
function check_empty (line 11) | def check_empty(model_response):
function check_repetition (line 18) | def check_repetition(model_response):
function contains_chinese (line 29) | def contains_chinese(text):
function contains_english (line 36) | def contains_english(text):
function check_abnormality (line 43) | def check_abnormality(preds):
FILE: opencompass/datasets/calm/evaluation/error/basic_adversarial/Natural.py
function check_standalization (line 5) | def check_standalization(model_response, prompt_style, type):
function check_empty (line 13) | def check_empty(model_response):
function check_repetition (line 20) | def check_repetition(model_response):
function contains_chinese (line 31) | def contains_chinese(text):
function contains_english (line 37) | def contains_english(text):
function check_abnormality (line 46) | def check_abnormality(preds):
FILE: opencompass/datasets/calm/evaluation/error/basic_adversarial/PCD-B.py
function check_standalization (line 4) | def check_standalization(model_response, prompt_style, type):
function check_empty (line 11) | def check_empty(model_response):
function check_repetition (line 18) | def check_repetition(model_response):
function contains_chinese (line 26) | def contains_chinese(text):
function contains_english (line 32) | def contains_english(text):
function check_abnormality (line 39) | def check_abnormality(preds):
FILE: opencompass/datasets/calm/evaluation/error/basic_adversarial/PCD-C.py
function check_standalization (line 4) | def check_standalization(model_response, prompt_style, type):
function check_empty (line 11) | def check_empty(model_response):
function check_repetition (line 18) | def check_repetition(model_response):
function contains_chinese (line 28) | def contains_chinese(text):
function contains_english (line 34) | def contains_english(text):
function check_abnormality (line 41) | def check_abnormality(preds):
FILE: opencompass/datasets/calm/evaluation/error/basic_adversarial/Probability.py
function check_standalization (line 5) | def check_standalization(model_response, prompt_style, type):
function check_empty (line 20) | def check_empty(model_response):
function check_repetition (line 27) | def check_repetition(model_response):
function contains_chinese (line 38) | def contains_chinese(text):
function contains_english (line 45) | def contains_english(text):
function check_abnormality (line 56) | def check_abnormality(preds):
FILE: opencompass/datasets/calm/evaluation/errors.py
function initialize_error_identification_components (line 11) | def initialize_error_identification_components(task, prompt_style):
function identify_model_errors (line 171) | def identify_model_errors(items, task, prompt_style, gt_items):
function get_item_error (line 221) | def get_item_error(model_response, task, error_module, prompt_style):
FILE: opencompass/datasets/calm/evaluation/labeling/AC-B_causal_judgement.py
function get_gt_label (line 6) | def get_gt_label(item):
function get_pred_label (line 14) | def get_pred_label(model_response, item, prompt_style, type):
FILE: opencompass/datasets/calm/evaluation/labeling/AR-B_CaLM-AR.py
function get_gt_label (line 6) | def get_gt_label(item):
function get_pred_label (line 10) | def get_pred_label(model_response, item, prompt_style, type):
FILE: opencompass/datasets/calm/evaluation/labeling/AS.py
function get_gt_label (line 7) | def get_gt_label(item):
function get_pred_label (line 11) | def get_pred_label(model_response, item, prompt_style, type):
FILE: opencompass/datasets/calm/evaluation/labeling/CA-B_FA.py
function get_gt_label (line 6) | def get_gt_label(item):
function get_pred_label (line 10) | def get_pred_label(model_response, item, prompt_style, type):
FILE: opencompass/datasets/calm/evaluation/labeling/CA-B_FP.py
function get_gt_label (line 6) | def get_gt_label(item):
function get_pred_label (line 10) | def get_pred_label(model_response, item, prompt_style, type):
FILE: opencompass/datasets/calm/evaluation/labeling/CEG-O_E-CARE.py
function get_gt_label (line 1) | def get_gt_label(item):
function get_pred_label (line 5) | def get_pred_label(model_response, item, prompt_style, type):
FILE: opencompass/datasets/calm/evaluation/labeling/CEI-B.py
function get_gt_label (line 6) | def get_gt_label(item):
function get_pred_label (line 10) | def get_pred_label(model_response, item, prompt_style, type):
FILE: opencompass/datasets/calm/evaluation/labeling/CLADDER.py
function get_gt_label (line 6) | def get_gt_label(item):
function get_pred_label (line 14) | def get_pred_label(model_response, item, prompt_style, type):
FILE: opencompass/datasets/calm/evaluation/labeling/CR-C_CRASS.py
function get_gt_label (line 8) | def get_gt_label(item):
function get_pred_label (line 12) | def get_pred_label(model_response, item, prompt_style, type):
FILE: opencompass/datasets/calm/evaluation/labeling/ECI.py
function get_gt_label (line 6) | def get_gt_label(item):
function get_pred_label (line 10) | def get_pred_label(model_response, item, prompt_style, type):
FILE: opencompass/datasets/calm/evaluation/labeling/Natural.py
function get_gt_label (line 9) | def get_gt_label(item):
function extract_answer (line 13) | def extract_answer(model_response, item, prompt_style, type):
function get_pred_label (line 57) | def get_pred_label(model_response, item, prompt_style, type):
FILE: opencompass/datasets/calm/evaluation/labeling/PCD-B.py
function get_gt_label (line 6) | def get_gt_label(item):
function get_pred_label (line 10) | def get_pred_label(model_response, item, prompt_style, type):
FILE: opencompass/datasets/calm/evaluation/labeling/PCD-C.py
function get_gt_label (line 6) | def get_gt_label(item):
function get_pred_label (line 10) | def get_pred_label(model_response, item, prompt_style, type):
FILE: opencompass/datasets/calm/evaluation/labeling/Probability.py
function get_gt_label (line 9) | def get_gt_label(item):
function extract_prob (line 14) | def extract_prob(model_response, prompt_style, type):
function get_pred_label (line 61) | def get_pred_label(model_response, item, prompt_style, type):
FILE: opencompass/datasets/calm/evaluation/labeling/common_answers.py
function is_numeric (line 301) | def is_numeric(value):
function add_quotes_to_unquoted (line 309) | def add_quotes_to_unquoted(json_str):
function change_quotation (line 314) | def change_quotation(json_str):
FILE: opencompass/datasets/calm/utils/load_items.py
function load_query_instances (line 5) | def load_query_instances(path):
FILE: opencompass/datasets/cb.py
class CBDatasetV2 (line 12) | class CBDatasetV2(BaseDataset):
method load (line 15) | def load(path):
FILE: opencompass/datasets/ceval.py
class CEvalDataset (line 15) | class CEvalDataset(BaseDataset):
method load (line 18) | def load(path: str, name: str, local_mode: bool = False):
class CEvalDatasetClean (line 41) | class CEvalDatasetClean(BaseDataset):
method load_contamination_annotations (line 46) | def load_contamination_annotations(path, split='val'):
method load (line 70) | def load(path: str, name: str):
FILE: opencompass/datasets/charm.py
function charm_reason_postprocess (line 17) | def charm_reason_postprocess(text: str) -> str:
class CharmReasonEvaluator (line 32) | class CharmReasonEvaluator(BaseEvaluator):
method score (line 34) | def score(self, predictions, references):
function charm_memory_eval (line 52) | def charm_memory_eval(pred: str, ref: Union[str, List[str]]) -> str:
class CharmMemoryEvaluator (line 85) | class CharmMemoryEvaluator(LMEvaluator):
method __init__ (line 91) | def __init__(self, prompt_template=None, *nargs, **kwargs):
method score (line 101) | def score(self, predictions, references, **kwargs):
class CharmDataset (line 146) | class CharmDataset(BaseDataset):
method load (line 149) | def load(path: str, name: str):
FILE: opencompass/datasets/chatml/chatml.py
class ChatMLDataset (line 13) | class ChatMLDataset(BaseDataset):
method load (line 65) | def load(path, file_name=None, local_mode=False):
FILE: opencompass/datasets/chatml/verification.py
class TextItem (line 7) | class TextItem(BaseModel):
class ImageItem (line 12) | class ImageItem(BaseModel):
class SystemMessage (line 20) | class SystemMessage(BaseModel):
class AssistantMessage (line 25) | class AssistantMessage(BaseModel):
class UserMessage (line 30) | class UserMessage(BaseModel):
class VerifyDataset (line 38) | class VerifyDataset(BaseModel):
method validate_answer_length (line 43) | def validate_answer_length(self) -> 'VerifyDataset':
FILE: opencompass/datasets/chem_exam.py
class ChemExamDataset (line 13) | class ChemExamDataset(BaseDataset):
method load (line 16) | def load(path: str):
function chem_exam_score_llmjudge_postprocess (line 56) | def chem_exam_score_llmjudge_postprocess(output, output_path, dataset):
FILE: opencompass/datasets/chembench.py
class ChemBenchDataset (line 13) | class ChemBenchDataset(BaseDataset):
method load (line 16) | def load(path: str, name: str):
FILE: opencompass/datasets/chid.py
class CHIDDataset (line 12) | class CHIDDataset(BaseDataset):
method load (line 15) | def load(**kwargs):
class CHIDDatasetV2 (line 33) | class CHIDDatasetV2(BaseDataset):
method load (line 36) | def load(path):
FILE: opencompass/datasets/chinese_simpleqa.py
function chinese_simpleqa_preprocess (line 88) | def chinese_simpleqa_preprocess(text: str) -> str:
class CsimpleqaDataset (line 94) | class CsimpleqaDataset(BaseDataset):
method load (line 96) | def load(self, path: str, name: str, *args, **kwargs):
function post_process_csimpleqa (line 135) | def post_process_csimpleqa(completion):
function get_judgeanswer_and_reference (line 146) | def get_judgeanswer_and_reference(result, filename, post_process):
function calculate_metrics (line 162) | def calculate_metrics(judged_answers):
function get_results (line 194) | def get_results(judged_answers):
function csimpleqa_postprocess (line 200) | def csimpleqa_postprocess(output: dict, output_path: str) -> dict:
FILE: opencompass/datasets/cibench.py
function load_experiment (line 19) | def load_experiment(file: str) -> dict:
function check_internet (line 100) | def check_internet():
class CIBenchDataset (line 114) | class CIBenchDataset(BaseDataset):
method load (line 118) | def load(path: str, internet_check: bool = False):
function sklearn_ssim (line 142) | def sklearn_ssim(pred_img, target_img):
function vl_model_score (line 187) | def vl_model_score(model, pred_img, ori_prompt, judge_prompt):
class CIBenchEvaluator (line 200) | class CIBenchEvaluator(BaseEvaluator):
method __init__ (line 220) | def __init__(self,
method check_user_data_dir (line 264) | def check_user_data_dir(self, user_data_dir):
method valid_step (line 280) | def valid_step(step):
method correct_step (line 293) | def correct_step(step, target) -> dict:
method text_step (line 326) | def text_step(self, step, target) -> dict:
method vis_similarity_step (line 349) | def vis_similarity_step(self, step, target, ori_prompt) -> dict:
method save_results (line 383) | def save_results(self, origin_prompt, steps, references):
method set_data_dir (line 459) | def set_data_dir(self, work_dir):
method unset_data_dir (line 469) | def unset_data_dir(self, work_dir):
method single_exp (line 473) | def single_exp(self, gold, steps, single_ori_prompt):
method get_output_dir (line 516) | def get_output_dir(self):
method score (line 526) | def score(self, predictions: List, references: List, steps: List,
FILE: opencompass/datasets/circular.py
function get_origin_patterns (line 22) | def get_origin_patterns(option_keys):
function get_circular_patterns (line 26) | def get_circular_patterns(option_keys):
function get_all_possible_patterns (line 35) | def get_all_possible_patterns(option_keys):
class CircularDatasetMeta (line 40) | class CircularDatasetMeta(type):
method make_circular_items (line 63) | def make_circular_items(
method make_circular_dataset (line 90) | def make_circular_dataset(dataset, circular_patterns, option_keys,
method make_circular (line 105) | def make_circular(
method __new__ (line 183) | def __new__(cls, name, bases, dct):
class CircularCEvalDataset (line 211) | class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
class CircularMMLUDataset (line 218) | class CircularMMLUDataset(MMLUDataset, metaclass=CircularDatasetMeta):
class CircularCMMLUDataset (line 225) | class CircularCMMLUDataset(CMMLUDataset, metaclass=CircularDatasetMeta):
class CircularCSQADataset (line 232) | class CircularCSQADataset(commonsenseqaDataset, metaclass=CircularDatase...
class CircularARCDataset (line 239) | class CircularARCDataset(ARCDataset, metaclass=CircularDatasetMeta):
method default_answer_key_switch_method (line 244) | def default_answer_key_switch_method(item, circular_pattern):
class CircularHSWAGDataset (line 250) | class CircularHSWAGDataset(HellaswagDataset_V2, metaclass=CircularDatase...
class CircularOBQADataset (line 257) | class CircularOBQADataset(OBQADataset, metaclass=CircularDatasetMeta):
class CircularRaceDataset (line 264) | class CircularRaceDataset(RaceDataset, metaclass=CircularDatasetMeta):
class CircularXiezhiDataset (line 271) | class CircularXiezhiDataset(XiezhiDataset, metaclass=CircularDatasetMeta):
class CircularsiqaDataset (line 278) | class CircularsiqaDataset(SiqaDatasetV3, metaclass=CircularDatasetMeta):
class CircularPIQADataset (line 285) | class CircularPIQADataset(PIQADatasetV2, metaclass=CircularDatasetMeta):
method default_answer_key_switch_method (line 290) | def default_answer_key_switch_method(item, circular_pattern):
class CircularEvaluator (line 296) | class CircularEvaluator(BaseEvaluator):
method __init__ (line 320) | def __init__(self, circular_pattern='circular'):
method score (line 324) | def score(self, predictions, references, test_set):
FILE: opencompass/datasets/civilcomments.py
class CivilCommentsDataset (line 9) | class CivilCommentsDataset(BaseDataset):
method load (line 12) | def load(**kwargs):
FILE: opencompass/datasets/climaqa.py
class ClimaQADataset (line 11) | class ClimaQADataset(BaseDataset):
method load (line 14) | def load(path: str, task: str, **kwargs):
FILE: opencompass/datasets/clozeTest_maxmin.py
class MaxminDataset (line 12) | class MaxminDataset(BaseDataset):
method load (line 15) | def load(test_path, answer_path=None):
FILE: opencompass/datasets/cluewsc.py
class CluewscDataset (line 12) | class CluewscDataset(BaseDataset):
method load (line 15) | def load(**kwargs):
class CluewscDatasetV2 (line 43) | class CluewscDatasetV2(BaseDataset):
method load (line 46) | def load(path):
FILE: opencompass/datasets/cmb.py
class CMBDataset (line 13) | class CMBDataset(BaseDataset):
method load (line 16) | def load(path: str):
FILE: opencompass/datasets/cmmlu.py
class CMMLUDataset (line 14) | class CMMLUDataset(BaseDataset):
method load (line 17) | def load(path: str, name: str, **kwargs):
FILE: opencompass/datasets/cmnli.py
class CMNLIDataset (line 13) | class CMNLIDataset(BaseDataset):
method load (line 16) | def load(path, local_mode: bool = False):
class CMNLIDatasetV2 (line 39) | class CMNLIDatasetV2(BaseDataset):
method load (line 42) | def load(path, local_mode: bool = False):
FILE: opencompass/datasets/cmo_fib.py
class CMOFibDataset (line 12) | class CMOFibDataset(BaseDataset):
method load (line 15) | def load(path):
FILE: opencompass/datasets/cmphysbench/SEED/SEED.py
function update_func (line 65) | def update_func(x, y):
function remove_func (line 74) | def remove_func(x):
function remove_tree_func (line 78) | def remove_tree_func(x):
function insert_func (line 85) | def insert_func(x):
function insert_tree_func (line 89) | def insert_tree_func(x):
function calc_tree_size (line 93) | def calc_tree_size(node):
function score_calc (line 134) | def score_calc(tree_dist, tree_size):
function numeric_score_calc (line 140) | def numeric_score_calc(student_answer_exp, ground_truth_exp):
function simplify_with_timeout (line 209) | def simplify_with_timeout(expr):
function time_simplify (line 213) | def time_simplify(expr):
function equal_with_timeout (line 222) | def equal_with_timeout(expr1, expr2):
function time_equal (line 226) | def time_equal(expr1, expr2):
function sympy_to_tree (line 234) | def sympy_to_tree(expr):
class TreeNode (line 306) | class TreeNode:
method __init__ (line 308) | def __init__(self, label, children=None, node_type='other'):
method get_children (line 314) | def get_children(self):
method __str__ (line 317) | def __str__(self):
function print_tree (line 321) | def print_tree(node, indent=0):
class LaTeXError (line 328) | class LaTeXError(Exception):
method __init__ (line 330) | def __init__(self, message='LaTeXError'):
class SymPyError (line 334) | class SymPyError(Exception):
method __init__ (line 336) | def __init__(self, message='SymPyError'):
class TreeError (line 340) | class TreeError(Exception):
method __init__ (line 342) | def __init__(self, message='TreeError'):
class DistError (line 346) | class DistError(Exception):
method __init__ (line 348) | def __init__(self, message='DistanceError'):
function Equation_standardize (line 352) | def Equation_standardize(latex):
function extract_interval (line 359) | def extract_interval(latex):
function judge_interval (line 381) | def judge_interval(latex):
function check_latex_wrap (line 399) | def check_latex_wrap(s):
function parse_bracketed_string (line 415) | def parse_bracketed_string(s):
function strip_dollar_signs (line 424) | def strip_dollar_signs(s):
function extract_numeric_part (line 433) | def extract_numeric_part(latex_str: str) -> str:
function extract_tuple (line 485) | def extract_tuple(latex):
function clean_latex_unit (line 538) | def clean_latex_unit(unit_str):
function parse_latex_quantity_general (line 557) | def parse_latex_quantity_general(latex_str):
function convert_and_output_general (line 592) | def convert_and_output_general(latex_qty1, latex_qty2, target_unit=None):
function SEED (line 620) | def SEED(answer_latex, test_latex, type, debug_mode=False):
FILE: opencompass/datasets/cmphysbench/SEED/extended_zss.py
class Node (line 13) | class Node(object):
method __init__ (line 15) | def __init__(self, label, children=None):
method get_children (line 20) | def get_children(node):
method get_label (line 24) | def get_label(node):
method addkid (line 27) | def addkid(self, node, before=False):
method get (line 35) | def get(self, label):
class AnnotatedTree (line 44) | class AnnotatedTree(object):
method __init__ (line 46) | def __init__(self, root, get_children):
function ext_distance (line 96) | def ext_distance(A, B, get_children, single_insert_cost, insert_cost,
FILE: opencompass/datasets/cmphysbench/SEED/latex_pre_process.py
function convert_caret_to_derivative (line 11) | def convert_caret_to_derivative(latex_str):
function preprocess_special_superscripts (line 26) | def preprocess_special_superscripts(latex_str):
function brackets_balanced (line 49) | def brackets_balanced(s: str) -> bool:
function remove_non_ascii (line 70) | def remove_non_ascii(text):
function extract_bracket_content (line 75) | def extract_bracket_content(s: str, bracket_position: int) -> str:
function find_first_unescaped_brace (line 107) | def find_first_unescaped_brace(s: str) -> int:
function extract_command (line 120) | def extract_command(s: str, brace_pos: int) -> str | None:
function remove_command (line 149) | def remove_command(s, command, keep_inside=False):
function convert_latex_fractions (line 230) | def convert_latex_fractions(latex_str):
function get_first_brace_command (line 248) | def get_first_brace_command(s: str) -> str | None:
function remove_overall_brace (line 257) | def remove_overall_brace(s: str) -> str:
function exp_frac (line 271) | def exp_frac(s):
function find_all (line 303) | def find_all(s, sub_str, allow_overlap=True):
function bar_inside_vec (line 319) | def bar_inside_vec(s):
function vec_lower_idx (line 346) | def vec_lower_idx(input_str):
function convert_vec_syntax (line 359) | def convert_vec_syntax(text):
function remove_outer_braces (line 384) | def remove_outer_braces(tex_str):
function extract_last_equal_content (line 396) | def extract_last_equal_content(s: str, strip_whitespace: bool = True) ->...
function first_pre_process (line 434) | def first_pre_process(s, t, extract_box=True):
function remove_text_from_latex (line 490) | def remove_text_from_latex(expr: str) -> str:
function extract_bracket_subscript_pairs (line 500) | def extract_bracket_subscript_pairs(expr):
function add_number_to_bracket_subscripts (line 535) | def add_number_to_bracket_subscripts(expr):
function insert_multiplication_symbols (line 550) | def insert_multiplication_symbols(expr):
function remove_all_text_commands (line 570) | def remove_all_text_commands(latex_str):
function convert_general_exp_format (line 582) | def convert_general_exp_format(latex_str):
function modify_latex_expression (line 590) | def modify_latex_expression(expr: str) -> str:
function wrap_single_subscripts (line 600) | def wrap_single_subscripts(s: str) -> str:
function replace_hc_text (line 613) | def replace_hc_text(s: str) -> str:
function standardize_dE_notation (line 630) | def standardize_dE_notation(s: str) -> str:
function replace_arrow_expression (line 635) | def replace_arrow_expression(s: str) -> str:
function preprocess_feynman_slash (line 643) | def preprocess_feynman_slash(latex_str: str) -> str:
function fix_subscript_on_parentheses (line 657) | def fix_subscript_on_parentheses(s: str) -> str:
function reorder_super_sub (line 667) | def reorder_super_sub(latex_str: str) -> str:
function second_pre_process (line 693) | def second_pre_process(s):
function add_parentheses_to_d (line 852) | def add_parentheses_to_d(expr):
class MyConfig (line 865) | class MyConfig:
class MyNormalization (line 882) | class MyNormalization:
function replace_derivative_frac_preserve_frac (line 903) | def replace_derivative_frac_preserve_frac(expr: str) -> str:
function master_convert_with_timeout (line 935) | def master_convert_with_timeout(s, t):
function master_convert (line 947) | def master_convert(s, t):
FILE: opencompass/datasets/cmphysbench/SEED/test.py
function run_case (line 25) | def run_case(idx: int, gt: str, pred: str, type: str, note: str = ''):
FILE: opencompass/datasets/cmphysbench/cmphysbench.py
class CMPhysBenchDataset (line 13) | class CMPhysBenchDataset(BaseDataset):
method load (line 16) | def load(path: str):
function extract_boxed_text_overlap (line 31) | def extract_boxed_text_overlap(text):
function extract_boxed_text_improved (line 67) | def extract_boxed_text_improved(text):
class CMPhysBenchEvaluator (line 136) | class CMPhysBenchEvaluator(BaseEvaluator):
method __init__ (line 139) | def __init__(self) -> None:
method score (line 142) | def score(self, predictions, references):
FILE: opencompass/datasets/cmrc.py
class CMRCDataset (line 12) | class CMRCDataset(BaseDataset):
method load (line 15) | def load(path: str):
function cmrc_postprocess (line 46) | def cmrc_postprocess(text: str) -> str:
FILE: opencompass/datasets/codecompass/CodeCompass.py
class BaseDataset (line 14) | class BaseDataset:
function get_data_path (line 17) | def get_data_path(path, local_mode=False):
class CodeCompassCodeGenerationDataset (line 21) | class CodeCompassCodeGenerationDataset(BaseDataset):
method load (line 48) | def load(path: str = 'opencompass/CodeCompass',
method _extract_limits (line 94) | def _extract_limits(problem_text: str) -> Dict[str, Any]:
method _process_item (line 120) | def _process_item(item: Dict[str, Any], system_prompt: str,
method _create_evaluation_sample (line 174) | def _create_evaluation_sample(
method validate_dataset (line 259) | def validate_dataset(dataset: DatasetDict) -> bool:
FILE: opencompass/datasets/codecompass/codecompass_runner.py
function run_test_for_cpp_problem (line 6) | def run_test_for_cpp_problem(sample: dict,
FILE: opencompass/datasets/codecompass/evaluator.py
class CodeCompassEvaluator (line 19) | class CodeCompassEvaluator(BaseEvaluator):
method __init__ (line 22) | def __init__(self,
method _build_results (line 45) | def _build_results(self, extracted_predictions: Dict[int, List[str]],
method score (line 77) | def score(self, predictions: List[Any],
method _prepare_sample (line 148) | def _prepare_sample(self, reference: Any, idx: int = -1) -> Dict[str, ...
method _run_parallel_evaluation (line 230) | def _run_parallel_evaluation(self,
FILE: opencompass/datasets/codecompass/executor.py
class LocalExecutor (line 10) | class LocalExecutor:
method __init__ (line 16) | def __init__(self,
method _set_resource_limits (line 29) | def _set_resource_limits(self):
method _compile_cpp (line 41) | def _compile_cpp(self, source_file: Path, temp_dir: Path) -> tuple:
method _run_executable (line 63) | def _run_executable(self, exec_file: Path, stdin_data: str) -> Dict:
method execute_code (line 116) | def execute_code(self, source_code: str, stdin: str, language: str,
method verify_output (line 145) | def verify_output(self, result: Dict, expected_output: str) -> Dict:
method submit_code (line 165) | def submit_code(self,
FILE: opencompass/datasets/codecompass/metrics.py
function estimate_pass_at_k (line 4) | def estimate_pass_at_k(num_samples, num_correct, k):
function compute_metrics_from_results (line 24) | def compute_metrics_from_results(results: dict, k_list=[1]):
FILE: opencompass/datasets/codecompass/utils.py
function extract_cpp_code (line 1) | def extract_cpp_code(model_output: str, model_type: str = 'chat'):
function extract_cpp_code_with_debug (line 36) | def extract_cpp_code_with_debug(model_output: str, model_type: str = 'ch...
FILE: opencompass/datasets/commonsenseqa.py
class commonsenseqaDataset (line 14) | class commonsenseqaDataset(BaseDataset):
method load (line 17) | def load(path):
FILE: opencompass/datasets/commonsenseqa_cn.py
class CommonsenseQADataset_CN (line 10) | class CommonsenseQADataset_CN(BaseDataset):
method load (line 13) | def load(path):
FILE: opencompass/datasets/compassbench_obj.py
function get_number (line 12) | def get_number(options):
class CompassBenchObjectiveV1_3 (line 21) | class CompassBenchObjectiveV1_3(BaseDataset):
method load (line 24) | def load(path: str, name: str):
class CompassBenchObjectiveMath (line 67) | class CompassBenchObjectiveMath(BaseDataset):
method load (line 70) | def load(path: str):
function compassbench_objective_v1_3_postprocess (line 96) | def compassbench_objective_v1_3_postprocess(text: str, name) -> str:
FILE: opencompass/datasets/copa.py
class COPADatasetV2 (line 12) | class COPADatasetV2(BaseDataset):
method load (line 15) | def load(path):
FILE: opencompass/datasets/crowspairs.py
class CrowspairsDataset (line 13) | class CrowspairsDataset(BaseDataset):
method load (line 16) | def load(**kwargs):
class CrowspairsDatasetV2 (line 28) | class CrowspairsDatasetV2(BaseDataset):
method load (line 31) | def load(**kwargs):
function crowspairs_postprocess (line 41) | def crowspairs_postprocess(text: str) -> str:
class CrowspairsEvaluator (line 60) | class CrowspairsEvaluator(BaseEvaluator):
method __init__ (line 64) | def __init__(self) -> None:
method score (line 67) | def score(self, predictions: List, references: List) -> dict:
FILE: opencompass/datasets/crowspairs_cn.py
class CrowspairsDatasetCN (line 10) | class CrowspairsDatasetCN(BaseDataset):
method load (line 14) | def load(path):
FILE: opencompass/datasets/csl.py
class CslDataset (line 12) | class CslDataset(BaseDataset):
method load (line 15) | def load(**kwargs):
class CslDatasetV2 (line 33) | class CslDatasetV2(BaseDataset):
method load (line 36) | def load(path):
FILE: opencompass/datasets/custom.py
class OptionSimAccEvaluator (line 21) | class OptionSimAccEvaluator(BaseEvaluator):
method __init__ (line 23) | def __init__(self, options) -> None:
method match_any_label (line 32) | def match_any_label(self, pred, test_item):
method score (line 68) | def score(self, predictions: List, references: List, test_set) -> dict:
class CircularOptionSimAccEvaluator (line 88) | class CircularOptionSimAccEvaluator(OptionSimAccEvaluator):
method __init__ (line 90) | def __init__(self, options, circular_pattern='circular'):
method score (line 94) | def score(self, predictions, references, test_set):
class CustomDataset (line 165) | class CustomDataset(BaseDataset):
method load (line 168) | def load(path, file_name=None, local_mode=False):
class CodeCustomDataset (line 187) | class CodeCustomDataset(BaseDataset):
method load (line 190) | def load(path, file_name=None, local_mode=False, num_repeats=1, **kwar...
class CircularCustomDataset (line 213) | class CircularCustomDataset(CustomDataset, metaclass=CircularDatasetMeta):
function stringfy_types (line 217) | def stringfy_types(obj):
function make_mcq_gen_config (line 226) | def make_mcq_gen_config(meta):
function make_circular_mcq_gen_config (line 274) | def make_circular_mcq_gen_config(meta):
function make_qa_gen_config (line 324) | def make_qa_gen_config(meta):
function make_mcq_ppl_config (line 373) | def make_mcq_ppl_config(meta):
function make_circular_mcq_ppl_config (line 425) | def make_circular_mcq_ppl_config(meta):
function parse_example_dataset (line 479) | def parse_example_dataset(config):
function make_custom_dataset_config (line 535) | def make_custom_dataset_config(config):
FILE: opencompass/datasets/cvalues.py
class CValuesDataset (line 12) | class CValuesDataset(BaseDataset):
method load (line 15) | def load(path):
FILE: opencompass/datasets/dingo.py
class DingoDataset (line 19) | class DingoDataset(BaseDataset):
method load (line 22) | def load(path: str):
class DingoLongDataset (line 35) | class DingoLongDataset(BaseDataset):
method load (line 38) | def load(path: str):
class DingoEvaluator (line 48) | class DingoEvaluator(BaseEvaluator):
method score (line 50) | def score(self, origin_prompt: List, predictions: List) -> dict:
FILE: opencompass/datasets/drcd.py
class DRCDDataset (line 12) | class DRCDDataset(BaseDataset):
method load (line 15) | def load(path: str):
function drcd_postprocess (line 46) | def drcd_postprocess(text: str) -> str:
FILE: opencompass/datasets/drop.py
class dropDataset (line 11) | class dropDataset(BaseDataset):
method get_answers (line 14) | def get_answers(validated_answers):
method load (line 29) | def load(path, only_number=True):
FILE: opencompass/datasets/drop_simple_eval.py
function normalize (line 18) | def normalize(s: str) -> str:
function fuzzy_match (line 28) | def fuzzy_match(s1: str, s2: str) -> bool:
class DropOpenAIDataset (line 39) | class DropOpenAIDataset(BaseDataset):
method load (line 42) | def load(path):
class DropOpenAIEvaluator (line 58) | class DropOpenAIEvaluator(BaseEvaluator):
method score (line 60) | def score(self, predictions, references):
FILE: opencompass/datasets/ds1000.py
class DS1000Dataset (line 35) | class DS1000Dataset(BaseDataset):
method get_data (line 52) | def get_data(self, problem_path: str) -> dict:
method load (line 89) | def load(self,
function ds1000_postprocess (line 128) | def ds1000_postprocess(text: str) -> str:
function ds1000_completion_postprocess (line 149) | def ds1000_completion_postprocess(text: str) -> str:
function ds1000_matplotlib_postprocess (line 160) | def ds1000_matplotlib_postprocess(text: str) -> str:
class DS1000Evaluator (line 182) | class DS1000Evaluator(BaseEvaluator):
method __init__ (line 185) | def __init__(self, num_workers=16) -> None:
method score_single (line 188) | def score_single(self, pred, refer):
method score (line 269) | def score(self, predictions, references):
class Command (line 280) | class Command(object):
method __init__ (line 283) | def __init__(self, cmd):
method run (line 287) | def run(self, timeout):
function import_source_file (line 318) | def import_source_file(fname, modname):
class DS1000ServiceEvaluator (line 350) | class DS1000ServiceEvaluator(BaseEvaluator):
method __init__ (line 366) | def __init__(self,
method score (line 379) | def score(self, predictions, references):
method _code_eval_service (line 415) | def _code_eval_service(self, file_path: str) -> tuple:
FILE: opencompass/datasets/ds1000_interpreter.py
class DS1000Dataset_Interperter (line 10) | class DS1000Dataset_Interperter(DS1000Dataset):
method load (line 13) | def load(
class DS1000InterpreterEvaluator (line 31) | class DS1000InterpreterEvaluator(BaseEvaluator):
method __init__ (line 39) | def __init__(self, action: str = 'PythonInterpreter'):
method get_action (line 42) | def get_action(self, step):
method score (line 47) | def score(self, predictions: List, references: List, steps: List):
FILE: opencompass/datasets/eese/eese.py
class EESEDataset (line 12) | class EESEDataset(BaseDataset):
method load (line 15) | def load(path: str, file_name: str = 'EESE.jsonl', **kwargs):
FILE: opencompass/datasets/eese/eese_postprocessors.py
function eese_score_postprocess_dict (line 8) | def eese_score_postprocess_dict(output: dict, output_path: str) -> dict:
FILE: opencompass/datasets/eese/utils.py
function extract_first_numeric_score (line 6) | def extract_first_numeric_score(score_text):
function process_results (line 31) | def process_results(results, overall_avg):
FILE: opencompass/datasets/eprstmt.py
class EprstmtDatasetV2 (line 12) | class EprstmtDatasetV2(BaseDataset):
method load (line 15) | def load(path):
FILE: opencompass/datasets/flores.py
class FloresFirst100Dataset (line 14) | class FloresFirst100Dataset(BaseDataset):
method load_single (line 17) | def load_single(src_path, tgt_path, src_lang, tgt_lang):
method load (line 31) | def load(path, name):
function flores_postprocess (line 70) | def flores_postprocess(text: str) -> str:
function flores_postprocess_chinese (line 76) | def flores_postprocess_chinese(text: str) -> str:
FILE: opencompass/datasets/game24.py
function get_current_numbers (line 157) | def get_current_numbers(y: str) -> str:
class Game24Dataset (line 162) | class Game24Dataset(BaseDataset):
method load (line 165) | def load(path: str):
class Game24PromptWrapper (line 172) | class Game24PromptWrapper:
method __init__ (line 183) | def __init__(self):
method standard_prompt_wrap (line 189) | def standard_prompt_wrap(x: str, y: str = '') -> str:
method cot_prompt_wrap (line 193) | def cot_prompt_wrap(x: str, y: str = '') -> str:
method propose_prompt_wrap (line 197) | def propose_prompt_wrap(x: str, y: str = '') -> str:
method value_prompt_wrap (line 206) | def value_prompt_wrap(x: str, y: str) -> str:
method value_outputs_unwrap (line 215) | def value_outputs_unwrap(x: str, y: str, value_outputs: list) -> float:
function game24_postprocess (line 229) | def game24_postprocess(output: str):
class Game24Evaluator (line 235) | class Game24Evaluator(BaseEvaluator):
method __init__ (line 237) | def __init__(self) -> None:
method check_nums (line 242) | def check_nums(self, prediction, reference):
method score (line 252) | def score(self, predictions: List, references: List) -> dict:
FILE: opencompass/datasets/gaokao_math.py
function extract_boxed_answer (line 66) | def extract_boxed_answer(text):
class GaoKaoMATHDataset (line 74) | class GaoKaoMATHDataset(BaseDataset):
method load (line 77) | def load(path: str):
class GaoKaoMATHEvaluator (line 93) | class GaoKaoMATHEvaluator(BaseEvaluator):
method __init__ (line 95) | def __init__(self,
method batch_response (line 126) | def batch_response(self, models, inputs):
method postprocess (line 142) | def postprocess(self, questions, predictions, question_type='None'):
method score (line 167) | def score(self, predictions, references, origin_prompt, test_set):
FILE: opencompass/datasets/generic.py
function get_final_results (line 7) | def get_final_results(judged_answers,
function _generic_llmjudge_postprocess (line 63) | def _generic_llmjudge_postprocess(judgement: str,
function generic_llmjudge_postprocess (line 73) | def generic_llmjudge_postprocess(
function generic_llmjudge_academic_postprocess (line 105) | def generic_llmjudge_academic_postprocess(
FILE: opencompass/datasets/govrepcrs.py
class GovRepcrsDataset (line 13) | class GovRepcrsDataset(BaseDataset):
method load (line 16) | def load(path: str):
FILE: opencompass/datasets/gpqa.py
class GPQADataset (line 16) | class GPQADataset(BaseDataset):
method load (line 19) | def load(path: str, name: str, **kwargs):
class GPQAEvaluator (line 47) | class GPQAEvaluator(BaseEvaluator):
method score (line 49) | def score(self, predictions, references):
class GPQASimpleEvalDataset (line 67) | class GPQASimpleEvalDataset(BaseDataset):
method load (line 71) | def load(path: str, name: str):
function GPQA_Simple_Eval_postprocess (line 110) | def GPQA_Simple_Eval_postprocess(text: str) -> str:
FILE: opencompass/datasets/gsm8k.py
class GSM8KDataset (line 16) | class GSM8KDataset(BaseDataset):
method load (line 19) | def load(path):
function gsm8k_dataset_postprocess (line 39) | def gsm8k_dataset_postprocess(text: str) -> str:
function gsm8k_postprocess (line 44) | def gsm8k_postprocess(text: str) -> str:
class Gsm8kEvaluator (line 52) | class Gsm8kEvaluator(BaseEvaluator):
method is_equal (line 54) | def is_equal(self, pred, refer):
method score (line 62) | def score(self, predictions, references):
class Gsm8kAgentEvaluator (line 82) | class Gsm8kAgentEvaluator(BaseEvaluator):
method __init__ (line 90) | def __init__(self, action: str = 'PythonInterpreter'):
method is_equal (line 93) | def is_equal(self, pred, refer):
method soft_equal (line 101) | def soft_equal(self, pred, refer, step):
method get_action (line 112) | def get_action(self, step):
method score (line 117) | def score(self, predictions, references, steps):
FILE: opencompass/datasets/gsm_hard.py
class GSMHardDataset (line 12) | class GSMHardDataset(BaseDataset):
method load (line 15) | def load(path):
FILE: opencompass/datasets/healthbench/healthbench.py
function map_with_progress (line 21) | def map_with_progress(
class RubricItem (line 91) | class RubricItem:
method __init__ (line 93) | def __init__(self, criterion: str, points: float, tags: list[str]):
method __str__ (line 98) | def __str__(self):
method to_dict (line 101) | def to_dict(self):
method from_dict (line 109) | def from_dict(cls, d: dict):
function _parse (line 117) | def _parse(item):
function parse_json_to_dict (line 123) | def parse_json_to_dict(json_string: str) -> dict:
function calculate_score (line 133) | def calculate_score(
function get_usage_dict (line 152) | def get_usage_dict(response_usage) -> dict[str, int | None]:
function _compute_clipped_stats (line 208) | def _compute_clipped_stats(
function _aggregate_get_clipped_mean (line 230) | def _aggregate_get_clipped_mean(
class HealthBenchDataset (line 260) | class HealthBenchDataset(BaseDataset):
method load (line 263) | def load(path: str, **kwargs):
class HealthBenchEvaluator (line 279) | class HealthBenchEvaluator(BaseEvaluator):
method __init__ (line 283) | def __init__(
method grade_sample (line 298) | def grade_sample(
method score (line 392) | def score(self, predictions, references, test_set):
FILE: opencompass/datasets/healthbench/sampler/chat_completion_sampler.py
class ChatCompletionSampler (line 16) | class ChatCompletionSampler(SamplerBase):
method __init__ (line 19) | def __init__(
method _handle_image (line 37) | def _handle_image(
method _handle_text (line 52) | def _handle_text(self, text: str):
method _pack_message (line 55) | def _pack_message(self, role: str, content: Any):
method __call__ (line 58) | def __call__(self, message_list: MessageList) -> SamplerResponse:
FILE: opencompass/datasets/healthbench/types.py
class SamplerResponse (line 9) | class SamplerResponse:
class SamplerBase (line 16) | class SamplerBase:
method __call__ (line 20) | def __call__(
class EvalResult (line 28) | class EvalResult:
class SingleEvalResult (line 39) | class SingleEvalResult:
class Eval (line 50) | class Eval:
method __call__ (line 53) | def __call__(self, sampler: SamplerBase) -> EvalResult:
FILE: opencompass/datasets/hellaswag.py
class HellaswagDataset (line 14) | class HellaswagDataset(BaseDataset):
method load (line 17) | def load(path):
class HellaswagDataset_V2 (line 49) | class HellaswagDataset_V2(BaseDataset):
method load (line 52) | def load(path):
class HellaswagDataset_V3 (line 84) | class HellaswagDataset_V3(BaseDataset):
method load (line 87) | def load(path):
class HellaswagDatasetwithICE (line 119) | class HellaswagDatasetwithICE(BaseDataset):
method load (line 122) | def load(path):
class HellaswagDatasetClean (line 160) | class HellaswagDatasetClean(BaseDataset):
method load_contamination_annotations (line 165) | def load_contamination_annotations(path, split='val'):
method load (line 191) | def load(path):
FILE: opencompass/datasets/hle.py
class HLEDataset (line 9) | class HLEDataset(BaseDataset):
method load (line 12) | def load(path: str, category: str | None = None):
FILE: opencompass/datasets/huggingface.py
class HFDataset (line 10) | class HFDataset(BaseDataset):
method load (line 13) | def load(**kwargs):
FILE: opencompass/datasets/humaneval.py
class HumanevalDataset (line 33) | class HumanevalDataset(BaseDataset):
method load (line 36) | def load(path: str, num_repeats: int = 1, local_mode: bool = False):
class HumanEvalEvaluator (line 70) | class HumanEvalEvaluator(BaseEvaluator):
method __init__ (line 73) | def __init__(self, k: List[int] = [1, 10, 100]) -> None:
method score (line 82) | def score(self, predictions, references, test_set):
class HumanEvalPlusEvaluator (line 119) | class HumanEvalPlusEvaluator(BaseEvaluator):
method __init__ (line 122) | def __init__(self, k: List[int] = [1, 10, 100]) -> None:
method score (line 131) | def score(self, predictions, references, test_set):
function humaneval_postprocess_v2 (line 182) | def humaneval_postprocess_v2(text: str) -> str:
function humaneval_postprocess_v3 (line 188) | def humaneval_postprocess_v3(text: str) -> str:
function humaneval_internal_v2_postprocess (line 194) | def humaneval_internal_v2_postprocess(text: str):
function humaneval_internal_v1_postprocess (line 207) | def humaneval_internal_v1_postprocess(text: str) -> str:
FILE: opencompass/datasets/humaneval_multi.py
class HumanevalMultiDataset (line 45) | class HumanevalMultiDataset(BaseDataset):
method load (line 48) | def load(path, language, version, num_repeats: int = 1, **kwargs):
class HumanevalMultiEvaluator (line 79) | class HumanevalMultiEvaluator(BaseEvaluator):
method __init__ (line 81) | def __init__(self,
method stop_at_stop_token (line 94) | def stop_at_stop_token(self, decoded_string, stop_tokens):
method _code_eval_service (line 108) | def _code_eval_service(self, file_path):
method estimator (line 133) | def estimator(self, n: int, c: int, k: int) -> float:
method for_file (line 141) | def for_file(self, path):
method score (line 162) | def score(self, predictions, references, test_set):
FILE: opencompass/datasets/humaneval_pro.py
class HumanevalevalProDataset (line 25) | class HumanevalevalProDataset(BaseDataset):
method load (line 28) | def load(path, local_mode=False):
class HumanevalProEvaluator (line 38) | class HumanevalProEvaluator(CodeEvaluator):
method score (line 40) | def score(self, predictions: List, references: List,
FILE: opencompass/datasets/humanevalx.py
class HumanevalXDataset (line 30) | class HumanevalXDataset(BaseDataset):
method load (line 33) | def load(path, language, **kwargs):
method _stream_jsonl_all (line 42) | def _stream_jsonl_all(filename: str) -> Iterable[Dict]:
class HumanevalXEvaluator (line 56) | class HumanevalXEvaluator(BaseEvaluator):
method __init__ (line 76) | def __init__(self,
method score (line 93) | def score(self, predictions, references):
method _code_eval_service (line 143) | def _code_eval_service(self, file_path):
function _clean_up_code (line 172) | def _clean_up_code(text: str, language_type: str, reference) -> str:
FILE: opencompass/datasets/hungarian_math.py
class HungarianExamMathDataset (line 11) | class HungarianExamMathDataset(BaseDataset):
method load (line 14) | def load(path):
FILE: opencompass/datasets/inference_ppl.py
class InferencePPLDataset (line 13) | class InferencePPLDataset(BaseDataset):
method load (line 16) | def load(path: str, name: List[str] = None, samples: int = None):
FILE: opencompass/datasets/infinitebench/infinitebench_codedebug.py
class InfiniteBenchcodedebugDataset (line 11) | class InfiniteBenchcodedebugDataset(BaseDataset):
method load (line 14) | def load(path: str):
FILE: opencompass/datasets/infinitebench/infinitebench_coderun.py
class InfiniteBenchcoderunDataset (line 13) | class InfiniteBenchcoderunDataset(BaseDataset):
method load (line 16) | def load(path: str):
FILE: opencompass/datasets/infinitebench/infinitebench_endia.py
class InfiniteBenchendiaDataset (line 14) | class InfiniteBenchendiaDataset(BaseDataset):
method load (line 17) | def load(path: str):
class InfiniteBenchendiaEvaluator (line 37) | class InfiniteBenchendiaEvaluator(BaseEvaluator):
method score (line 39) | def score(self, predictions: List, references: List) -> dict:
FILE: opencompass/datasets/infinitebench/infinitebench_enmc.py
class InfiniteBenchenmcDataset (line 11) | class InfiniteBenchenmcDataset(BaseDataset):
method load (line 14) | def load(path: str):
FILE: opencompass/datasets/infinitebench/infinitebench_enqa.py
class InfiniteBenchenqaDataset (line 11) | class InfiniteBenchenqaDataset(BaseDataset):
method load (line 14) | def load(path: str):
FILE: opencompass/datasets/infinitebench/infinitebench_ensum.py
class InfiniteBenchensumDataset (line 11) | class InfiniteBenchensumDataset(BaseDataset):
method load (line 14) | def load(path: str):
FILE: opencompass/datasets/infinitebench/infinitebench_mathcalc.py
class InfiniteBenchmathcalcDataset (line 15) | class InfiniteBenchmathcalcDataset(BaseDataset):
method load (line 18) | def load(path: str):
class InfiniteBenchmathcalcEvaluator (line 33) | class InfiniteBenchmathcalcEvaluator(BaseEvaluator):
method score (line 35) | def score(self, predictions: List, references: List) -> dict:
FILE: opencompass/datasets/infinitebench/infinitebench_mathfind.py
class InfiniteBenchmathfindDataset (line 13) | class InfiniteBenchmathfindDataset(BaseDataset):
method load (line 16) | def load(path: str):
FILE: opencompass/datasets/infinitebench/infinitebench_retrievekv.py
class InfiniteBenchretrievekvDataset (line 14) | class InfiniteBenchretrievekvDataset(BaseDataset):
method load (line 17) | def load(path: str):
class InfiniteBenchretrievekvEvaluator (line 37) | class InfiniteBenchretrievekvEvaluator(BaseEvaluator):
method score (line 39) | def score(self, predictions: List, references: List) -> dict:
FILE: opencompass/datasets/infinitebench/infinitebench_retrievenumber.py
class InfiniteBenchretrievenumberDataset (line 11) | class InfiniteBenchretrievenumberDataset(BaseDataset):
method load (line 14) | def load(path: str):
FILE: opencompass/datasets/infinitebench/infinitebench_retrievepasskey.py
class InfiniteBenchretrievepasskeyDataset (line 11) | class InfiniteBenchretrievepasskeyDataset(BaseDataset):
method load (line 14) | def load(path: str):
FILE: opencompass/datasets/infinitebench/infinitebench_zhqa.py
class InfiniteBenchzhqaDataset (line 11) | class InfiniteBenchzhqaDataset(BaseDataset):
method load (line 14) | def load(path: str):
FILE: opencompass/datasets/infinitebench/utils.py
function iter_jsonl (line 7) | def iter_jsonl(path):
function InfiniteBench_first_number_postprocess (line 14) | def InfiniteBench_first_number_postprocess(text: str) -> str:
FILE: opencompass/datasets/internsandbox.py
class InternSandboxDataset (line 15) | class InternSandboxDataset(BaseDataset):
method load (line 18) | def load(path: str, sandbox: str, local_mode: bool = False):
class InternSandboxEvaluator (line 32) | class InternSandboxEvaluator(BaseEvaluator):
method __init__ (line 34) | def __init__(self,
method score (line 41) | def score(self, predictions, references, test_set):
FILE: opencompass/datasets/iwslt2017.py
class IWSLT2017Dataset (line 9) | class IWSLT2017Dataset(BaseDataset):
method load (line 12) | def load(**kwargs):
FILE: opencompass/datasets/jigsawmultilingual.py
class JigsawMultilingualDataset (line 12) | cl
Copy disabled (too large)
Download .json
Condensed preview — 3168 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (11,214K chars).
[
{
"path": ".codespellrc",
"chars": 108,
"preview": "[codespell]\nskip = *.ipynb\ncount =\nquiet-level = 3\nignore-words-list = nd, ans, ques, rouge, softwares, wit\n"
},
{
"path": ".github/ISSUE_TEMPLATE/1_bug-report.yml",
"chars": 3289,
"preview": "name: 🐞 Bug report\ndescription: Create a report to help us improve\nlabels: [\"bug\"]\ntitle: \"[Bug] \"\nbody:\n - type: markd"
},
{
"path": ".github/ISSUE_TEMPLATE/2_feature-request.yml",
"chars": 1258,
"preview": "name: 🚀 Feature request\ndescription: Suggest an idea for this project\nlabels: [\"enhancement\"]\ntitle: \"[Feature] \"\nbody:\n"
},
{
"path": ".github/ISSUE_TEMPLATE/3_bug-report_zh.yml",
"chars": 2298,
"preview": "name: 🐞 报告 Bug\ndescription: 报告你在使用中遇到的不合预期的情况\nlabels: [\"bug\"]\ntitle: \"[Bug] \"\nbody:\n - type: markdown\n attributes:\n "
},
{
"path": ".github/ISSUE_TEMPLATE/4_feature-request_zh.yml",
"chars": 875,
"preview": "name: 🚀 功能建议\ndescription: 建议一项新的功能\nlabels: [\"enhancement\"]\ntitle: \"[Feature] \"\nbody:\n - type: markdown\n attributes:\n"
},
{
"path": ".github/ISSUE_TEMPLATE/config.yml",
"chars": 509,
"preview": "blank_issues_enabled: false\n\ncontact_links:\n - name: 📚 OpenCompass Documentation (官方文档)\n url: https://opencompass.re"
},
{
"path": ".github/pull_request_template.md",
"chars": 1541,
"preview": "Thanks for your contribution and we appreciate it a lot. The following instructions would make your pull request more he"
},
{
"path": ".github/workflows/daily-ete-test.yml",
"chars": 30183,
"preview": "name: daily_ete_test\n\non:\n workflow_dispatch:\n inputs:\n repo_org:\n required: false\n description: "
},
{
"path": ".github/workflows/link-check.yml",
"chars": 828,
"preview": "name: 'Link check'\n\non:\n schedule:\n # check links at 01:30 a.m. every day\n - cron: '30 1 * * *'\n\n workflow_dispa"
},
{
"path": ".github/workflows/lint.yml",
"chars": 534,
"preview": "name: lint\n\non: [push, pull_request]\n\nconcurrency:\n group: ${{ github.workflow }}-${{ github.ref }}\n cancel-in-progres"
},
{
"path": ".github/workflows/pr-run-test.yml",
"chars": 5603,
"preview": "name: pr_run_test\n\non:\n pull_request:\n paths-ignore:\n - 'README.md'\n - 'README_zh-CN.md'\n - 'docs/**'"
},
{
"path": ".github/workflows/pr-stage-check.yml",
"chars": 5607,
"preview": "name: pr_stage_test\n\non:\n pull_request:\n paths-ignore:\n - 'README.md'\n - 'README_zh-CN.md'\n - 'docs/*"
},
{
"path": ".github/workflows/publish-to-pypi.yml",
"chars": 855,
"preview": "name: deploy\n\non:\n push:\n workflow_dispatch:\n inputs:\n confirm_publish:\n description: 'Type YES to conf"
},
{
"path": ".github/workflows/unit-test.yml",
"chars": 1647,
"preview": "name: unit_test\n\non:\n pull_request:\n paths-ignore:\n - 'README.md'\n - 'README_zh-CN.md'\n - 'docs/**'\n "
},
{
"path": ".gitignore",
"chars": 1798,
"preview": ".DS_Store\noutput_*/\noutputs/\nscripts/\nicl_inference_output/\n.vscode/\ntmp/\nconfigs/eval_subjective_alignbench_test.py\ncon"
},
{
"path": ".owners.yml",
"chars": 218,
"preview": "assign:\n issues: enabled\n pull_requests: disabled\n strategy:\n # random\n daily-shift-based\n scedule:\n '*/1 *"
},
{
"path": ".pre-commit-config-zh-cn.yaml",
"chars": 3712,
"preview": "exclude: |\n (?x)^(\n tests/data/|\n tests/datasets/|\n tests/models/|\n opencompass/models/internal/|"
},
{
"path": ".pre-commit-config.yaml",
"chars": 3658,
"preview": "exclude: |\n (?x)^(\n tests/data/|\n tests/datasets/|\n tests/models/|\n opencompass/models/internal/|"
},
{
"path": "LICENSE",
"chars": 11408,
"preview": "Copyright 2020 OpenCompass Authors. All rights reserved.\n\n Apache License\n "
},
{
"path": "MANIFEST.in",
"chars": 210,
"preview": "recursive-include opencompass/configs *.py *.yml *.json *.txt *.md\nrecursive-include opencompass/openicl/icl_evaluator/h"
},
{
"path": "README.md",
"chars": 24631,
"preview": "<div align=\"center\">\n <img src=\"docs/en/_static/image/logo.svg\" width=\"500px\"/>\n <br />\n <br />\n\n[![][github-release-"
},
{
"path": "README_zh-CN.md",
"chars": 16217,
"preview": "<div align=\"center\">\n <img src=\"docs/zh_cn/_static/image/logo.svg\" width=\"500px\"/>\n <br />\n <br />\n\n[![][github-relea"
},
{
"path": "autotest/__init__.py",
"chars": 56,
"preview": "\"\"\"OpenCompass automated test package.\"\"\"\n\n__all__ = []\n"
},
{
"path": "autotest/cluster/__init__.py",
"chars": 63,
"preview": "\"\"\"OpenCompass inference test configurations.\"\"\"\n\n__all__ = []\n"
},
{
"path": "autotest/cluster/chat_models.py",
"chars": 2234,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.models import (HuggingFacewithChatTemplate,\n "
},
{
"path": "autotest/eval/__init__.py",
"chars": 63,
"preview": "\"\"\"OpenCompass inference test configurations.\"\"\"\n\n__all__ = []\n"
},
{
"path": "autotest/eval/eval_base_fullbench.py",
"chars": 8080,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from autotest.eval.models import base_models\n from openc"
},
{
"path": "autotest/eval/eval_base_longtext_fullbench.py",
"chars": 1305,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from autotest.eval.models import base_models\n from openc"
},
{
"path": "autotest/eval/eval_chat_longtext_fullbench.py",
"chars": 1504,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from autotest.eval.models import models\n from opencompas"
},
{
"path": "autotest/eval/eval_chat_obj_fullbench_other.py",
"chars": 2511,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n # Datasets\n from autotest.eval.models import judge_model"
},
{
"path": "autotest/eval/eval_chat_obj_fullbench_v5.py",
"chars": 11294,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n # read hf models - chat models\n # Dataset\n from autot"
},
{
"path": "autotest/eval/eval_chat_obj_fullbench_v6.py",
"chars": 5416,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from autotest.eval.models import judge_models, models\n f"
},
{
"path": "autotest/eval/eval_chat_obj_fullbench_v7.py",
"chars": 7850,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n # Datasets\n # Instruct Following\n # # # # Math Calcul"
},
{
"path": "autotest/eval/eval_chat_obj_fullbench_v8.py",
"chars": 2521,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n # Datasets\n from autotest.eval.models import judge_model"
},
{
"path": "autotest/eval/eval_chat_obj_v8.py",
"chars": 2051,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n # Datasets\n from autotest.eval.models import judge_model"
},
{
"path": "autotest/eval/eval_chat_sub_fullbench.py",
"chars": 5773,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner\nfrom op"
},
{
"path": "autotest/eval/models.py",
"chars": 1989,
"preview": "from opencompass.models import TurboMindModel, TurboMindModelwithChatTemplate\nfrom opencompass.utils.text_postprocessors"
},
{
"path": "autotest/model/__init__.py",
"chars": 63,
"preview": "\"\"\"OpenCompass inference test configurations.\"\"\"\n\n__all__ = []\n"
},
{
"path": "autotest/model/base_datasets.py",
"chars": 980,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b"
},
{
"path": "autotest/model/chat_datasets.py",
"chars": 1799,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from opencompass.configs.datasets.aime2025.aime2025_cascade"
},
{
"path": "autotest/model/constant.py",
"chars": 460,
"preview": "meta_template = dict(\n begin=dict(\n role='SYSTEM',\n api_role='SYSTEM',\n prompt='''\n Your "
},
{
"path": "autotest/model/infer_api.py",
"chars": 3884,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.models.openai_api import OpenAISDK\nfrom opencompass.models.opena"
},
{
"path": "autotest/model/infer_api_rollout.py",
"chars": 3186,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.models import OpenAISDKRollout\nfrom opencompass.utils.text_postp"
},
{
"path": "autotest/model/infer_lmdeploy_base.py",
"chars": 12991,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.models import TurboMindModel\nfrom opencompass.utils.text_postpro"
},
{
"path": "autotest/model/infer_lmdeploy_chat.py",
"chars": 13946,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.models import TurboMindModelwithChatTemplate\nfrom opencompass.ut"
},
{
"path": "autotest/model/infer_transformers_base.py",
"chars": 5027,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.models import HuggingFaceBaseModel\nfrom opencompass.utils.text_p"
},
{
"path": "autotest/model/infer_transformers_chat.py",
"chars": 5490,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.models import HuggingFacewithChatTemplate\nfrom opencompass.utils"
},
{
"path": "autotest/model/infer_vllm_base.py",
"chars": 11834,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.models import VLLM\nfrom opencompass.utils.text_postprocessors im"
},
{
"path": "autotest/model/infer_vllm_chat.py",
"chars": 13603,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.models import VLLMwithChatTemplate\nfrom opencompass.utils.text_p"
},
{
"path": "autotest/oc_score_baseline.yaml",
"chars": 2409,
"preview": "qwen2.5-7b-hf:\n demo_gsm8k_accuracy: 78.12\n race-middle_accuracy: 90.46\n race-high_accuracy: 86.54\n\ninternlm3-8"
},
{
"path": "autotest/utils/compare_results.py",
"chars": 4413,
"preview": "import filecmp\nimport os\n\nimport fire\n\n\ndef compare_results(folder1,\n folder2,\n co"
},
{
"path": "autotest/utils/health_check.py",
"chars": 538,
"preview": "from time import sleep\n\nimport fire\nimport requests\n\n\ndef health_check(url: str = 'http://0.0.0.0:23333', timeout: int ="
},
{
"path": "autotest/utils/oc_score_assert.py",
"chars": 6779,
"preview": "import csv\nimport os\n\nimport pytest\nimport yaml\n\noutput_path = 'regression_result_daily'\n\n\n@pytest.fixture()\ndef baselin"
},
{
"path": "dataset-index.yml",
"chars": 41997,
"preview": "- ifeval:\n name: IFEval\n category: Instruction Following\n paper: https://arxiv.org/pdf/2311.07911\n configpat"
},
{
"path": "docs/en/.readthedocs.yaml",
"chars": 249,
"preview": "version: 2\n\n# Set the version of Python and other tools you might need\nbuild:\n os: ubuntu-22.04\n tools:\n python: \"3"
},
{
"path": "docs/en/Makefile",
"chars": 634,
"preview": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line, and also\n# from the "
},
{
"path": "docs/en/_static/css/readthedocs.css",
"chars": 1018,
"preview": ".header-logo {\n background-image: url(\"../image/logo.svg\");\n background-size: 275px 80px;\n height: 80px;\n wi"
},
{
"path": "docs/en/_static/js/custom.js",
"chars": 499,
"preview": "var collapsedSections = ['Dataset Statistics'];\n\n$(document).ready(function () {\n $('.dataset').DataTable({\n \"stateS"
},
{
"path": "docs/en/_templates/404.html",
"chars": 473,
"preview": "{% extends \"layout.html\" %}\n\n{% block body %}\n\n<h1>Page Not Found</h1>\n<p>\n The page you are looking for cannot be foun"
},
{
"path": "docs/en/_templates/autosummary/class.rst",
"chars": 243,
"preview": ".. role:: hidden\n :class: hidden-section\n.. currentmodule:: {{ module }}\n\n\n{{ name | underline}}\n\n.. autoclass:: {{ n"
},
{
"path": "docs/en/_templates/callable.rst",
"chars": 265,
"preview": ".. role:: hidden\n :class: hidden-section\n.. currentmodule:: {{ module }}\n\n\n{{ name | underline}}\n\n.. autoclass:: {{ n"
},
{
"path": "docs/en/advanced_guides/accelerator_intro.md",
"chars": 6040,
"preview": "# Accelerate Evaluation Inference with vLLM or LMDeploy\n\n## Background\n\nDuring the OpenCompass evaluation process, the H"
},
{
"path": "docs/en/advanced_guides/circular_eval.md",
"chars": 5323,
"preview": "# CircularEval\n\n## Background\n\nFor multiple-choice questions, when a Language Model (LLM) provides the correct option, i"
},
{
"path": "docs/en/advanced_guides/code_eval.md",
"chars": 4656,
"preview": "# Code Evaluation Tutorial\n\nThis tutorial primarily focuses on evaluating a model's coding proficiency, using `humaneval"
},
{
"path": "docs/en/advanced_guides/code_eval_service.md",
"chars": 9566,
"preview": "# Code Evaluation Docker Tutorial\n\nTo complete the LLM code capability evaluation, we need to build a separate evaluatio"
},
{
"path": "docs/en/advanced_guides/contamination_eval.md",
"chars": 9217,
"preview": "# Data Contamination Assessment\n\n**Data Contamination** refers to the phenomenon where data intended for downstream test"
},
{
"path": "docs/en/advanced_guides/custom_dataset.md",
"chars": 9883,
"preview": "# Dataset Quick Evaluation Tutorial\n\nOpenCompass provides two paths for quickly evaluating the provided data, the data f"
},
{
"path": "docs/en/advanced_guides/evaluation_lightllm.md",
"chars": 3545,
"preview": "# Evaluation with Lightllm\n\nWe now support the evaluation of large language models using [Lightllm](https://github.com/M"
},
{
"path": "docs/en/advanced_guides/evaluation_lmdeploy.md",
"chars": 3453,
"preview": "# Evaluation with LMDeploy\n\nWe now support evaluation of models accelerated by the [LMDeploy](https://github.com/InternL"
},
{
"path": "docs/en/advanced_guides/llm_judge.md",
"chars": 12474,
"preview": "# LLM as Judge Evaluation\n\n## Introduction\n\nThe GenericLLMEvaluator is particularly useful for scenarios where rule-base"
},
{
"path": "docs/en/advanced_guides/longeval.md",
"chars": 16921,
"preview": "# Long Context Evaluation Guidance\n\n## Introduction\n\nAlthough large-scale language models (LLMs) such as GPT-4 have demo"
},
{
"path": "docs/en/advanced_guides/math_verify.md",
"chars": 5493,
"preview": "# General Math Evaluation Guidance\n\n## Introduction\n\nMathematical reasoning is a crucial capability for large language m"
},
{
"path": "docs/en/advanced_guides/needleinahaystack_eval.md",
"chars": 7316,
"preview": "# Needle In A Haystack Evaluation\n\n## Introduction to the Needle In A Haystack Test\n\nThe Needle In A Haystack test (insp"
},
{
"path": "docs/en/advanced_guides/new_dataset.md",
"chars": 5459,
"preview": "# Add a dataset\n\nAlthough OpenCompass has already included most commonly used datasets, users need to follow the steps b"
},
{
"path": "docs/en/advanced_guides/new_model.md",
"chars": 2598,
"preview": "# Add a Model\n\nCurrently, we support HF models, some model APIs, and some third-party models.\n\n## Adding API Models\n\nTo "
},
{
"path": "docs/en/advanced_guides/objective_judgelm_evaluation.md",
"chars": 7013,
"preview": "# Using Large Models as JudgeLLM for Objective Evaluation\n\n## Introduction\n\nTraditional objective evaluations often rely"
},
{
"path": "docs/en/advanced_guides/persistence.md",
"chars": 2777,
"preview": "# Evaluation Results Persistence\n\n## Introduction\n\nNormally, the evaluation results of OpenCompass will be saved to your"
},
{
"path": "docs/en/advanced_guides/prompt_attack.md",
"chars": 4849,
"preview": "# Prompt Attack\n\nWe support prompt attack following the idea of [PromptBench](https://github.com/microsoft/promptbench)."
},
{
"path": "docs/en/advanced_guides/subjective_evaluation.md",
"chars": 7873,
"preview": "# Subjective Evaluation Guidance\n\n## Introduction\n\nSubjective evaluation aims to assess the model's performance in tasks"
},
{
"path": "docs/en/conf.py",
"chars": 6993,
"preview": "# flake8: noqa\n# Configuration file for the Sphinx documentation builder.\n#\n# This file only contains a selection of the"
},
{
"path": "docs/en/docutils.conf",
"chars": 43,
"preview": "[html writers]\ntable_style: colwidths-auto\n"
},
{
"path": "docs/en/get_started/faq.md",
"chars": 12177,
"preview": "# FAQ\n\n## General\n\n### What are the differences and connections between `ppl` and `gen`?\n\n`ppl` stands for perplexity, a"
},
{
"path": "docs/en/get_started/installation.md",
"chars": 5682,
"preview": "# Installation\n\n## Basic Installation\n\n1. Prepare the OpenCompass runtime environment using Conda:\n\n```conda create --na"
},
{
"path": "docs/en/get_started/quick_start.md",
"chars": 16409,
"preview": "# Quick Start\n\n\n - [What is PR](#what-is-p"
},
{
"path": "docs/en/notes/news.md",
"chars": 8370,
"preview": "# News\n\n- **\\[2024.05.08\\]** We supported the evaluation of 4 MoE models: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf"
},
{
"path": "docs/en/prompt/chain_of_thought.md",
"chars": 8396,
"preview": "# Chain of Thought\n\n## Background\n\nDuring the process of reasoning, CoT (Chain of Thought) method is an efficient way to"
},
{
"path": "docs/en/prompt/meta_template.md",
"chars": 12120,
"preview": "# Meta Template\n\n## Background\n\nIn the Supervised Fine-Tuning (SFT) process of Language Model Learning (LLM), we often i"
},
{
"path": "docs/en/prompt/overview.md",
"chars": 1016,
"preview": "# Prompt Overview\n\nThe prompt is the input to the Language Model (LLM), used to guide the model to generate text or calc"
},
{
"path": "docs/en/prompt/prompt_template.md",
"chars": 16682,
"preview": "# Prompt Template\n\n## Background\n\nIn language model evaluation, we often construct prompts from the original dataset acc"
},
{
"path": "docs/en/statis.py",
"chars": 3949,
"preview": "#! /usr/bin/env python\n\nfrom pathlib import Path\n\nimport yaml\nfrom tabulate import tabulate\n\nOC_ROOT = Path(__file__).ab"
},
{
"path": "docs/en/tools.md",
"chars": 5319,
"preview": "# Useful Tools\n\n## Prompt Viewer\n\nThis tool allows you to directly view the generated prompt without starting the full t"
},
{
"path": "docs/en/user_guides/config.md",
"chars": 7281,
"preview": "# Learn About Config\n\nOpenCompass uses the OpenMMLab modern style configuration files. If you are familiar with the Open"
},
{
"path": "docs/en/user_guides/corebench.md",
"chars": 3588,
"preview": "# Performance of Common Benchmarks\n\nWe have identified several well-known benchmarks for evaluating large language model"
},
{
"path": "docs/en/user_guides/datasets.md",
"chars": 4765,
"preview": "# Configure Datasets\n\nThis tutorial mainly focuses on selecting datasets supported by OpenCompass and preparing their co"
},
{
"path": "docs/en/user_guides/deepseek_r1.md",
"chars": 6814,
"preview": "# Tutorial for Evaluating Reasoning Models\n\nOpenCompass provides an evaluation tutorial for DeepSeek R1 series reasoning"
},
{
"path": "docs/en/user_guides/evaluation.md",
"chars": 7991,
"preview": "# Efficient Evaluation\n\nOpenCompass supports custom task partitioners (`Partitioner`), which enable flexible division of"
},
{
"path": "docs/en/user_guides/experimentation.md",
"chars": 6467,
"preview": "# Task Execution and Monitoring\n\n## Launching an Evaluation Task\n\nThe program entry for the evaluation task is `run.py`."
},
{
"path": "docs/en/user_guides/framework_overview.md",
"chars": 7780,
"preview": "# Overview\n\n## Evaluation Targets\n\nThe primary evaluation targets of this algorithm library are large language models. W"
},
{
"path": "docs/en/user_guides/interns1.md",
"chars": 2512,
"preview": "# Tutorial for Evaluating Intern-S1\n\nOpenCompass now provides the necessary configs for evaluating Intern-S1. Please per"
},
{
"path": "docs/en/user_guides/metrics.md",
"chars": 6314,
"preview": "# Metric Calculation\n\nIn the evaluation phase, we typically select the corresponding evaluation metric strategy based on"
},
{
"path": "docs/en/user_guides/models.md",
"chars": 4800,
"preview": "# Prepare Models\n\nTo support the evaluation of new models in OpenCompass, there are several ways:\n\n1. HuggingFace-based "
},
{
"path": "docs/en/user_guides/summarizer.md",
"chars": 3311,
"preview": "# Results Summary\n\nAfter the evaluation is complete, the results need to be printed on the screen or saved. This process"
},
{
"path": "docs/zh_cn/.readthedocs.yaml",
"chars": 252,
"preview": "version: 2\n\n# Set the version of Python and other tools you might need\nbuild:\n os: ubuntu-22.04\n tools:\n python: \"3"
},
{
"path": "docs/zh_cn/Makefile",
"chars": 634,
"preview": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line, and also\n# from the "
},
{
"path": "docs/zh_cn/_static/css/readthedocs.css",
"chars": 1018,
"preview": ".header-logo {\n background-image: url(\"../image/logo.svg\");\n background-size: 275px 80px;\n height: 80px;\n wi"
},
{
"path": "docs/zh_cn/_static/js/custom.js",
"chars": 446,
"preview": "var collapsedSections = ['数据集统计'];\n\n$(document).ready(function () {\n $('.dataset').DataTable({\n \"stateSave\": false,\n"
},
{
"path": "docs/zh_cn/_templates/404.html",
"chars": 473,
"preview": "{% extends \"layout.html\" %}\n\n{% block body %}\n\n<h1>Page Not Found</h1>\n<p>\n The page you are looking for cannot be foun"
},
{
"path": "docs/zh_cn/_templates/autosummary/class.rst",
"chars": 243,
"preview": ".. role:: hidden\n :class: hidden-section\n.. currentmodule:: {{ module }}\n\n\n{{ name | underline}}\n\n.. autoclass:: {{ n"
},
{
"path": "docs/zh_cn/_templates/callable.rst",
"chars": 265,
"preview": ".. role:: hidden\n :class: hidden-section\n.. currentmodule:: {{ module }}\n\n\n{{ name | underline}}\n\n.. autoclass:: {{ n"
},
{
"path": "docs/zh_cn/advanced_guides/accelerator_intro.md",
"chars": 4126,
"preview": "# 使用 vLLM 或 LMDeploy 来一键式加速评测推理\n\n## 背景\n\n在 OpenCompass 评测过程中,默认使用 Huggingface 的 transformers 库进行推理,这是一个非常通用的方案,但在某些情况下,我们"
},
{
"path": "docs/zh_cn/advanced_guides/circular_eval.md",
"chars": 3672,
"preview": "# 循环评测\n\n## 背景\n\n对于选择题而言,当 LLM 给出正确的选项,并不一定代表着它能真正地理解题意并经过推理得出答案,它也有可能是蒙对的。为了将这两种情形区分开,同时也为了降低 LLM 对选项的偏见,我们可以尝试使用循环评测 (Ci"
},
{
"path": "docs/zh_cn/advanced_guides/code_eval.md",
"chars": 3226,
"preview": "# 代码评测教程\n\n这里以 `humaneval` 和 `mbpp` 为例,主要介绍如何评测模型的代码能力。\n\n## pass@1\n\n如果只需要生成单条回复来评测pass@1的性能,可以直接使用[configs/datasets/human"
},
{
"path": "docs/zh_cn/advanced_guides/code_eval_service.md",
"chars": 6087,
"preview": "# 代码评测Docker教程\n\n为了完成LLM代码能力评测,我们需要搭建一套独立的评测环境,避免在开发环境执行错误代码从而造成不可避免的损失。目前 OpenCompass 使用的代码评测服务可参考[code-evaluator](https"
},
{
"path": "docs/zh_cn/advanced_guides/compassbench_intro.md",
"chars": 5523,
"preview": "# CompassBench 介绍\n\n## CompassBench 2.0 v1.3 版本\n\nCompassBench(官方自建榜单)经历了多次更新迭代,从2024年7月起,OpenCompass将会公布自建榜单的评测规则(评测配置文件)"
},
{
"path": "docs/zh_cn/advanced_guides/compassbench_v2_0.md",
"chars": 10324,
"preview": "# CompassBench 2.0 介绍\n\n\n## v1.0介绍\n为支持OpenCompass的年度榜单,本文将提供CompassBench的整体介绍。\n\n本次评测将在语言、知识、创作、推理、数学、代码、长文本、智能体能力的多项任务上开展"
},
{
"path": "docs/zh_cn/advanced_guides/contamination_eval.md",
"chars": 7123,
"preview": "# 数据污染评估\n\n**数据污染** 是指本应用在下游测试任务重的数据出现在了大语言模型 (LLM) 的训练数据中,从而导致在下游任务 (例如,摘要、自然语言推理、文本分类) 上指标虚高,无法反映模型真实泛化能力的现象。\n\n由于数据污染的源"
},
{
"path": "docs/zh_cn/advanced_guides/custom_dataset.md",
"chars": 6604,
"preview": "# 快速评测数据集\n\nOpenCompass提供了两种快速对提供的数据进行评测的路径,即基于ChatMLDataset的数据格式协议和基于CustomDataset的数据格式协议。\n相较于 [new_dataset.md](./new_da"
},
{
"path": "docs/zh_cn/advanced_guides/evaluation_lightllm.md",
"chars": 2287,
"preview": "# 评测 Lightllm 模型\n\n我们支持评测使用 [Lightllm](https://github.com/ModelTC/lightllm) 进行推理的大语言模型。Lightllm 是由商汤科技开发,是一个基于 Python 的 L"
},
{
"path": "docs/zh_cn/advanced_guides/evaluation_lmdeploy.md",
"chars": 2805,
"preview": "# 使用 LMDeploy 加速评测\n\n我们支持在评测大语言模型时,使用 [LMDeploy](https://github.com/InternLM/lmdeploy) 作为推理加速引擎。LMDeploy 是涵盖了 LLM 和 VLM 任"
},
{
"path": "docs/zh_cn/advanced_guides/llm_judge.md",
"chars": 8110,
"preview": "# LLM 作为评判器\n\n## 简介\n\nGenericLLMEvaluator组件特别适用于那些难以通过规则式方法(如正则表达式)进行完美判断的场景,例如:\n\n- 模型不输出选项标识而只输出选项内容的情况\n- 需要事实性判断的数据集\n- 需"
},
{
"path": "docs/zh_cn/advanced_guides/longeval.md",
"chars": 11914,
"preview": "# 长文本评测指引\n\n## 介绍\n\n虽然大语言模型(LLM)如GPT-4在处理自然语言任务已经展现出明显的优势,但目前的开源模型大多只能处理数千个token长度以内的文本,这限制了模型阅读书籍、撰写文本摘要等需要处理长文本的能力。为了探究模"
},
{
"path": "docs/zh_cn/advanced_guides/math_verify.md",
"chars": 3758,
"preview": "# 数学能力评测\n\n## 简介\n\n数学推理能力是大语言模型(LLMs)的一项关键能力。为了评估模型的数学能力,我们需要测试其逐步解决数学问题并提供准确最终答案的能力。OpenCompass 通过 CustomDataset 和 MATHVe"
},
{
"path": "docs/zh_cn/advanced_guides/needleinahaystack_eval.md",
"chars": 5946,
"preview": "# 大海捞针(Needle In A Haystack)实验评估\n\n## 大海捞针测试简介\n\n大海捞针测试(灵感来自[NeedleInAHaystack](https://github.com/gkamradt/LLMTest_Needle"
},
{
"path": "docs/zh_cn/advanced_guides/new_dataset.md",
"chars": 3267,
"preview": "# 支持新数据集\n\n尽管 OpenCompass 已经包含了大多数常用数据集,用户在支持新数据集的时候需要完成以下几个步骤:\n\n1. 在 `opencompass/datasets` 文件夹新增数据集脚本 `mydataset.py`, 该"
},
{
"path": "docs/zh_cn/advanced_guides/new_model.md",
"chars": 2044,
"preview": "# 支持新模型\n\n目前我们已经支持的模型有 HF 模型、部分模型 API 、部分第三方模型。\n\n## 新增API模型\n\n新增基于API的模型,需要在 `opencompass/models` 下新建 `mymodel_api.py` 文件,"
},
{
"path": "docs/zh_cn/advanced_guides/objective_judgelm_evaluation.md",
"chars": 5194,
"preview": "# 用大模型做为JudgeLLM进行客观评测\n\n## 介绍\n\n通常的客观评测虽有标准答案作为参考,但是在实际应用中,模型预测结果可能因为模型指令遵循能力不同或后处理函数的不完善而产生差异,导致无法抽取到正确的答案并与标准答案进行对比。因此客"
},
{
"path": "docs/zh_cn/advanced_guides/persistence.md",
"chars": 1352,
"preview": "# 评测结果持久化\n\n## 介绍\n\n通常情况下,OpenCompass的评测结果将会保存到工作目录下。 但在某些情况下,可能会产生用户间的数据共享,以及快速查看已有的公共评测结果等需求。 因此,我们提供了一个能够将评测结果快速转存到外部公共"
},
{
"path": "docs/zh_cn/advanced_guides/prompt_attack.md",
"chars": 3563,
"preview": "# 提示词攻击\n\nOpenCompass 支持[PromptBench](https://github.com/microsoft/promptbench)的提示词攻击。其主要想法是评估提示指令的鲁棒性,也就是说,当攻击或修改提示以指导任务"
},
{
"path": "docs/zh_cn/advanced_guides/subjective_evaluation.md",
"chars": 5119,
"preview": "# 主观评测指引\n\n## 介绍\n\n主观评测旨在评估模型在符合人类偏好的能力上的表现。这种评估的黄金准则是人类喜好,但标注成本很高。\n\n为了探究模型的主观能力,我们采用了JudgeLLM作为人类评估者的替代品([LLM-as-a-Judge]"
},
{
"path": "docs/zh_cn/conf.py",
"chars": 7038,
"preview": "# flake8: noqa\n# Configuration file for the Sphinx documentation builder.\n#\n# This file only contains a selection of the"
},
{
"path": "docs/zh_cn/cp_origin_docs.sh",
"chars": 232,
"preview": "#!/usr/bin/env bash\n\n# Copy *.md files from docs/ if it doesn't have a Chinese translation\n\nfor filename in $(find ../en"
},
{
"path": "docs/zh_cn/docutils.conf",
"chars": 43,
"preview": "[html writers]\ntable_style: colwidths-auto\n"
},
{
"path": "docs/zh_cn/get_started/faq.md",
"chars": 5956,
"preview": "# 常见问题\n\n## 通用\n\n### ppl 和 gen 有什么区别和联系?\n\n`ppl` 是困惑度 (perplexity) 的缩写,是一种评价模型进行语言建模能力的指标。在 OpenCompass 的语境下,它一般指一种选择题的做法:给"
},
{
"path": "docs/zh_cn/get_started/installation.md",
"chars": 3795,
"preview": "# 安装\n\n## 基础安装\n\n1. 使用Conda准备 OpenCompass 运行环境:\n\n ```bash\n conda create --name opencompass python=3.10 -y\n # conda c"
},
{
"path": "docs/zh_cn/get_started/quick_start.md",
"chars": 14557,
"preview": "# 快速开始\n\n\n\n## 概"
},
{
"path": "docs/zh_cn/index.rst",
"chars": 1811,
"preview": "欢迎来到 OpenCompass 中文教程!\n==========================================\n\nOpenCompass 上手路线\n-------------------------------\n\n为了用"
},
{
"path": "docs/zh_cn/notes/academic.md",
"chars": 3190,
"preview": "# 官网学术榜单精度复现指引\n\n为快捷、直观地向用户展示主流开源社区模型及商用模型在常用数据集上的综合表现,我们在官网以通常两周更新一次的频率持续维护大语言模型的[学术榜单](https://rank.opencompass.org.cn/"
},
{
"path": "docs/zh_cn/notes/contribution_guide.md",
"chars": 3897,
"preview": "# 为 OpenCompass 做贡献\n\n- [为 OpenCompass 做贡献](#为-opencompass-做贡献)\n - [什么是拉取请求?](#什么是拉取请求)\n - [基本的工作流:](#基本的工作流)\n - [具体步骤"
},
{
"path": "docs/zh_cn/notes/news.md",
"chars": 5804,
"preview": "# 新闻\n\n- **\\[2024.05.08\\]** 我们支持了以下四个MoE模型的评测配置文件: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf_mixtral_8x22b_v0_1.py),"
},
{
"path": "docs/zh_cn/prompt/chain_of_thought.md",
"chars": 5068,
"preview": "# Chain of Thought\n\n## 背景\n\nCoT(思维链)是帮助大型语言模型解决如数学问题和关系推理问题等复杂问题的有效方式,在OpenCompass中,我们支持多种类型的CoT方法。\n\n 过程中,我们常常会根据实际的要求往对话内注入一些预定义的字符串,以求模型能按照一定的要求输出内容。例如,在一些 `ch"
},
{
"path": "docs/zh_cn/prompt/overview.md",
"chars": 455,
"preview": "# Prompt 概括\n\n提示词 (prompt) 是 LLM 的输入,用于让 LLM 往后续写内容或计算困惑度 (ppl),提示词的选取会对被评测模型的精度产生重大影响。如何将数据集转换为一系列的提示词的过程是由模板 (template)"
},
{
"path": "docs/zh_cn/prompt/prompt_template.md",
"chars": 11334,
"preview": "# Prompt 模板\n\n## 背景\n\n在语言模型的评测中,我们常会将原始数据集以一定的规则构造成 prompt,以便模型能够按照要求回答问题。\n\n通常,我们会在 prompt 开头放入指令,几个 in-context example(上下"
},
{
"path": "docs/zh_cn/statis.py",
"chars": 3538,
"preview": "#! /usr/bin/env python\n\nfrom pathlib import Path\n\nimport yaml\nfrom tabulate import tabulate\n\nOC_ROOT = Path(__file__).ab"
},
{
"path": "docs/zh_cn/tools.md",
"chars": 3858,
"preview": "# 实用工具\n\n## Prompt Viewer\n\n本工具允许你在不启动完整训练流程的情况下,直接查看生成的 prompt。如果传入的配置仅为数据集配置(如 `configs/datasets/nq/nq_gen_3dcea1.py`),则"
},
{
"path": "docs/zh_cn/user_guides/config.md",
"chars": 4732,
"preview": "# 学习配置文件\n\nOpenCompass 使用 OpenMMLab 新式风格的配置文件。如果你之前熟悉 OpenMMLab 风格的配置文件,可以直接阅读\n[纯 Python 风格的配置文件(Beta)](https://mmengine."
},
{
"path": "docs/zh_cn/user_guides/corebench.md",
"chars": 3449,
"preview": "# 主要数据集性能\n\n我们选择部分用于评估大型语言模型(LLMs)的知名基准,并提供了主要的LLMs在这些数据集上的详细性能结果。\n\n| Model | Version | Metric "
},
{
"path": "docs/zh_cn/user_guides/datasets.md",
"chars": 2994,
"preview": "# 配置数据集\n\n本节教程主要关注如何选择和配置所需要的数据集。请确保你已按照[数据集准备](../get_started/installation.md#数据集准备)中的步骤下载好数据集。\n\n## 数据集配置文件目录结构\n\n首先简单介绍一"
},
{
"path": "docs/zh_cn/user_guides/deepseek_r1.md",
"chars": 5056,
"preview": "# 强推理模型评测教程\n\nOpenCompass提供针对DeepSeek R1系列推理模型的评测教程(数学数据集)。\n\n- 在模型层面,我们建议使用Sampling方式,以减少因为Greedy评测带来的大量重复\n- 在数据集层面,我们对数据"
},
{
"path": "docs/zh_cn/user_guides/evaluation.md",
"chars": 5478,
"preview": "# 数据分片\n\nOpenCompass 支持自定义评测任务的任务划分器(`Partitioner`),实现评测任务的灵活切分;同时配合 `Runner` 控制任务执行的平台,如本机及集群。通过二者的组合,OpenCompass 可以将大评测"
},
{
"path": "docs/zh_cn/user_guides/experimentation.md",
"chars": 3404,
"preview": "# 任务运行和监控\n\n## 评测任务发起\n\n评测任务的程序入口为 `run.py`,使用方法如下:\n\n```shell\npython run.py $EXP {--slurm | --dlc | None} [-p PARTITION] ["
},
{
"path": "docs/zh_cn/user_guides/framework_overview.md",
"chars": 2790,
"preview": "# 整体概括\n\n## 评测对象\n\n本算法库的主要评测对象为语言大模型与多模态大模型。我们以语言大模型为例介绍评测的具体模型类型。\n\n- 基座模型:一般是经过海量的文本数据以自监督学习的方式进行训练获得的模型(如OpenAI的GPT-3,Me"
},
{
"path": "docs/zh_cn/user_guides/interns1.md",
"chars": 1849,
"preview": "# Intern-S1评测教程\n\nOpenCompass现已提供评测Intern-S1所需的相关模型配置与数据集配置。请顺序执行下列步骤来启动对Intern-S1的评测。\n\n## 模型下载与部署\n\nIntern-S1的模型权重现已开源,请从"
},
{
"path": "docs/zh_cn/user_guides/metrics.md",
"chars": 4593,
"preview": "# 评估指标\n\n在评测阶段,我们一般以数据集本身的特性来选取对应的评估策略,最主要的依据为**标准答案的类型**,一般以下几种类型:\n\n- **选项**:常见于分类任务,判断题以及选择题,目前这类问题的数据集占比最大,有 MMLU, CEv"
},
{
"path": "docs/zh_cn/user_guides/models.md",
"chars": 2887,
"preview": "# 准备模型\n\n要在 OpenCompass 中支持新模型的评测,有以下几种方式:\n\n1. 基于 HuggingFace 的模型\n2. 基于 API 的模型\n3. 自定义模型\n\n## 基于 HuggingFace 的模型\n\n在 OpenCo"
},
{
"path": "docs/zh_cn/user_guides/summarizer.md",
"chars": 1822,
"preview": "# 结果展示\n\n在评测完成后,评测的结果需要被打印到屏幕或者被保存下来,该过程是由 summarizer 控制的。\n\n```{note}\n如果 summarizer 出现在了 config 中,则评测结果输出会按照下述逻辑进行。\n如果 su"
},
{
"path": "examples/eval_OlympiadBench.py",
"chars": 1157,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from opencompass.configs.datasets.OlympiadBench.OlympiadBen"
},
{
"path": "examples/eval_PMMEval.py",
"chars": 1433,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.models import HuggingFacewithChatTemplate\n\nwith read_base():\n "
},
{
"path": "examples/eval_ProcessBench.py",
"chars": 1742,
"preview": "from mmengine.config import read_base\nfrom opencompass.models import VLLMwithChatTemplate\n\nwith read_base():\n from op"
},
{
"path": "examples/eval_TheoremQA.py",
"chars": 1155,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot"
},
{
"path": "examples/eval_academic_leaderboard_202407.py",
"chars": 7785,
"preview": "import os.path as osp\n\nfrom mmengine.config import read_base\n\nfrom opencompass.partitioners import NaivePartitioner, Num"
},
{
"path": "examples/eval_academic_leaderboard_202412.py",
"chars": 4864,
"preview": "import os.path as osp\n\nfrom mmengine.config import read_base\n\nfrom opencompass.partitioners import NaivePartitioner, Num"
},
{
"path": "examples/eval_academic_leaderboard_202502.py",
"chars": 5172,
"preview": "# flake8: noqa\n\nfrom mmengine.config import read_base\n\nfrom opencompass.partitioners import NaivePartitioner, NumWorkerP"
},
{
"path": "examples/eval_academic_leaderboard_REALTIME.py",
"chars": 4886,
"preview": "# flake8: noqa\n\nfrom mmengine.config import read_base\n\nfrom opencompass.partitioners import NaivePartitioner, NumWorkerP"
},
{
"path": "examples/eval_academic_telechat_thinking.py",
"chars": 5370,
"preview": "# flake8: noqa\n\nfrom mmengine.config import read_base\n\nfrom opencompass.partitioners import NaivePartitioner, NumWorkerP"
},
{
"path": "examples/eval_alaya.py",
"chars": 613,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from opencompass.configs.datasets.agieval.agieval_gen impor"
},
{
"path": "examples/eval_api_demo.py",
"chars": 395,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen "
},
{
"path": "examples/eval_attack.py",
"chars": 847,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.partitioners import NaivePartitioner\nfrom opencompass.runners im"
},
{
"path": "examples/eval_babilong.py",
"chars": 2039,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n # Models\n # Datasets\n from opencompass.configs.datase"
},
{
"path": "examples/eval_base_demo.py",
"chars": 566,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from opencompass.configs.datasets.demo.demo_gsm8k_base_gen "
},
{
"path": "examples/eval_bench_intern_s1.py",
"chars": 5999,
"preview": "# flake8: noqa\n\nfrom mmengine.config import read_base\n\nfrom opencompass.partitioners import NaivePartitioner, NumWorkerP"
},
{
"path": "examples/eval_bluelm_32k_lveval.py",
"chars": 603,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from opencompass.configs.datasets.lveval.lveval import \\\n "
},
{
"path": "examples/eval_cascade_evaluator.py",
"chars": 5788,
"preview": "\nfrom mmengine.config import read_base\n\nfrom opencompass.openicl.icl_prompt_template import PromptTemplate\nfrom opencomp"
},
{
"path": "examples/eval_charm_mem.py",
"chars": 5366,
"preview": "from mmengine.config import read_base\nfrom opencompassopencompass.configs.models import OpenAI\n\nfrom opencompass.partiti"
},
{
"path": "examples/eval_charm_rea.py",
"chars": 6211,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from opencompass.configs.datasets.CHARM.charm_reason_gen_f8"
},
{
"path": "examples/eval_chat_agent.py",
"chars": 2217,
"preview": "from lagent import ReAct\nfrom lagent.agents.react import ReActProtocol\nfrom mmengine.config import read_base\n\nfrom openc"
},
{
"path": "examples/eval_chat_agent_baseline.py",
"chars": 1103,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.models.openai_api import OpenAI\nfrom opencompass.partitioners im"
},
{
"path": "examples/eval_chat_demo.py",
"chars": 608,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen "
},
{
"path": "examples/eval_chat_last.py",
"chars": 993,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.models.openai_api import OpenAI\nfrom opencompass.openicl import "
},
{
"path": "examples/eval_chatml_datasets.py",
"chars": 1712,
"preview": "# flake8: noqa\n\nfrom mmengine.config import read_base\n\nfrom opencompass.partitioners import NaivePartitioner, NumWorkerP"
},
{
"path": "examples/eval_chembench.py",
"chars": 1329,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from opencompass.configs.datasets.ChemBench.ChemBench_gen i"
},
{
"path": "examples/eval_chinese_simpleqa.py",
"chars": 2272,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from opencompass.configs.datasets.chinese_simpleqa.chinese_"
},
{
"path": "examples/eval_cibench.py",
"chars": 5510,
"preview": "from copy import deepcopy\n\nfrom lagent import ReAct\nfrom lagent.agents.react import ReActProtocol\nfrom mmengine.config i"
},
{
"path": "examples/eval_cibench_api.py",
"chars": 4019,
"preview": "from lagent.agents.react import ReActProtocol\nfrom mmengine.config import read_base\n\nfrom opencompass.lagent.actions.ipy"
},
{
"path": "examples/eval_circular.py",
"chars": 4362,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.datasets.circular import (\n CircularARCDataset, CircularCEval"
},
{
"path": "examples/eval_claude.py",
"chars": 674,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.partitioners import NaivePartitioner\nfrom opencompass.runners im"
},
{
"path": "examples/eval_code_passk.py",
"chars": 1749,
"preview": "# This config is used for pass@k evaluation with `num_return_sequences`\n# That model can generate multiple responses for"
},
{
"path": "examples/eval_code_passk_repeat_dataset.py",
"chars": 1917,
"preview": "# This config is used for pass@k evaluation with dataset repetition\n# That model cannot generate multiple response for s"
},
{
"path": "examples/eval_codeagent.py",
"chars": 1649,
"preview": "from mmengine.config import read_base\n\nfrom opencompass.models import HuggingFaceCausalLM, OpenAI\nfrom opencompass.model"
},
{
"path": "examples/eval_codebench_full.py",
"chars": 4809,
"preview": "# This config is used to test all the code benchmarks\nfrom mmengine.config import read_base\nimport os.path as osp\nfrom o"
},
{
"path": "examples/eval_codegeex2.py",
"chars": 266,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from opencompass.configs.datasets.humanevalx.humanevalx_gen"
},
{
"path": "examples/eval_compassarena_subjectivebench.py",
"chars": 4815,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from opencompass.configs.datasets.subjective.compass_arena_"
},
{
"path": "examples/eval_compassarena_subjectivebench_bradleyterry.py",
"chars": 5052,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from opencompass.configs.datasets.subjective.compass_arena_"
},
{
"path": "examples/eval_contamination.py",
"chars": 934,
"preview": "from mmengine.config import read_base\n\nwith read_base():\n from opencompass.configs.datasets.ARC_c.ARC_c_clean_ppl imp"
},
{
"path": "examples/eval_corebench_2409_base_objective.py",
"chars": 7586,
"preview": "import os.path as osp\n\nfrom mmengine.config import read_base\n\nfrom opencompass.partitioners import NaivePartitioner, Num"
},
{
"path": "examples/eval_corebench_2409_chat_objective.py",
"chars": 8751,
"preview": "import os.path as osp\n\nfrom mmengine.config import read_base\n\nfrom opencompass.partitioners import NaivePartitioner, Num"
},
{
"path": "examples/eval_corebench_2409_longcontext.py",
"chars": 5884,
"preview": "import os.path as osp\nfrom copy import deepcopy\n\nfrom mmengine.config import read_base\n\nfrom opencompass.models import ("
}
]
// ... and 2968 more files (download for full content)
About this extraction
This page contains the full source code of the open-compass/opencompass GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 3168 files (10.1 MB), approximately 2.9M tokens, and a symbol index with 5483 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.