Repository: EleutherAI/lm-evaluation-harness Branch: main Commit: ee7e8f4fe58e Files: 15734 Total size: 11.1 MB Directory structure: gitextract_npmvb7su/ ├── .github/ │ └── workflows/ │ ├── new_tasks.yml │ ├── publish.yml │ └── unit_tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.bib ├── CODEOWNERS ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── docs/ │ ├── API_guide.md │ ├── CONTRIBUTING.md │ ├── README.md │ ├── chat-template-readme.md │ ├── config_files.md │ ├── decontamination.md │ ├── footguns.md │ ├── interface.md │ ├── model_guide.md │ ├── new_task_guide.md │ ├── python-api.md │ └── task_guide.md ├── examples/ │ ├── lm-eval-overview.ipynb │ ├── transformer-lens.py │ ├── visualize-wandb.ipynb │ └── visualize-zeno.ipynb ├── ignore.txt ├── lm_eval/ │ ├── __init__.py │ ├── __main__.py │ ├── _cli/ │ │ ├── __init__.py │ │ ├── harness.py │ │ ├── ls.py │ │ ├── run.py │ │ ├── subcommand.py │ │ ├── utils.py │ │ └── validate.py │ ├── api/ │ │ ├── __init__.py │ │ ├── filter.py │ │ ├── group.py │ │ ├── instance.py │ │ ├── metrics.py │ │ ├── model.py │ │ ├── registry.py │ │ ├── samplers.py │ │ ├── task.py │ │ └── utils.py │ ├── caching/ │ │ ├── __init__.py │ │ └── cache.py │ ├── config/ │ │ ├── __init__.py │ │ ├── evaluate_config.py │ │ ├── group.py │ │ └── task.py │ ├── decontamination/ │ │ ├── __init__.py │ │ ├── archiver.py │ │ ├── decontaminate.py │ │ └── janitor.py │ ├── defaults.py │ ├── evaluator.py │ ├── evaluator_utils.py │ ├── filters/ │ │ ├── __init__.py │ │ ├── custom.py │ │ ├── decontamination.py │ │ ├── extraction.py │ │ ├── selection.py │ │ └── transformation.py │ ├── loggers/ │ │ ├── __init__.py │ │ ├── evaluation_tracker.py │ │ ├── utils.py │ │ └── wandb_logger.py │ ├── models/ │ │ ├── __init__.py │ │ ├── anthropic_llms.py │ │ ├── api_models.py │ │ ├── dummy.py │ │ ├── gguf.py │ │ ├── hf_audiolm.py │ │ ├── hf_steered.py │ │ ├── hf_vlms.py │ │ ├── huggingface.py │ │ ├── ibm_watsonx_ai.py │ │ ├── mamba_lm.py │ │ ├── megatron_lm.py │ │ ├── 
mistral3.py │ │ ├── nemo_lm.py │ │ ├── neuron_optimum.py │ │ ├── openai_completions.py │ │ ├── optimum_habana.py │ │ ├── optimum_ipex.py │ │ ├── optimum_lm.py │ │ ├── sglang_causallms.py │ │ ├── sglang_generate_API.py │ │ ├── textsynth.py │ │ ├── utils.py │ │ ├── utils_hf.py │ │ ├── vllm_causallms.py │ │ ├── vllm_vlms.py │ │ └── winml.py │ ├── prompts/ │ │ └── __init__.py │ ├── result_schema.py │ ├── tasks/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── _factory.py │ │ ├── _index.py │ │ ├── _yaml_loader.py │ │ ├── aclue/ │ │ │ ├── README.md │ │ │ ├── _aclue.yaml │ │ │ ├── _default_template_yaml │ │ │ ├── _generate_configs.py │ │ │ ├── aclue_ancient_chinese_culture.yaml │ │ │ ├── aclue_ancient_literature.yaml │ │ │ ├── aclue_ancient_medical.yaml │ │ │ ├── aclue_ancient_phonetics.yaml │ │ │ ├── aclue_basic_ancient_chinese.yaml │ │ │ ├── aclue_couplet_prediction.yaml │ │ │ ├── aclue_homographic_character_resolution.yaml │ │ │ ├── aclue_named_entity_recognition.yaml │ │ │ ├── aclue_poetry_appreciate.yaml │ │ │ ├── aclue_poetry_context_prediction.yaml │ │ │ ├── aclue_poetry_quality_assessment.yaml │ │ │ ├── aclue_poetry_sentiment_analysis.yaml │ │ │ ├── aclue_polysemy_resolution.yaml │ │ │ ├── aclue_reading_comprehension.yaml │ │ │ └── aclue_sentence_segmentation.yaml │ │ ├── acpbench/ │ │ │ ├── README.md │ │ │ ├── boolq_cot_2shot/ │ │ │ │ ├── _boolq_cot_2shot_yaml │ │ │ │ ├── act_reach.yaml │ │ │ │ ├── app.yaml │ │ │ │ ├── just.yaml │ │ │ │ ├── land.yaml │ │ │ │ ├── prog.yaml │ │ │ │ ├── reach.yaml │ │ │ │ └── val.yaml │ │ │ ├── gen_2shot/ │ │ │ │ ├── _gen_yaml_2shot │ │ │ │ ├── acp_grammar.lark │ │ │ │ ├── acp_utils.py │ │ │ │ ├── act_reach.yaml │ │ │ │ ├── app.yaml │ │ │ │ ├── just.yaml │ │ │ │ ├── land.yaml │ │ │ │ ├── next_act.yaml │ │ │ │ ├── prog.yaml │ │ │ │ ├── reach.yaml │ │ │ │ └── val.yaml │ │ │ ├── gen_2shot_with_pddl/ │ │ │ │ ├── _gen_yaml_2shot │ │ │ │ ├── acp_grammar.lark │ │ │ │ ├── acp_utils.py │ │ │ │ ├── act_reach.yaml │ │ │ │ ├── app.yaml │ │ │ │ ├── 
just.yaml │ │ │ │ ├── land.yaml │ │ │ │ ├── next_act.yaml │ │ │ │ ├── prog.yaml │ │ │ │ ├── reach.yaml │ │ │ │ └── val.yaml │ │ │ └── mcq_cot_2shot/ │ │ │ ├── _mcq_cot_2shot_yaml │ │ │ ├── act_reach.yaml │ │ │ ├── app.yaml │ │ │ ├── just.yaml │ │ │ ├── land.yaml │ │ │ ├── prog.yaml │ │ │ ├── reach.yaml │ │ │ └── val.yaml │ │ ├── aexams/ │ │ │ ├── README.md │ │ │ ├── _aexams.yaml │ │ │ ├── _default_template_yaml │ │ │ ├── aexams_Biology.yaml │ │ │ ├── aexams_IslamicStudies.yaml │ │ │ ├── aexams_Physics.yaml │ │ │ ├── aexams_Science.yaml │ │ │ └── aexams_Social.yaml │ │ ├── afrimgsm/ │ │ │ ├── README.md │ │ │ ├── direct/ │ │ │ │ ├── afrimgsm.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── afrimgsm_amh.yaml │ │ │ │ │ ├── afrimgsm_eng.yaml │ │ │ │ │ ├── afrimgsm_ewe.yaml │ │ │ │ │ ├── afrimgsm_fra.yaml │ │ │ │ │ ├── afrimgsm_hau.yaml │ │ │ │ │ ├── afrimgsm_ibo.yaml │ │ │ │ │ ├── afrimgsm_kin.yaml │ │ │ │ │ ├── afrimgsm_lin.yaml │ │ │ │ │ ├── afrimgsm_lug.yaml │ │ │ │ │ ├── afrimgsm_orm.yaml │ │ │ │ │ ├── afrimgsm_sna.yaml │ │ │ │ │ ├── afrimgsm_sot.yaml │ │ │ │ │ ├── afrimgsm_swa.yaml │ │ │ │ │ ├── afrimgsm_twi.yaml │ │ │ │ │ ├── afrimgsm_vai.yaml │ │ │ │ │ ├── afrimgsm_wol.yaml │ │ │ │ │ ├── afrimgsm_xho.yaml │ │ │ │ │ ├── afrimgsm_yaml │ │ │ │ │ ├── afrimgsm_yor.yaml │ │ │ │ │ └── afrimgsm_zul.yaml │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── afrimgsm_amh.yaml │ │ │ │ │ ├── afrimgsm_eng.yaml │ │ │ │ │ ├── afrimgsm_ewe.yaml │ │ │ │ │ ├── afrimgsm_fra.yaml │ │ │ │ │ ├── afrimgsm_hau.yaml │ │ │ │ │ ├── afrimgsm_ibo.yaml │ │ │ │ │ ├── afrimgsm_kin.yaml │ │ │ │ │ ├── afrimgsm_lin.yaml │ │ │ │ │ ├── afrimgsm_lug.yaml │ │ │ │ │ ├── afrimgsm_orm.yaml │ │ │ │ │ ├── afrimgsm_sna.yaml │ │ │ │ │ ├── afrimgsm_sot.yaml │ │ │ │ │ ├── afrimgsm_swa.yaml │ │ │ │ │ ├── afrimgsm_twi.yaml │ │ │ │ │ ├── afrimgsm_vai.yaml │ │ │ │ │ ├── afrimgsm_wol.yaml │ │ │ │ │ ├── afrimgsm_xho.yaml │ │ │ │ │ ├── afrimgsm_yaml │ │ │ │ │ ├── afrimgsm_yor.yaml │ │ │ │ │ └── afrimgsm_zul.yaml │ │ │ │ ├── prompt_3/ │ │ │ │ 
│ ├── afrimgsm_amh.yaml │ │ │ │ │ ├── afrimgsm_eng.yaml │ │ │ │ │ ├── afrimgsm_ewe.yaml │ │ │ │ │ ├── afrimgsm_fra.yaml │ │ │ │ │ ├── afrimgsm_hau.yaml │ │ │ │ │ ├── afrimgsm_ibo.yaml │ │ │ │ │ ├── afrimgsm_kin.yaml │ │ │ │ │ ├── afrimgsm_lin.yaml │ │ │ │ │ ├── afrimgsm_lug.yaml │ │ │ │ │ ├── afrimgsm_orm.yaml │ │ │ │ │ ├── afrimgsm_sna.yaml │ │ │ │ │ ├── afrimgsm_sot.yaml │ │ │ │ │ ├── afrimgsm_swa.yaml │ │ │ │ │ ├── afrimgsm_twi.yaml │ │ │ │ │ ├── afrimgsm_vai.yaml │ │ │ │ │ ├── afrimgsm_wol.yaml │ │ │ │ │ ├── afrimgsm_xho.yaml │ │ │ │ │ ├── afrimgsm_yaml │ │ │ │ │ ├── afrimgsm_yor.yaml │ │ │ │ │ └── afrimgsm_zul.yaml │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── afrimgsm_amh.yaml │ │ │ │ │ ├── afrimgsm_eng.yaml │ │ │ │ │ ├── afrimgsm_ewe.yaml │ │ │ │ │ ├── afrimgsm_fra.yaml │ │ │ │ │ ├── afrimgsm_hau.yaml │ │ │ │ │ ├── afrimgsm_ibo.yaml │ │ │ │ │ ├── afrimgsm_kin.yaml │ │ │ │ │ ├── afrimgsm_lin.yaml │ │ │ │ │ ├── afrimgsm_lug.yaml │ │ │ │ │ ├── afrimgsm_orm.yaml │ │ │ │ │ ├── afrimgsm_sna.yaml │ │ │ │ │ ├── afrimgsm_sot.yaml │ │ │ │ │ ├── afrimgsm_swa.yaml │ │ │ │ │ ├── afrimgsm_twi.yaml │ │ │ │ │ ├── afrimgsm_vai.yaml │ │ │ │ │ ├── afrimgsm_wol.yaml │ │ │ │ │ ├── afrimgsm_xho.yaml │ │ │ │ │ ├── afrimgsm_yaml │ │ │ │ │ ├── afrimgsm_yor.yaml │ │ │ │ │ └── afrimgsm_zul.yaml │ │ │ │ └── prompt_5/ │ │ │ │ ├── afrimgsm_amh.yaml │ │ │ │ ├── afrimgsm_eng.yaml │ │ │ │ ├── afrimgsm_ewe.yaml │ │ │ │ ├── afrimgsm_fra.yaml │ │ │ │ ├── afrimgsm_hau.yaml │ │ │ │ ├── afrimgsm_ibo.yaml │ │ │ │ ├── afrimgsm_kin.yaml │ │ │ │ ├── afrimgsm_lin.yaml │ │ │ │ ├── afrimgsm_lug.yaml │ │ │ │ ├── afrimgsm_orm.yaml │ │ │ │ ├── afrimgsm_sna.yaml │ │ │ │ ├── afrimgsm_sot.yaml │ │ │ │ ├── afrimgsm_swa.yaml │ │ │ │ ├── afrimgsm_twi.yaml │ │ │ │ ├── afrimgsm_vai.yaml │ │ │ │ ├── afrimgsm_wol.yaml │ │ │ │ ├── afrimgsm_xho.yaml │ │ │ │ ├── afrimgsm_yaml │ │ │ │ ├── afrimgsm_yor.yaml │ │ │ │ └── afrimgsm_zul.yaml │ │ │ ├── direct_cot/ │ │ │ │ ├── afrimgsm_cot.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── 
afrimgsm_cot_amh.yaml │ │ │ │ │ ├── afrimgsm_cot_eng.yaml │ │ │ │ │ ├── afrimgsm_cot_ewe.yaml │ │ │ │ │ ├── afrimgsm_cot_fra.yaml │ │ │ │ │ ├── afrimgsm_cot_hau.yaml │ │ │ │ │ ├── afrimgsm_cot_ibo.yaml │ │ │ │ │ ├── afrimgsm_cot_kin.yaml │ │ │ │ │ ├── afrimgsm_cot_lin.yaml │ │ │ │ │ ├── afrimgsm_cot_lug.yaml │ │ │ │ │ ├── afrimgsm_cot_orm.yaml │ │ │ │ │ ├── afrimgsm_cot_sna.yaml │ │ │ │ │ ├── afrimgsm_cot_sot.yaml │ │ │ │ │ ├── afrimgsm_cot_swa.yaml │ │ │ │ │ ├── afrimgsm_cot_twi.yaml │ │ │ │ │ ├── afrimgsm_cot_vai.yaml │ │ │ │ │ ├── afrimgsm_cot_wol.yaml │ │ │ │ │ ├── afrimgsm_cot_xho.yaml │ │ │ │ │ ├── afrimgsm_cot_yaml │ │ │ │ │ ├── afrimgsm_cot_yor.yaml │ │ │ │ │ └── afrimgsm_cot_zul.yaml │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── afrimgsm_cot_amh.yaml │ │ │ │ │ ├── afrimgsm_cot_eng.yaml │ │ │ │ │ ├── afrimgsm_cot_ewe.yaml │ │ │ │ │ ├── afrimgsm_cot_fra.yaml │ │ │ │ │ ├── afrimgsm_cot_hau.yaml │ │ │ │ │ ├── afrimgsm_cot_ibo.yaml │ │ │ │ │ ├── afrimgsm_cot_kin.yaml │ │ │ │ │ ├── afrimgsm_cot_lin.yaml │ │ │ │ │ ├── afrimgsm_cot_lug.yaml │ │ │ │ │ ├── afrimgsm_cot_orm.yaml │ │ │ │ │ ├── afrimgsm_cot_sna.yaml │ │ │ │ │ ├── afrimgsm_cot_sot.yaml │ │ │ │ │ ├── afrimgsm_cot_swa.yaml │ │ │ │ │ ├── afrimgsm_cot_twi.yaml │ │ │ │ │ ├── afrimgsm_cot_vai.yaml │ │ │ │ │ ├── afrimgsm_cot_wol.yaml │ │ │ │ │ ├── afrimgsm_cot_xho.yaml │ │ │ │ │ ├── afrimgsm_cot_yaml │ │ │ │ │ ├── afrimgsm_cot_yor.yaml │ │ │ │ │ └── afrimgsm_cot_zul.yaml │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── afrimgsm_cot_amh.yaml │ │ │ │ │ ├── afrimgsm_cot_eng.yaml │ │ │ │ │ ├── afrimgsm_cot_ewe.yaml │ │ │ │ │ ├── afrimgsm_cot_fra.yaml │ │ │ │ │ ├── afrimgsm_cot_hau.yaml │ │ │ │ │ ├── afrimgsm_cot_ibo.yaml │ │ │ │ │ ├── afrimgsm_cot_kin.yaml │ │ │ │ │ ├── afrimgsm_cot_lin.yaml │ │ │ │ │ ├── afrimgsm_cot_lug.yaml │ │ │ │ │ ├── afrimgsm_cot_orm.yaml │ │ │ │ │ ├── afrimgsm_cot_sna.yaml │ │ │ │ │ ├── afrimgsm_cot_sot.yaml │ │ │ │ │ ├── afrimgsm_cot_swa.yaml │ │ │ │ │ ├── afrimgsm_cot_twi.yaml │ │ │ │ │ ├── 
afrimgsm_cot_vai.yaml │ │ │ │ │ ├── afrimgsm_cot_wol.yaml │ │ │ │ │ ├── afrimgsm_cot_xho.yaml │ │ │ │ │ ├── afrimgsm_cot_yaml │ │ │ │ │ ├── afrimgsm_cot_yor.yaml │ │ │ │ │ └── afrimgsm_cot_zul.yaml │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── afrimgsm_cot_amh.yaml │ │ │ │ │ ├── afrimgsm_cot_eng.yaml │ │ │ │ │ ├── afrimgsm_cot_ewe.yaml │ │ │ │ │ ├── afrimgsm_cot_fra.yaml │ │ │ │ │ ├── afrimgsm_cot_hau.yaml │ │ │ │ │ ├── afrimgsm_cot_ibo.yaml │ │ │ │ │ ├── afrimgsm_cot_kin.yaml │ │ │ │ │ ├── afrimgsm_cot_lin.yaml │ │ │ │ │ ├── afrimgsm_cot_lug.yaml │ │ │ │ │ ├── afrimgsm_cot_orm.yaml │ │ │ │ │ ├── afrimgsm_cot_sna.yaml │ │ │ │ │ ├── afrimgsm_cot_sot.yaml │ │ │ │ │ ├── afrimgsm_cot_swa.yaml │ │ │ │ │ ├── afrimgsm_cot_twi.yaml │ │ │ │ │ ├── afrimgsm_cot_vai.yaml │ │ │ │ │ ├── afrimgsm_cot_wol.yaml │ │ │ │ │ ├── afrimgsm_cot_xho.yaml │ │ │ │ │ ├── afrimgsm_cot_yaml │ │ │ │ │ ├── afrimgsm_cot_yor.yaml │ │ │ │ │ └── afrimgsm_cot_zul.yaml │ │ │ │ └── prompt_5/ │ │ │ │ ├── afrimgsm_cot_amh.yaml │ │ │ │ ├── afrimgsm_cot_eng.yaml │ │ │ │ ├── afrimgsm_cot_ewe.yaml │ │ │ │ ├── afrimgsm_cot_fra.yaml │ │ │ │ ├── afrimgsm_cot_hau.yaml │ │ │ │ ├── afrimgsm_cot_ibo.yaml │ │ │ │ ├── afrimgsm_cot_kin.yaml │ │ │ │ ├── afrimgsm_cot_lin.yaml │ │ │ │ ├── afrimgsm_cot_lug.yaml │ │ │ │ ├── afrimgsm_cot_orm.yaml │ │ │ │ ├── afrimgsm_cot_sna.yaml │ │ │ │ ├── afrimgsm_cot_sot.yaml │ │ │ │ ├── afrimgsm_cot_swa.yaml │ │ │ │ ├── afrimgsm_cot_twi.yaml │ │ │ │ ├── afrimgsm_cot_vai.yaml │ │ │ │ ├── afrimgsm_cot_wol.yaml │ │ │ │ ├── afrimgsm_cot_xho.yaml │ │ │ │ ├── afrimgsm_cot_yaml │ │ │ │ ├── afrimgsm_cot_yor.yaml │ │ │ │ └── afrimgsm_cot_zul.yaml │ │ │ ├── gen_utils.py │ │ │ ├── gen_yaml.sh │ │ │ ├── run.sh │ │ │ ├── translate/ │ │ │ │ ├── afrimgsm_tt.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── afrimgsm_translate_amh.yaml │ │ │ │ │ ├── afrimgsm_translate_ewe.yaml │ │ │ │ │ ├── afrimgsm_translate_fra.yaml │ │ │ │ │ ├── afrimgsm_translate_hau.yaml │ │ │ │ │ ├── afrimgsm_translate_ibo.yaml │ │ │ │ │ ├── 
afrimgsm_translate_kin.yaml │ │ │ │ │ ├── afrimgsm_translate_lin.yaml │ │ │ │ │ ├── afrimgsm_translate_lug.yaml │ │ │ │ │ ├── afrimgsm_translate_orm.yaml │ │ │ │ │ ├── afrimgsm_translate_sna.yaml │ │ │ │ │ ├── afrimgsm_translate_sot.yaml │ │ │ │ │ ├── afrimgsm_translate_swa.yaml │ │ │ │ │ ├── afrimgsm_translate_twi.yaml │ │ │ │ │ ├── afrimgsm_translate_wol.yaml │ │ │ │ │ ├── afrimgsm_translate_xho.yaml │ │ │ │ │ ├── afrimgsm_translate_yaml │ │ │ │ │ ├── afrimgsm_translate_yor.yaml │ │ │ │ │ └── afrimgsm_translate_zul.yaml │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── afrimgsm_translate_amh.yaml │ │ │ │ │ ├── afrimgsm_translate_ewe.yaml │ │ │ │ │ ├── afrimgsm_translate_fra.yaml │ │ │ │ │ ├── afrimgsm_translate_hau.yaml │ │ │ │ │ ├── afrimgsm_translate_ibo.yaml │ │ │ │ │ ├── afrimgsm_translate_kin.yaml │ │ │ │ │ ├── afrimgsm_translate_lin.yaml │ │ │ │ │ ├── afrimgsm_translate_lug.yaml │ │ │ │ │ ├── afrimgsm_translate_orm.yaml │ │ │ │ │ ├── afrimgsm_translate_sna.yaml │ │ │ │ │ ├── afrimgsm_translate_sot.yaml │ │ │ │ │ ├── afrimgsm_translate_swa.yaml │ │ │ │ │ ├── afrimgsm_translate_twi.yaml │ │ │ │ │ ├── afrimgsm_translate_wol.yaml │ │ │ │ │ ├── afrimgsm_translate_xho.yaml │ │ │ │ │ ├── afrimgsm_translate_yaml │ │ │ │ │ ├── afrimgsm_translate_yor.yaml │ │ │ │ │ └── afrimgsm_translate_zul.yaml │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── afrimgsm_translate_amh.yaml │ │ │ │ │ ├── afrimgsm_translate_ewe.yaml │ │ │ │ │ ├── afrimgsm_translate_fra.yaml │ │ │ │ │ ├── afrimgsm_translate_hau.yaml │ │ │ │ │ ├── afrimgsm_translate_ibo.yaml │ │ │ │ │ ├── afrimgsm_translate_kin.yaml │ │ │ │ │ ├── afrimgsm_translate_lin.yaml │ │ │ │ │ ├── afrimgsm_translate_lug.yaml │ │ │ │ │ ├── afrimgsm_translate_orm.yaml │ │ │ │ │ ├── afrimgsm_translate_sna.yaml │ │ │ │ │ ├── afrimgsm_translate_sot.yaml │ │ │ │ │ ├── afrimgsm_translate_swa.yaml │ │ │ │ │ ├── afrimgsm_translate_twi.yaml │ │ │ │ │ ├── afrimgsm_translate_wol.yaml │ │ │ │ │ ├── afrimgsm_translate_xho.yaml │ │ │ │ │ ├── afrimgsm_translate_yaml │ │ │ │ 
│ ├── afrimgsm_translate_yor.yaml │ │ │ │ │ └── afrimgsm_translate_zul.yaml │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── afrimgsm_translate_amh.yaml │ │ │ │ │ ├── afrimgsm_translate_ewe.yaml │ │ │ │ │ ├── afrimgsm_translate_fra.yaml │ │ │ │ │ ├── afrimgsm_translate_hau.yaml │ │ │ │ │ ├── afrimgsm_translate_ibo.yaml │ │ │ │ │ ├── afrimgsm_translate_kin.yaml │ │ │ │ │ ├── afrimgsm_translate_lin.yaml │ │ │ │ │ ├── afrimgsm_translate_lug.yaml │ │ │ │ │ ├── afrimgsm_translate_orm.yaml │ │ │ │ │ ├── afrimgsm_translate_sna.yaml │ │ │ │ │ ├── afrimgsm_translate_sot.yaml │ │ │ │ │ ├── afrimgsm_translate_swa.yaml │ │ │ │ │ ├── afrimgsm_translate_twi.yaml │ │ │ │ │ ├── afrimgsm_translate_wol.yaml │ │ │ │ │ ├── afrimgsm_translate_xho.yaml │ │ │ │ │ ├── afrimgsm_translate_yaml │ │ │ │ │ ├── afrimgsm_translate_yor.yaml │ │ │ │ │ └── afrimgsm_translate_zul.yaml │ │ │ │ └── prompt_5/ │ │ │ │ ├── afrimgsm_translate_amh.yaml │ │ │ │ ├── afrimgsm_translate_ewe.yaml │ │ │ │ ├── afrimgsm_translate_fra.yaml │ │ │ │ ├── afrimgsm_translate_hau.yaml │ │ │ │ ├── afrimgsm_translate_ibo.yaml │ │ │ │ ├── afrimgsm_translate_kin.yaml │ │ │ │ ├── afrimgsm_translate_lin.yaml │ │ │ │ ├── afrimgsm_translate_lug.yaml │ │ │ │ ├── afrimgsm_translate_orm.yaml │ │ │ │ ├── afrimgsm_translate_sna.yaml │ │ │ │ ├── afrimgsm_translate_sot.yaml │ │ │ │ ├── afrimgsm_translate_swa.yaml │ │ │ │ ├── afrimgsm_translate_twi.yaml │ │ │ │ ├── afrimgsm_translate_wol.yaml │ │ │ │ ├── afrimgsm_translate_xho.yaml │ │ │ │ ├── afrimgsm_translate_yaml │ │ │ │ ├── afrimgsm_translate_yor.yaml │ │ │ │ └── afrimgsm_translate_zul.yaml │ │ │ ├── translate_cot/ │ │ │ │ ├── afrimgsm_tt_cot.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── afrimgsm_cot_translate_amh.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_ewe.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_fra.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_hau.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_ibo.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_kin.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_lin.yaml │ │ │ │ │ 
├── afrimgsm_cot_translate_lug.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_orm.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_sna.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_sot.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_swa.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_twi.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_vai.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_wol.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_xho.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_yaml │ │ │ │ │ ├── afrimgsm_cot_translate_yor.yaml │ │ │ │ │ └── afrimgsm_cot_translate_zul.yaml │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── afrimgsm_cot_translate_amh.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_ewe.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_fra.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_hau.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_ibo.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_kin.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_lin.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_lug.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_orm.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_sna.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_sot.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_swa.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_twi.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_vai.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_wol.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_xho.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_yaml │ │ │ │ │ ├── afrimgsm_cot_translate_yor.yaml │ │ │ │ │ └── afrimgsm_cot_translate_zul.yaml │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── afrimgsm_cot_translate_amh.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_ewe.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_fra.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_hau.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_ibo.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_kin.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_lin.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_lug.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_orm.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_sna.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_sot.yaml │ │ │ │ │ ├── 
afrimgsm_cot_translate_swa.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_twi.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_vai.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_wol.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_xho.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_yaml │ │ │ │ │ ├── afrimgsm_cot_translate_yor.yaml │ │ │ │ │ └── afrimgsm_cot_translate_zul.yaml │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── afrimgsm_cot_translate_amh.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_ewe.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_fra.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_hau.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_ibo.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_kin.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_lin.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_lug.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_orm.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_sna.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_sot.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_swa.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_twi.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_vai.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_wol.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_xho.yaml │ │ │ │ │ ├── afrimgsm_cot_translate_yaml │ │ │ │ │ ├── afrimgsm_cot_translate_yor.yaml │ │ │ │ │ └── afrimgsm_cot_translate_zul.yaml │ │ │ │ └── prompt_5/ │ │ │ │ ├── afrimgsm_cot_translate_amh.yaml │ │ │ │ ├── afrimgsm_cot_translate_ewe.yaml │ │ │ │ ├── afrimgsm_cot_translate_fra.yaml │ │ │ │ ├── afrimgsm_cot_translate_hau.yaml │ │ │ │ ├── afrimgsm_cot_translate_ibo.yaml │ │ │ │ ├── afrimgsm_cot_translate_kin.yaml │ │ │ │ ├── afrimgsm_cot_translate_lin.yaml │ │ │ │ ├── afrimgsm_cot_translate_lug.yaml │ │ │ │ ├── afrimgsm_cot_translate_orm.yaml │ │ │ │ ├── afrimgsm_cot_translate_sna.yaml │ │ │ │ ├── afrimgsm_cot_translate_sot.yaml │ │ │ │ ├── afrimgsm_cot_translate_swa.yaml │ │ │ │ ├── afrimgsm_cot_translate_twi.yaml │ │ │ │ ├── afrimgsm_cot_translate_vai.yaml │ │ │ │ ├── afrimgsm_cot_translate_wol.yaml │ │ │ │ ├── afrimgsm_cot_translate_xho.yaml │ │ │ │ ├── 
afrimgsm_cot_translate_yaml │ │ │ │ ├── afrimgsm_cot_translate_yor.yaml │ │ │ │ └── afrimgsm_cot_translate_zul.yaml │ │ │ └── utils.py │ │ ├── afrimmlu/ │ │ │ ├── README.md │ │ │ ├── direct/ │ │ │ │ ├── afrimmlu.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── afrimmlu_direct │ │ │ │ │ ├── afrimmlu_direct_amh.yaml │ │ │ │ │ ├── afrimmlu_direct_eng.yaml │ │ │ │ │ ├── afrimmlu_direct_ewe.yaml │ │ │ │ │ ├── afrimmlu_direct_fra.yaml │ │ │ │ │ ├── afrimmlu_direct_hau.yaml │ │ │ │ │ ├── afrimmlu_direct_ibo.yaml │ │ │ │ │ ├── afrimmlu_direct_kin.yaml │ │ │ │ │ ├── afrimmlu_direct_lin.yaml │ │ │ │ │ ├── afrimmlu_direct_lug.yaml │ │ │ │ │ ├── afrimmlu_direct_orm.yaml │ │ │ │ │ ├── afrimmlu_direct_sna.yaml │ │ │ │ │ ├── afrimmlu_direct_sot.yaml │ │ │ │ │ ├── afrimmlu_direct_swa.yaml │ │ │ │ │ ├── afrimmlu_direct_twi.yaml │ │ │ │ │ ├── afrimmlu_direct_wol.yaml │ │ │ │ │ ├── afrimmlu_direct_xho.yaml │ │ │ │ │ ├── afrimmlu_direct_yor.yaml │ │ │ │ │ ├── afrimmlu_direct_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── afrimmlu_direct │ │ │ │ │ ├── afrimmlu_direct_amh.yaml │ │ │ │ │ ├── afrimmlu_direct_eng.yaml │ │ │ │ │ ├── afrimmlu_direct_ewe.yaml │ │ │ │ │ ├── afrimmlu_direct_fra.yaml │ │ │ │ │ ├── afrimmlu_direct_hau.yaml │ │ │ │ │ ├── afrimmlu_direct_ibo.yaml │ │ │ │ │ ├── afrimmlu_direct_kin.yaml │ │ │ │ │ ├── afrimmlu_direct_lin.yaml │ │ │ │ │ ├── afrimmlu_direct_lug.yaml │ │ │ │ │ ├── afrimmlu_direct_orm.yaml │ │ │ │ │ ├── afrimmlu_direct_sna.yaml │ │ │ │ │ ├── afrimmlu_direct_sot.yaml │ │ │ │ │ ├── afrimmlu_direct_swa.yaml │ │ │ │ │ ├── afrimmlu_direct_twi.yaml │ │ │ │ │ ├── afrimmlu_direct_wol.yaml │ │ │ │ │ ├── afrimmlu_direct_xho.yaml │ │ │ │ │ ├── afrimmlu_direct_yor.yaml │ │ │ │ │ ├── afrimmlu_direct_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── afrimmlu_direct │ │ │ │ │ ├── afrimmlu_direct_amh.yaml │ │ │ │ │ ├── afrimmlu_direct_eng.yaml │ │ │ │ │ ├── afrimmlu_direct_ewe.yaml │ │ │ │ │ ├── afrimmlu_direct_fra.yaml │ │ │ │ │ ├── 
afrimmlu_direct_hau.yaml │ │ │ │ │ ├── afrimmlu_direct_ibo.yaml │ │ │ │ │ ├── afrimmlu_direct_kin.yaml │ │ │ │ │ ├── afrimmlu_direct_lin.yaml │ │ │ │ │ ├── afrimmlu_direct_lug.yaml │ │ │ │ │ ├── afrimmlu_direct_orm.yaml │ │ │ │ │ ├── afrimmlu_direct_sna.yaml │ │ │ │ │ ├── afrimmlu_direct_sot.yaml │ │ │ │ │ ├── afrimmlu_direct_swa.yaml │ │ │ │ │ ├── afrimmlu_direct_twi.yaml │ │ │ │ │ ├── afrimmlu_direct_wol.yaml │ │ │ │ │ ├── afrimmlu_direct_xho.yaml │ │ │ │ │ ├── afrimmlu_direct_yor.yaml │ │ │ │ │ ├── afrimmlu_direct_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── afrimmlu_direct │ │ │ │ │ ├── afrimmlu_direct_amh.yaml │ │ │ │ │ ├── afrimmlu_direct_eng.yaml │ │ │ │ │ ├── afrimmlu_direct_ewe.yaml │ │ │ │ │ ├── afrimmlu_direct_fra.yaml │ │ │ │ │ ├── afrimmlu_direct_hau.yaml │ │ │ │ │ ├── afrimmlu_direct_ibo.yaml │ │ │ │ │ ├── afrimmlu_direct_kin.yaml │ │ │ │ │ ├── afrimmlu_direct_lin.yaml │ │ │ │ │ ├── afrimmlu_direct_lug.yaml │ │ │ │ │ ├── afrimmlu_direct_orm.yaml │ │ │ │ │ ├── afrimmlu_direct_sna.yaml │ │ │ │ │ ├── afrimmlu_direct_sot.yaml │ │ │ │ │ ├── afrimmlu_direct_swa.yaml │ │ │ │ │ ├── afrimmlu_direct_twi.yaml │ │ │ │ │ ├── afrimmlu_direct_wol.yaml │ │ │ │ │ ├── afrimmlu_direct_xho.yaml │ │ │ │ │ ├── afrimmlu_direct_yor.yaml │ │ │ │ │ ├── afrimmlu_direct_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── prompt_5/ │ │ │ │ ├── afrimmlu_direct │ │ │ │ ├── afrimmlu_direct_amh.yaml │ │ │ │ ├── afrimmlu_direct_eng.yaml │ │ │ │ ├── afrimmlu_direct_ewe.yaml │ │ │ │ ├── afrimmlu_direct_fra.yaml │ │ │ │ ├── afrimmlu_direct_hau.yaml │ │ │ │ ├── afrimmlu_direct_ibo.yaml │ │ │ │ ├── afrimmlu_direct_kin.yaml │ │ │ │ ├── afrimmlu_direct_lin.yaml │ │ │ │ ├── afrimmlu_direct_lug.yaml │ │ │ │ ├── afrimmlu_direct_orm.yaml │ │ │ │ ├── afrimmlu_direct_sna.yaml │ │ │ │ ├── afrimmlu_direct_sot.yaml │ │ │ │ ├── afrimmlu_direct_swa.yaml │ │ │ │ ├── afrimmlu_direct_twi.yaml │ │ │ │ ├── afrimmlu_direct_wol.yaml │ │ │ │ ├── afrimmlu_direct_xho.yaml │ │ │ │ ├── 
afrimmlu_direct_yor.yaml │ │ │ │ ├── afrimmlu_direct_zul.yaml │ │ │ │ └── utils.py │ │ │ ├── fewshot.sh │ │ │ ├── gen_utils.py │ │ │ ├── translate/ │ │ │ │ ├── afrimmlu_tt.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── afrimmlu_translate │ │ │ │ │ ├── afrimmlu_translate_amh.yaml │ │ │ │ │ ├── afrimmlu_translate_ewe.yaml │ │ │ │ │ ├── afrimmlu_translate_fra.yaml │ │ │ │ │ ├── afrimmlu_translate_hau.yaml │ │ │ │ │ ├── afrimmlu_translate_ibo.yaml │ │ │ │ │ ├── afrimmlu_translate_kin.yaml │ │ │ │ │ ├── afrimmlu_translate_lin.yaml │ │ │ │ │ ├── afrimmlu_translate_lug.yaml │ │ │ │ │ ├── afrimmlu_translate_orm.yaml │ │ │ │ │ ├── afrimmlu_translate_sna.yaml │ │ │ │ │ ├── afrimmlu_translate_sot.yaml │ │ │ │ │ ├── afrimmlu_translate_swa.yaml │ │ │ │ │ ├── afrimmlu_translate_twi.yaml │ │ │ │ │ ├── afrimmlu_translate_wol.yaml │ │ │ │ │ ├── afrimmlu_translate_xho.yaml │ │ │ │ │ ├── afrimmlu_translate_yor.yaml │ │ │ │ │ ├── afrimmlu_translate_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── afrimmlu_translate │ │ │ │ │ ├── afrimmlu_translate_amh.yaml │ │ │ │ │ ├── afrimmlu_translate_ewe.yaml │ │ │ │ │ ├── afrimmlu_translate_fra.yaml │ │ │ │ │ ├── afrimmlu_translate_hau.yaml │ │ │ │ │ ├── afrimmlu_translate_ibo.yaml │ │ │ │ │ ├── afrimmlu_translate_kin.yaml │ │ │ │ │ ├── afrimmlu_translate_lin.yaml │ │ │ │ │ ├── afrimmlu_translate_lug.yaml │ │ │ │ │ ├── afrimmlu_translate_orm.yaml │ │ │ │ │ ├── afrimmlu_translate_sna.yaml │ │ │ │ │ ├── afrimmlu_translate_sot.yaml │ │ │ │ │ ├── afrimmlu_translate_swa.yaml │ │ │ │ │ ├── afrimmlu_translate_twi.yaml │ │ │ │ │ ├── afrimmlu_translate_wol.yaml │ │ │ │ │ ├── afrimmlu_translate_xho.yaml │ │ │ │ │ ├── afrimmlu_translate_yor.yaml │ │ │ │ │ ├── afrimmlu_translate_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── afrimmlu_translate │ │ │ │ │ ├── afrimmlu_translate_amh.yaml │ │ │ │ │ ├── afrimmlu_translate_ewe.yaml │ │ │ │ │ ├── afrimmlu_translate_fra.yaml │ │ │ │ │ ├── afrimmlu_translate_hau.yaml │ │ │ │ │ ├── 
afrimmlu_translate_ibo.yaml │ │ │ │ │ ├── afrimmlu_translate_kin.yaml │ │ │ │ │ ├── afrimmlu_translate_lin.yaml │ │ │ │ │ ├── afrimmlu_translate_lug.yaml │ │ │ │ │ ├── afrimmlu_translate_orm.yaml │ │ │ │ │ ├── afrimmlu_translate_sna.yaml │ │ │ │ │ ├── afrimmlu_translate_sot.yaml │ │ │ │ │ ├── afrimmlu_translate_swa.yaml │ │ │ │ │ ├── afrimmlu_translate_twi.yaml │ │ │ │ │ ├── afrimmlu_translate_wol.yaml │ │ │ │ │ ├── afrimmlu_translate_xho.yaml │ │ │ │ │ ├── afrimmlu_translate_yor.yaml │ │ │ │ │ ├── afrimmlu_translate_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── afrimmlu_translate │ │ │ │ │ ├── afrimmlu_translate_amh.yaml │ │ │ │ │ ├── afrimmlu_translate_ewe.yaml │ │ │ │ │ ├── afrimmlu_translate_fra.yaml │ │ │ │ │ ├── afrimmlu_translate_hau.yaml │ │ │ │ │ ├── afrimmlu_translate_ibo.yaml │ │ │ │ │ ├── afrimmlu_translate_kin.yaml │ │ │ │ │ ├── afrimmlu_translate_lin.yaml │ │ │ │ │ ├── afrimmlu_translate_lug.yaml │ │ │ │ │ ├── afrimmlu_translate_orm.yaml │ │ │ │ │ ├── afrimmlu_translate_sna.yaml │ │ │ │ │ ├── afrimmlu_translate_sot.yaml │ │ │ │ │ ├── afrimmlu_translate_swa.yaml │ │ │ │ │ ├── afrimmlu_translate_twi.yaml │ │ │ │ │ ├── afrimmlu_translate_wol.yaml │ │ │ │ │ ├── afrimmlu_translate_xho.yaml │ │ │ │ │ ├── afrimmlu_translate_yor.yaml │ │ │ │ │ ├── afrimmlu_translate_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── prompt_5/ │ │ │ │ ├── afrimmlu_translate │ │ │ │ ├── afrimmlu_translate_amh.yaml │ │ │ │ ├── afrimmlu_translate_ewe.yaml │ │ │ │ ├── afrimmlu_translate_fra.yaml │ │ │ │ ├── afrimmlu_translate_hau.yaml │ │ │ │ ├── afrimmlu_translate_ibo.yaml │ │ │ │ ├── afrimmlu_translate_kin.yaml │ │ │ │ ├── afrimmlu_translate_lin.yaml │ │ │ │ ├── afrimmlu_translate_lug.yaml │ │ │ │ ├── afrimmlu_translate_orm.yaml │ │ │ │ ├── afrimmlu_translate_sna.yaml │ │ │ │ ├── afrimmlu_translate_sot.yaml │ │ │ │ ├── afrimmlu_translate_swa.yaml │ │ │ │ ├── afrimmlu_translate_twi.yaml │ │ │ │ ├── afrimmlu_translate_wol.yaml │ │ │ │ ├── afrimmlu_translate_xho.yaml 
│ │ │ │ ├── afrimmlu_translate_yor.yaml │ │ │ │ ├── afrimmlu_translate_zul.yaml │ │ │ │ └── utils.py │ │ │ └── utils.py │ │ ├── afrixnli/ │ │ │ ├── README.md │ │ │ ├── anli prompt/ │ │ │ │ ├── en-direct/ │ │ │ │ │ ├── afrixnli_en_direct_amh.yaml │ │ │ │ │ ├── afrixnli_en_direct_eng.yaml │ │ │ │ │ ├── afrixnli_en_direct_ewe.yaml │ │ │ │ │ ├── afrixnli_en_direct_fra.yaml │ │ │ │ │ ├── afrixnli_en_direct_hau.yaml │ │ │ │ │ ├── afrixnli_en_direct_ibo.yaml │ │ │ │ │ ├── afrixnli_en_direct_kin.yaml │ │ │ │ │ ├── afrixnli_en_direct_lin.yaml │ │ │ │ │ ├── afrixnli_en_direct_lug.yaml │ │ │ │ │ ├── afrixnli_en_direct_orm.yaml │ │ │ │ │ ├── afrixnli_en_direct_sna.yaml │ │ │ │ │ ├── afrixnli_en_direct_sot.yaml │ │ │ │ │ ├── afrixnli_en_direct_swa.yaml │ │ │ │ │ ├── afrixnli_en_direct_twi.yaml │ │ │ │ │ ├── afrixnli_en_direct_wol.yaml │ │ │ │ │ ├── afrixnli_en_direct_xho.yaml │ │ │ │ │ ├── afrixnli_en_direct_yaml │ │ │ │ │ ├── afrixnli_en_direct_yor.yaml │ │ │ │ │ ├── afrixnli_en_direct_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── native-direct/ │ │ │ │ │ ├── afrixnli_native_direct_amh.yaml │ │ │ │ │ ├── afrixnli_native_direct_eng.yaml │ │ │ │ │ ├── afrixnli_native_direct_ewe.yaml │ │ │ │ │ ├── afrixnli_native_direct_fra.yaml │ │ │ │ │ ├── afrixnli_native_direct_hau.yaml │ │ │ │ │ ├── afrixnli_native_direct_ibo.yaml │ │ │ │ │ ├── afrixnli_native_direct_kin.yaml │ │ │ │ │ ├── afrixnli_native_direct_lin.yaml │ │ │ │ │ ├── afrixnli_native_direct_lug.yaml │ │ │ │ │ ├── afrixnli_native_direct_orm.yaml │ │ │ │ │ ├── afrixnli_native_direct_sna.yaml │ │ │ │ │ ├── afrixnli_native_direct_sot.yaml │ │ │ │ │ ├── afrixnli_native_direct_swa.yaml │ │ │ │ │ ├── afrixnli_native_direct_twi.yaml │ │ │ │ │ ├── afrixnli_native_direct_wol.yaml │ │ │ │ │ ├── afrixnli_native_direct_xho.yaml │ │ │ │ │ ├── afrixnli_native_direct_yaml │ │ │ │ │ ├── afrixnli_native_direct_yor.yaml │ │ │ │ │ ├── afrixnli_native_direct_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── translate/ │ │ │ │ ├── 
afrixnli_translate_amh.yaml │ │ │ │ ├── afrixnli_translate_ewe.yaml │ │ │ │ ├── afrixnli_translate_fra.yaml │ │ │ │ ├── afrixnli_translate_hau.yaml │ │ │ │ ├── afrixnli_translate_ibo.yaml │ │ │ │ ├── afrixnli_translate_kin.yaml │ │ │ │ ├── afrixnli_translate_lin.yaml │ │ │ │ ├── afrixnli_translate_lug.yaml │ │ │ │ ├── afrixnli_translate_orm.yaml │ │ │ │ ├── afrixnli_translate_sna.yaml │ │ │ │ ├── afrixnli_translate_sot.yaml │ │ │ │ ├── afrixnli_translate_swa.yaml │ │ │ │ ├── afrixnli_translate_twi.yaml │ │ │ │ ├── afrixnli_translate_wol.yaml │ │ │ │ ├── afrixnli_translate_xho.yaml │ │ │ │ ├── afrixnli_translate_yaml │ │ │ │ ├── afrixnli_translate_yor.yaml │ │ │ │ ├── afrixnli_translate_zul.yaml │ │ │ │ └── utils.py │ │ │ ├── direct/ │ │ │ │ ├── afrixnli.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── afrixnli_amh.yaml │ │ │ │ │ ├── afrixnli_eng.yaml │ │ │ │ │ ├── afrixnli_ewe.yaml │ │ │ │ │ ├── afrixnli_fra.yaml │ │ │ │ │ ├── afrixnli_hau.yaml │ │ │ │ │ ├── afrixnli_ibo.yaml │ │ │ │ │ ├── afrixnli_kin.yaml │ │ │ │ │ ├── afrixnli_lin.yaml │ │ │ │ │ ├── afrixnli_lug.yaml │ │ │ │ │ ├── afrixnli_orm.yaml │ │ │ │ │ ├── afrixnli_sna.yaml │ │ │ │ │ ├── afrixnli_sot.yaml │ │ │ │ │ ├── afrixnli_swa.yaml │ │ │ │ │ ├── afrixnli_twi.yaml │ │ │ │ │ ├── afrixnli_wol.yaml │ │ │ │ │ ├── afrixnli_xho.yaml │ │ │ │ │ ├── afrixnli_yaml │ │ │ │ │ ├── afrixnli_yor.yaml │ │ │ │ │ ├── afrixnli_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── afrixnli_amh.yaml │ │ │ │ │ ├── afrixnli_eng.yaml │ │ │ │ │ ├── afrixnli_ewe.yaml │ │ │ │ │ ├── afrixnli_fra.yaml │ │ │ │ │ ├── afrixnli_hau.yaml │ │ │ │ │ ├── afrixnli_ibo.yaml │ │ │ │ │ ├── afrixnli_kin.yaml │ │ │ │ │ ├── afrixnli_lin.yaml │ │ │ │ │ ├── afrixnli_lug.yaml │ │ │ │ │ ├── afrixnli_orm.yaml │ │ │ │ │ ├── afrixnli_sna.yaml │ │ │ │ │ ├── afrixnli_sot.yaml │ │ │ │ │ ├── afrixnli_swa.yaml │ │ │ │ │ ├── afrixnli_twi.yaml │ │ │ │ │ ├── afrixnli_wol.yaml │ │ │ │ │ ├── afrixnli_xho.yaml │ │ │ │ │ ├── afrixnli_yaml │ │ │ │ │ ├── 
afrixnli_yor.yaml │ │ │ │ │ ├── afrixnli_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── afrixnli_amh.yaml │ │ │ │ │ ├── afrixnli_eng.yaml │ │ │ │ │ ├── afrixnli_ewe.yaml │ │ │ │ │ ├── afrixnli_fra.yaml │ │ │ │ │ ├── afrixnli_hau.yaml │ │ │ │ │ ├── afrixnli_ibo.yaml │ │ │ │ │ ├── afrixnli_kin.yaml │ │ │ │ │ ├── afrixnli_lin.yaml │ │ │ │ │ ├── afrixnli_lug.yaml │ │ │ │ │ ├── afrixnli_orm.yaml │ │ │ │ │ ├── afrixnli_sna.yaml │ │ │ │ │ ├── afrixnli_sot.yaml │ │ │ │ │ ├── afrixnli_swa.yaml │ │ │ │ │ ├── afrixnli_twi.yaml │ │ │ │ │ ├── afrixnli_wol.yaml │ │ │ │ │ ├── afrixnli_xho.yaml │ │ │ │ │ ├── afrixnli_yaml │ │ │ │ │ ├── afrixnli_yor.yaml │ │ │ │ │ ├── afrixnli_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── afrixnli_amh.yaml │ │ │ │ │ ├── afrixnli_eng.yaml │ │ │ │ │ ├── afrixnli_ewe.yaml │ │ │ │ │ ├── afrixnli_fra.yaml │ │ │ │ │ ├── afrixnli_hau.yaml │ │ │ │ │ ├── afrixnli_ibo.yaml │ │ │ │ │ ├── afrixnli_kin.yaml │ │ │ │ │ ├── afrixnli_lin.yaml │ │ │ │ │ ├── afrixnli_lug.yaml │ │ │ │ │ ├── afrixnli_orm.yaml │ │ │ │ │ ├── afrixnli_sna.yaml │ │ │ │ │ ├── afrixnli_sot.yaml │ │ │ │ │ ├── afrixnli_swa.yaml │ │ │ │ │ ├── afrixnli_twi.yaml │ │ │ │ │ ├── afrixnli_wol.yaml │ │ │ │ │ ├── afrixnli_xho.yaml │ │ │ │ │ ├── afrixnli_yaml │ │ │ │ │ ├── afrixnli_yor.yaml │ │ │ │ │ ├── afrixnli_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── prompt_5/ │ │ │ │ ├── afrixnli_amh.yaml │ │ │ │ ├── afrixnli_eng.yaml │ │ │ │ ├── afrixnli_ewe.yaml │ │ │ │ ├── afrixnli_fra.yaml │ │ │ │ ├── afrixnli_hau.yaml │ │ │ │ ├── afrixnli_ibo.yaml │ │ │ │ ├── afrixnli_kin.yaml │ │ │ │ ├── afrixnli_lin.yaml │ │ │ │ ├── afrixnli_lug.yaml │ │ │ │ ├── afrixnli_orm.yaml │ │ │ │ ├── afrixnli_sna.yaml │ │ │ │ ├── afrixnli_sot.yaml │ │ │ │ ├── afrixnli_swa.yaml │ │ │ │ ├── afrixnli_twi.yaml │ │ │ │ ├── afrixnli_wol.yaml │ │ │ │ ├── afrixnli_xho.yaml │ │ │ │ ├── afrixnli_yaml │ │ │ │ ├── afrixnli_yor.yaml │ │ │ │ ├── afrixnli_zul.yaml │ │ │ │ └── utils.py │ │ │ ├── 
gen_utils.py │ │ │ ├── lai prompt/ │ │ │ │ ├── direct/ │ │ │ │ │ ├── afrixnli_manual_direct_amh.yaml │ │ │ │ │ ├── afrixnli_manual_direct_eng.yaml │ │ │ │ │ ├── afrixnli_manual_direct_ewe.yaml │ │ │ │ │ ├── afrixnli_manual_direct_fra.yaml │ │ │ │ │ ├── afrixnli_manual_direct_hau.yaml │ │ │ │ │ ├── afrixnli_manual_direct_ibo.yaml │ │ │ │ │ ├── afrixnli_manual_direct_kin.yaml │ │ │ │ │ ├── afrixnli_manual_direct_lin.yaml │ │ │ │ │ ├── afrixnli_manual_direct_lug.yaml │ │ │ │ │ ├── afrixnli_manual_direct_orm.yaml │ │ │ │ │ ├── afrixnli_manual_direct_sna.yaml │ │ │ │ │ ├── afrixnli_manual_direct_sot.yaml │ │ │ │ │ ├── afrixnli_manual_direct_swa.yaml │ │ │ │ │ ├── afrixnli_manual_direct_twi.yaml │ │ │ │ │ ├── afrixnli_manual_direct_wol.yaml │ │ │ │ │ ├── afrixnli_manual_direct_xho.yaml │ │ │ │ │ ├── afrixnli_manual_direct_yaml │ │ │ │ │ ├── afrixnli_manual_direct_yor.yaml │ │ │ │ │ ├── afrixnli_manual_direct_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── translate/ │ │ │ │ ├── afrixnli_manual_translate_amh.yaml │ │ │ │ ├── afrixnli_manual_translate_ewe.yaml │ │ │ │ ├── afrixnli_manual_translate_fra.yaml │ │ │ │ ├── afrixnli_manual_translate_hau.yaml │ │ │ │ ├── afrixnli_manual_translate_ibo.yaml │ │ │ │ ├── afrixnli_manual_translate_kin.yaml │ │ │ │ ├── afrixnli_manual_translate_lin.yaml │ │ │ │ ├── afrixnli_manual_translate_lug.yaml │ │ │ │ ├── afrixnli_manual_translate_orm.yaml │ │ │ │ ├── afrixnli_manual_translate_sna.yaml │ │ │ │ ├── afrixnli_manual_translate_sot.yaml │ │ │ │ ├── afrixnli_manual_translate_swa.yaml │ │ │ │ ├── afrixnli_manual_translate_twi.yaml │ │ │ │ ├── afrixnli_manual_translate_wol.yaml │ │ │ │ ├── afrixnli_manual_translate_xho.yaml │ │ │ │ ├── afrixnli_manual_translate_yaml │ │ │ │ ├── afrixnli_manual_translate_yor.yaml │ │ │ │ ├── afrixnli_manual_translate_zul.yaml │ │ │ │ └── utils.py │ │ │ ├── translate/ │ │ │ │ ├── afrixnli_tt.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── afrixnli_translate_amh.yaml │ │ │ │ │ ├── afrixnli_translate_ewe.yaml │ │ │ │ │ 
├── afrixnli_translate_fra.yaml │ │ │ │ │ ├── afrixnli_translate_hau.yaml │ │ │ │ │ ├── afrixnli_translate_ibo.yaml │ │ │ │ │ ├── afrixnli_translate_kin.yaml │ │ │ │ │ ├── afrixnli_translate_lin.yaml │ │ │ │ │ ├── afrixnli_translate_lug.yaml │ │ │ │ │ ├── afrixnli_translate_orm.yaml │ │ │ │ │ ├── afrixnli_translate_sna.yaml │ │ │ │ │ ├── afrixnli_translate_sot.yaml │ │ │ │ │ ├── afrixnli_translate_swa.yaml │ │ │ │ │ ├── afrixnli_translate_twi.yaml │ │ │ │ │ ├── afrixnli_translate_wol.yaml │ │ │ │ │ ├── afrixnli_translate_xho.yaml │ │ │ │ │ ├── afrixnli_translate_yaml │ │ │ │ │ ├── afrixnli_translate_yor.yaml │ │ │ │ │ ├── afrixnli_translate_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── afrixnli_translate_amh.yaml │ │ │ │ │ ├── afrixnli_translate_ewe.yaml │ │ │ │ │ ├── afrixnli_translate_fra.yaml │ │ │ │ │ ├── afrixnli_translate_hau.yaml │ │ │ │ │ ├── afrixnli_translate_ibo.yaml │ │ │ │ │ ├── afrixnli_translate_kin.yaml │ │ │ │ │ ├── afrixnli_translate_lin.yaml │ │ │ │ │ ├── afrixnli_translate_lug.yaml │ │ │ │ │ ├── afrixnli_translate_orm.yaml │ │ │ │ │ ├── afrixnli_translate_sna.yaml │ │ │ │ │ ├── afrixnli_translate_sot.yaml │ │ │ │ │ ├── afrixnli_translate_swa.yaml │ │ │ │ │ ├── afrixnli_translate_twi.yaml │ │ │ │ │ ├── afrixnli_translate_wol.yaml │ │ │ │ │ ├── afrixnli_translate_xho.yaml │ │ │ │ │ ├── afrixnli_translate_yaml │ │ │ │ │ ├── afrixnli_translate_yor.yaml │ │ │ │ │ ├── afrixnli_translate_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── afrixnli_translate_amh.yaml │ │ │ │ │ ├── afrixnli_translate_ewe.yaml │ │ │ │ │ ├── afrixnli_translate_fra.yaml │ │ │ │ │ ├── afrixnli_translate_hau.yaml │ │ │ │ │ ├── afrixnli_translate_ibo.yaml │ │ │ │ │ ├── afrixnli_translate_kin.yaml │ │ │ │ │ ├── afrixnli_translate_lin.yaml │ │ │ │ │ ├── afrixnli_translate_lug.yaml │ │ │ │ │ ├── afrixnli_translate_orm.yaml │ │ │ │ │ ├── afrixnli_translate_sna.yaml │ │ │ │ │ ├── afrixnli_translate_sot.yaml │ │ │ │ │ ├── 
afrixnli_translate_swa.yaml │ │ │ │ │ ├── afrixnli_translate_twi.yaml │ │ │ │ │ ├── afrixnli_translate_wol.yaml │ │ │ │ │ ├── afrixnli_translate_xho.yaml │ │ │ │ │ ├── afrixnli_translate_yaml │ │ │ │ │ ├── afrixnli_translate_yor.yaml │ │ │ │ │ ├── afrixnli_translate_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── afrixnli_translate_amh.yaml │ │ │ │ │ ├── afrixnli_translate_ewe.yaml │ │ │ │ │ ├── afrixnli_translate_fra.yaml │ │ │ │ │ ├── afrixnli_translate_hau.yaml │ │ │ │ │ ├── afrixnli_translate_ibo.yaml │ │ │ │ │ ├── afrixnli_translate_kin.yaml │ │ │ │ │ ├── afrixnli_translate_lin.yaml │ │ │ │ │ ├── afrixnli_translate_lug.yaml │ │ │ │ │ ├── afrixnli_translate_orm.yaml │ │ │ │ │ ├── afrixnli_translate_sna.yaml │ │ │ │ │ ├── afrixnli_translate_sot.yaml │ │ │ │ │ ├── afrixnli_translate_swa.yaml │ │ │ │ │ ├── afrixnli_translate_twi.yaml │ │ │ │ │ ├── afrixnli_translate_wol.yaml │ │ │ │ │ ├── afrixnli_translate_xho.yaml │ │ │ │ │ ├── afrixnli_translate_yaml │ │ │ │ │ ├── afrixnli_translate_yor.yaml │ │ │ │ │ ├── afrixnli_translate_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── prompt_5/ │ │ │ │ ├── afrixnli_translate_amh.yaml │ │ │ │ ├── afrixnli_translate_ewe.yaml │ │ │ │ ├── afrixnli_translate_fra.yaml │ │ │ │ ├── afrixnli_translate_hau.yaml │ │ │ │ ├── afrixnli_translate_ibo.yaml │ │ │ │ ├── afrixnli_translate_kin.yaml │ │ │ │ ├── afrixnli_translate_lin.yaml │ │ │ │ ├── afrixnli_translate_lug.yaml │ │ │ │ ├── afrixnli_translate_orm.yaml │ │ │ │ ├── afrixnli_translate_sna.yaml │ │ │ │ ├── afrixnli_translate_sot.yaml │ │ │ │ ├── afrixnli_translate_swa.yaml │ │ │ │ ├── afrixnli_translate_twi.yaml │ │ │ │ ├── afrixnli_translate_wol.yaml │ │ │ │ ├── afrixnli_translate_xho.yaml │ │ │ │ ├── afrixnli_translate_yaml │ │ │ │ ├── afrixnli_translate_yor.yaml │ │ │ │ ├── afrixnli_translate_zul.yaml │ │ │ │ └── utils.py │ │ │ └── utils.py │ │ ├── afrobench/ │ │ │ ├── README.md │ │ │ ├── adr/ │ │ │ │ ├── README.md │ │ │ │ ├── afridiacritics.yaml │ │ │ │ ├── 
gen_utils.py │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── afridiacritics_bbj.yaml │ │ │ │ │ ├── afridiacritics_fon.yaml │ │ │ │ │ ├── afridiacritics_ibo.yaml │ │ │ │ │ ├── afridiacritics_wol.yaml │ │ │ │ │ ├── afridiacritics_yaml │ │ │ │ │ └── afridiacritics_yor.yaml │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── afridiacritics_bbj.yaml │ │ │ │ │ ├── afridiacritics_fon.yaml │ │ │ │ │ ├── afridiacritics_ibo.yaml │ │ │ │ │ ├── afridiacritics_wol.yaml │ │ │ │ │ ├── afridiacritics_yaml │ │ │ │ │ └── afridiacritics_yor.yaml │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── afridiacritics_bbj.yaml │ │ │ │ │ ├── afridiacritics_fon.yaml │ │ │ │ │ ├── afridiacritics_ibo.yaml │ │ │ │ │ ├── afridiacritics_wol.yaml │ │ │ │ │ ├── afridiacritics_yaml │ │ │ │ │ └── afridiacritics_yor.yaml │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── afridiacritics_bbj.yaml │ │ │ │ │ ├── afridiacritics_fon.yaml │ │ │ │ │ ├── afridiacritics_ibo.yaml │ │ │ │ │ ├── afridiacritics_wol.yaml │ │ │ │ │ ├── afridiacritics_yaml │ │ │ │ │ └── afridiacritics_yor.yaml │ │ │ │ └── prompt_5/ │ │ │ │ ├── afridiacritics_bbj.yaml │ │ │ │ ├── afridiacritics_fon.yaml │ │ │ │ ├── afridiacritics_ibo.yaml │ │ │ │ ├── afridiacritics_wol.yaml │ │ │ │ ├── afridiacritics_yaml │ │ │ │ └── afridiacritics_yor.yaml │ │ │ ├── afriqa/ │ │ │ │ ├── README.md │ │ │ │ ├── afriqa.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── afriqa │ │ │ │ │ ├── afriqa_bem.yaml │ │ │ │ │ ├── afriqa_fon.yaml │ │ │ │ │ ├── afriqa_hau.yaml │ │ │ │ │ ├── afriqa_ibo.yaml │ │ │ │ │ ├── afriqa_kin.yaml │ │ │ │ │ ├── afriqa_swa.yaml │ │ │ │ │ ├── afriqa_twi.yaml │ │ │ │ │ ├── afriqa_yor.yaml │ │ │ │ │ ├── afriqa_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── afriqa │ │ │ │ │ ├── afriqa_bem.yaml │ │ │ │ │ ├── afriqa_fon.yaml │ │ │ │ │ ├── afriqa_hau.yaml │ │ │ │ │ ├── afriqa_ibo.yaml │ │ │ │ │ ├── afriqa_kin.yaml │ │ │ │ │ ├── afriqa_swa.yaml │ │ │ │ │ ├── afriqa_twi.yaml │ │ │ │ │ ├── afriqa_yor.yaml │ │ │ │ │ ├── afriqa_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_3/ │ │ │ │ │ 
├── afriqa │ │ │ │ │ ├── afriqa_bem.yaml │ │ │ │ │ ├── afriqa_fon.yaml │ │ │ │ │ ├── afriqa_hau.yaml │ │ │ │ │ ├── afriqa_ibo.yaml │ │ │ │ │ ├── afriqa_kin.yaml │ │ │ │ │ ├── afriqa_swa.yaml │ │ │ │ │ ├── afriqa_twi.yaml │ │ │ │ │ ├── afriqa_yor.yaml │ │ │ │ │ ├── afriqa_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── afriqa │ │ │ │ │ ├── afriqa_bem.yaml │ │ │ │ │ ├── afriqa_fon.yaml │ │ │ │ │ ├── afriqa_hau.yaml │ │ │ │ │ ├── afriqa_ibo.yaml │ │ │ │ │ ├── afriqa_kin.yaml │ │ │ │ │ ├── afriqa_swa.yaml │ │ │ │ │ ├── afriqa_twi.yaml │ │ │ │ │ ├── afriqa_yor.yaml │ │ │ │ │ ├── afriqa_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_5/ │ │ │ │ │ ├── afriqa │ │ │ │ │ ├── afriqa_bem.yaml │ │ │ │ │ ├── afriqa_fon.yaml │ │ │ │ │ ├── afriqa_hau.yaml │ │ │ │ │ ├── afriqa_ibo.yaml │ │ │ │ │ ├── afriqa_kin.yaml │ │ │ │ │ ├── afriqa_swa.yaml │ │ │ │ │ ├── afriqa_twi.yaml │ │ │ │ │ ├── afriqa_yor.yaml │ │ │ │ │ ├── afriqa_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── utils.py │ │ │ ├── afrisenti/ │ │ │ │ ├── README.md │ │ │ │ ├── afrisenti.yaml │ │ │ │ ├── fewshot.sh │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── afrisenti │ │ │ │ │ ├── afrisenti_amh.yaml │ │ │ │ │ ├── afrisenti_arq.yaml │ │ │ │ │ ├── afrisenti_ary.yaml │ │ │ │ │ ├── afrisenti_hau.yaml │ │ │ │ │ ├── afrisenti_ibo.yaml │ │ │ │ │ ├── afrisenti_kin.yaml │ │ │ │ │ ├── afrisenti_orm.yaml │ │ │ │ │ ├── afrisenti_pcm.yaml │ │ │ │ │ ├── afrisenti_por.yaml │ │ │ │ │ ├── afrisenti_swa.yaml │ │ │ │ │ ├── afrisenti_tir.yaml │ │ │ │ │ ├── afrisenti_tso.yaml │ │ │ │ │ ├── afrisenti_twi.yaml │ │ │ │ │ ├── afrisenti_yor.yaml │ │ │ │ │ ├── run.sh │ │ │ │ │ ├── utils.py │ │ │ │ │ └── xx.py │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── afrisenti │ │ │ │ │ ├── afrisenti_amh.yaml │ │ │ │ │ ├── afrisenti_arq.yaml │ │ │ │ │ ├── afrisenti_ary.yaml │ │ │ │ │ ├── afrisenti_hau.yaml │ │ │ │ │ ├── afrisenti_ibo.yaml │ │ │ │ │ ├── afrisenti_kin.yaml │ │ │ │ │ ├── afrisenti_orm.yaml │ │ │ │ │ ├── afrisenti_pcm.yaml │ │ │ │ │ ├── 
afrisenti_por.yaml │ │ │ │ │ ├── afrisenti_swa.yaml │ │ │ │ │ ├── afrisenti_tir.yaml │ │ │ │ │ ├── afrisenti_tso.yaml │ │ │ │ │ ├── afrisenti_twi.yaml │ │ │ │ │ ├── afrisenti_yor.yaml │ │ │ │ │ ├── run.sh │ │ │ │ │ ├── utils.py │ │ │ │ │ └── xx.py │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── afrisenti │ │ │ │ │ ├── afrisenti_amh.yaml │ │ │ │ │ ├── afrisenti_arq.yaml │ │ │ │ │ ├── afrisenti_ary.yaml │ │ │ │ │ ├── afrisenti_hau.yaml │ │ │ │ │ ├── afrisenti_ibo.yaml │ │ │ │ │ ├── afrisenti_kin.yaml │ │ │ │ │ ├── afrisenti_orm.yaml │ │ │ │ │ ├── afrisenti_pcm.yaml │ │ │ │ │ ├── afrisenti_por.yaml │ │ │ │ │ ├── afrisenti_swa.yaml │ │ │ │ │ ├── afrisenti_tir.yaml │ │ │ │ │ ├── afrisenti_tso.yaml │ │ │ │ │ ├── afrisenti_twi.yaml │ │ │ │ │ ├── afrisenti_yor.yaml │ │ │ │ │ ├── utils.py │ │ │ │ │ └── xx.py │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── afrisenti │ │ │ │ │ ├── afrisenti_amh.yaml │ │ │ │ │ ├── afrisenti_arq.yaml │ │ │ │ │ ├── afrisenti_ary.yaml │ │ │ │ │ ├── afrisenti_hau.yaml │ │ │ │ │ ├── afrisenti_ibo.yaml │ │ │ │ │ ├── afrisenti_kin.yaml │ │ │ │ │ ├── afrisenti_orm.yaml │ │ │ │ │ ├── afrisenti_pcm.yaml │ │ │ │ │ ├── afrisenti_por.yaml │ │ │ │ │ ├── afrisenti_swa.yaml │ │ │ │ │ ├── afrisenti_tir.yaml │ │ │ │ │ ├── afrisenti_tso.yaml │ │ │ │ │ ├── afrisenti_twi.yaml │ │ │ │ │ ├── afrisenti_yor.yaml │ │ │ │ │ ├── utils.py │ │ │ │ │ └── xx.py │ │ │ │ ├── prompt_5/ │ │ │ │ │ ├── afrisenti │ │ │ │ │ ├── afrisenti_amh.yaml │ │ │ │ │ ├── afrisenti_arq.yaml │ │ │ │ │ ├── afrisenti_ary.yaml │ │ │ │ │ ├── afrisenti_hau.yaml │ │ │ │ │ ├── afrisenti_ibo.yaml │ │ │ │ │ ├── afrisenti_kin.yaml │ │ │ │ │ ├── afrisenti_orm.yaml │ │ │ │ │ ├── afrisenti_pcm.yaml │ │ │ │ │ ├── afrisenti_por.yaml │ │ │ │ │ ├── afrisenti_swa.yaml │ │ │ │ │ ├── afrisenti_tir.yaml │ │ │ │ │ ├── afrisenti_tso.yaml │ │ │ │ │ ├── afrisenti_twi.yaml │ │ │ │ │ ├── afrisenti_yor.yaml │ │ │ │ │ ├── utils.py │ │ │ │ │ └── xx.py │ │ │ │ └── utils.py │ │ │ ├── afrobench-lite.yaml │ │ │ ├── afrobench.yaml │ │ │ ├── belebele/ │ 
│ │ │ ├── README.md │ │ │ │ ├── belebele.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── belebele │ │ │ │ │ ├── belebele_afr.yaml │ │ │ │ │ ├── belebele_amh.yaml │ │ │ │ │ ├── belebele_ary.yaml │ │ │ │ │ ├── belebele_arz.yaml │ │ │ │ │ ├── belebele_bam.yaml │ │ │ │ │ ├── belebele_eng.yaml │ │ │ │ │ ├── belebele_fra.yaml │ │ │ │ │ ├── belebele_fuv.yaml │ │ │ │ │ ├── belebele_gaz.yaml │ │ │ │ │ ├── belebele_hau.yaml │ │ │ │ │ ├── belebele_ibo.yaml │ │ │ │ │ ├── belebele_kea.yaml │ │ │ │ │ ├── belebele_kin.yaml │ │ │ │ │ ├── belebele_lin.yaml │ │ │ │ │ ├── belebele_lug.yaml │ │ │ │ │ ├── belebele_luo.yaml │ │ │ │ │ ├── belebele_nya.yaml │ │ │ │ │ ├── belebele_plt.yaml │ │ │ │ │ ├── belebele_por.yaml │ │ │ │ │ ├── belebele_sna.yaml │ │ │ │ │ ├── belebele_som.yaml │ │ │ │ │ ├── belebele_sot.yaml │ │ │ │ │ ├── belebele_ssw.yaml │ │ │ │ │ ├── belebele_swa.yaml │ │ │ │ │ ├── belebele_tir.yaml │ │ │ │ │ ├── belebele_tsn.yaml │ │ │ │ │ ├── belebele_tso.yaml │ │ │ │ │ ├── belebele_wol.yaml │ │ │ │ │ ├── belebele_xho.yaml │ │ │ │ │ ├── belebele_yor.yaml │ │ │ │ │ └── belebele_zul.yaml │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── belebele │ │ │ │ │ ├── belebele_afr.yaml │ │ │ │ │ ├── belebele_amh.yaml │ │ │ │ │ ├── belebele_ary.yaml │ │ │ │ │ ├── belebele_arz.yaml │ │ │ │ │ ├── belebele_bam.yaml │ │ │ │ │ ├── belebele_eng.yaml │ │ │ │ │ ├── belebele_fra.yaml │ │ │ │ │ ├── belebele_fuv.yaml │ │ │ │ │ ├── belebele_gaz.yaml │ │ │ │ │ ├── belebele_hau.yaml │ │ │ │ │ ├── belebele_ibo.yaml │ │ │ │ │ ├── belebele_kea.yaml │ │ │ │ │ ├── belebele_kin.yaml │ │ │ │ │ ├── belebele_lin.yaml │ │ │ │ │ ├── belebele_lug.yaml │ │ │ │ │ ├── belebele_luo.yaml │ │ │ │ │ ├── belebele_nya.yaml │ │ │ │ │ ├── belebele_plt.yaml │ │ │ │ │ ├── belebele_por.yaml │ │ │ │ │ ├── belebele_sna.yaml │ │ │ │ │ ├── belebele_som.yaml │ │ │ │ │ ├── belebele_sot.yaml │ │ │ │ │ ├── belebele_ssw.yaml │ │ │ │ │ ├── belebele_swa.yaml │ │ │ │ │ ├── belebele_tir.yaml │ │ │ │ │ ├── belebele_tsn.yaml │ │ │ │ │ ├── belebele_tso.yaml │ │ │ │ 
│ ├── belebele_wol.yaml │ │ │ │ │ ├── belebele_xho.yaml │ │ │ │ │ ├── belebele_yor.yaml │ │ │ │ │ └── belebele_zul.yaml │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── belebele │ │ │ │ │ ├── belebele_afr.yaml │ │ │ │ │ ├── belebele_amh.yaml │ │ │ │ │ ├── belebele_ary.yaml │ │ │ │ │ ├── belebele_arz.yaml │ │ │ │ │ ├── belebele_bam.yaml │ │ │ │ │ ├── belebele_eng.yaml │ │ │ │ │ ├── belebele_fra.yaml │ │ │ │ │ ├── belebele_fuv.yaml │ │ │ │ │ ├── belebele_gaz.yaml │ │ │ │ │ ├── belebele_hau.yaml │ │ │ │ │ ├── belebele_ibo.yaml │ │ │ │ │ ├── belebele_kea.yaml │ │ │ │ │ ├── belebele_kin.yaml │ │ │ │ │ ├── belebele_lin.yaml │ │ │ │ │ ├── belebele_lug.yaml │ │ │ │ │ ├── belebele_luo.yaml │ │ │ │ │ ├── belebele_nya.yaml │ │ │ │ │ ├── belebele_plt.yaml │ │ │ │ │ ├── belebele_por.yaml │ │ │ │ │ ├── belebele_sna.yaml │ │ │ │ │ ├── belebele_som.yaml │ │ │ │ │ ├── belebele_sot.yaml │ │ │ │ │ ├── belebele_ssw.yaml │ │ │ │ │ ├── belebele_swa.yaml │ │ │ │ │ ├── belebele_tir.yaml │ │ │ │ │ ├── belebele_tsn.yaml │ │ │ │ │ ├── belebele_tso.yaml │ │ │ │ │ ├── belebele_wol.yaml │ │ │ │ │ ├── belebele_xho.yaml │ │ │ │ │ ├── belebele_yor.yaml │ │ │ │ │ └── belebele_zul.yaml │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── belebele │ │ │ │ │ ├── belebele_afr.yaml │ │ │ │ │ ├── belebele_amh.yaml │ │ │ │ │ ├── belebele_ary.yaml │ │ │ │ │ ├── belebele_arz.yaml │ │ │ │ │ ├── belebele_bam.yaml │ │ │ │ │ ├── belebele_eng.yaml │ │ │ │ │ ├── belebele_fra.yaml │ │ │ │ │ ├── belebele_fuv.yaml │ │ │ │ │ ├── belebele_gaz.yaml │ │ │ │ │ ├── belebele_hau.yaml │ │ │ │ │ ├── belebele_ibo.yaml │ │ │ │ │ ├── belebele_kea.yaml │ │ │ │ │ ├── belebele_kin.yaml │ │ │ │ │ ├── belebele_lin.yaml │ │ │ │ │ ├── belebele_lug.yaml │ │ │ │ │ ├── belebele_luo.yaml │ │ │ │ │ ├── belebele_nya.yaml │ │ │ │ │ ├── belebele_plt.yaml │ │ │ │ │ ├── belebele_por.yaml │ │ │ │ │ ├── belebele_sna.yaml │ │ │ │ │ ├── belebele_som.yaml │ │ │ │ │ ├── belebele_sot.yaml │ │ │ │ │ ├── belebele_ssw.yaml │ │ │ │ │ ├── belebele_swa.yaml │ │ │ │ │ ├── 
belebele_tir.yaml │ │ │ │ │ ├── belebele_tsn.yaml │ │ │ │ │ ├── belebele_tso.yaml │ │ │ │ │ ├── belebele_wol.yaml │ │ │ │ │ ├── belebele_xho.yaml │ │ │ │ │ ├── belebele_yor.yaml │ │ │ │ │ └── belebele_zul.yaml │ │ │ │ ├── prompt_5/ │ │ │ │ │ ├── belebele │ │ │ │ │ ├── belebele_afr.yaml │ │ │ │ │ ├── belebele_amh.yaml │ │ │ │ │ ├── belebele_ary.yaml │ │ │ │ │ ├── belebele_arz.yaml │ │ │ │ │ ├── belebele_bam.yaml │ │ │ │ │ ├── belebele_eng.yaml │ │ │ │ │ ├── belebele_fra.yaml │ │ │ │ │ ├── belebele_fuv.yaml │ │ │ │ │ ├── belebele_gaz.yaml │ │ │ │ │ ├── belebele_hau.yaml │ │ │ │ │ ├── belebele_ibo.yaml │ │ │ │ │ ├── belebele_kea.yaml │ │ │ │ │ ├── belebele_kin.yaml │ │ │ │ │ ├── belebele_lin.yaml │ │ │ │ │ ├── belebele_lug.yaml │ │ │ │ │ ├── belebele_luo.yaml │ │ │ │ │ ├── belebele_nya.yaml │ │ │ │ │ ├── belebele_plt.yaml │ │ │ │ │ ├── belebele_por.yaml │ │ │ │ │ ├── belebele_sna.yaml │ │ │ │ │ ├── belebele_som.yaml │ │ │ │ │ ├── belebele_sot.yaml │ │ │ │ │ ├── belebele_ssw.yaml │ │ │ │ │ ├── belebele_swa.yaml │ │ │ │ │ ├── belebele_tir.yaml │ │ │ │ │ ├── belebele_tsn.yaml │ │ │ │ │ ├── belebele_tso.yaml │ │ │ │ │ ├── belebele_wol.yaml │ │ │ │ │ ├── belebele_xho.yaml │ │ │ │ │ ├── belebele_yor.yaml │ │ │ │ │ └── belebele_zul.yaml │ │ │ │ └── utils.py │ │ │ ├── flores/ │ │ │ │ ├── README.md │ │ │ │ ├── flores.yaml │ │ │ │ ├── gen_utils.py │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── african-english/ │ │ │ │ │ │ ├── flores │ │ │ │ │ │ ├── flores_ace_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── flores_ace_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_acq_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── flores_aeb_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── flores_afr_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_aka_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_amh_Ethi-eng_Latn.yaml │ │ │ │ │ │ ├── flores_ary_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── flores_arz_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── flores_bam_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_ban_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_bem_Latn-eng_Latn.yaml │ │ │ │ │ │ 
├── flores_cjk_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_dik_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_dyu_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_ewe_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_fon_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_fra_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_fuv_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_gaz_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_hau_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_ibo_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_kab_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_kam_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_kbp_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_kea_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_kik_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_kin_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_kmb_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_knc_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── flores_knc_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_kon_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_lin_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_lua_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_lug_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_luo_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_mos_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_nso_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_nus_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_nya_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_plt_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_run_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_sag_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_sna_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_som_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_sot_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_ssw_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_sun_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_swh_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_taq_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_taq_Tfng-eng_Latn.yaml │ │ │ │ │ │ ├── flores_tir_Ethi-eng_Latn.yaml │ │ │ │ │ │ ├── flores_tsn_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_tso_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_tum_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── 
flores_twi_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_tzm_Tfng-eng_Latn.yaml │ │ │ │ │ │ ├── flores_umb_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_wol_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_xho_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_yor_Latn-eng_Latn.yaml │ │ │ │ │ │ └── flores_zul_Latn-eng_Latn.yaml │ │ │ │ │ ├── english-african/ │ │ │ │ │ │ ├── flores │ │ │ │ │ │ ├── flores_eng_Latn-ace_Arab.yaml │ │ │ │ │ │ ├── flores_eng_Latn-ace_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-acq_Arab.yaml │ │ │ │ │ │ ├── flores_eng_Latn-aeb_Arab.yaml │ │ │ │ │ │ ├── flores_eng_Latn-afr_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-aka_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-amh_Ethi.yaml │ │ │ │ │ │ ├── flores_eng_Latn-ary_Arab.yaml │ │ │ │ │ │ ├── flores_eng_Latn-arz_Arab.yaml │ │ │ │ │ │ ├── flores_eng_Latn-bam_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-ban_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-bem_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-cjk_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-dik_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-dyu_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-ewe_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-fon_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-fra_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-fuv_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-gaz_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-hau_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-ibo_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-kab_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-kam_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-kbp_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-kea_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-kik_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-kin_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-kmb_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-knc_Arab.yaml │ │ │ │ │ │ ├── flores_eng_Latn-knc_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-kon_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-lin_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-lua_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-lug_Latn.yaml │ │ │ │ │ │ ├── 
flores_eng_Latn-luo_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-mos_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-nso_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-nus_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-nya_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-plt_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-run_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-sag_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-sna_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-som_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-sot_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-ssw_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-sun_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-swh_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-taq_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-taq_Tfng.yaml │ │ │ │ │ │ ├── flores_eng_Latn-tir_Ethi.yaml │ │ │ │ │ │ ├── flores_eng_Latn-tsn_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-tso_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-tum_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-twi_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-tzm_Tfng.yaml │ │ │ │ │ │ ├── flores_eng_Latn-umb_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-wol_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-xho_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-yor_Latn.yaml │ │ │ │ │ │ └── flores_eng_Latn-zul_Latn.yaml │ │ │ │ │ └── flores │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── african-english/ │ │ │ │ │ │ ├── flores │ │ │ │ │ │ ├── flores_ace_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── flores_ace_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_acq_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── flores_aeb_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── flores_afr_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_aka_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_amh_Ethi-eng_Latn.yaml │ │ │ │ │ │ ├── flores_ary_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── flores_arz_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── flores_bam_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_ban_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_bem_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_cjk_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_dik_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── 
flores_dyu_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_ewe_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_fon_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_fra_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_fuv_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_gaz_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_hau_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_ibo_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_kab_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_kam_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_kbp_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_kea_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_kik_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_kin_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_kmb_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_knc_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── flores_knc_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_kon_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_lin_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_lua_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_lug_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_luo_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_mos_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_nso_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_nus_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_nya_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_plt_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_run_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_sag_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_sna_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_som_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_sot_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_ssw_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_sun_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_swh_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_taq_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_taq_Tfng-eng_Latn.yaml │ │ │ │ │ │ ├── flores_tir_Ethi-eng_Latn.yaml │ │ │ │ │ │ ├── flores_tsn_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_tso_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_tum_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_twi_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_tzm_Tfng-eng_Latn.yaml │ │ │ │ │ │ ├── 
flores_umb_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_wol_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_xho_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── flores_yor_Latn-eng_Latn.yaml │ │ │ │ │ │ └── flores_zul_Latn-eng_Latn.yaml │ │ │ │ │ ├── english-african/ │ │ │ │ │ │ ├── flores │ │ │ │ │ │ ├── flores_eng_Latn-ace_Arab.yaml │ │ │ │ │ │ ├── flores_eng_Latn-ace_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-acq_Arab.yaml │ │ │ │ │ │ ├── flores_eng_Latn-aeb_Arab.yaml │ │ │ │ │ │ ├── flores_eng_Latn-afr_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-aka_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-amh_Ethi.yaml │ │ │ │ │ │ ├── flores_eng_Latn-ary_Arab.yaml │ │ │ │ │ │ ├── flores_eng_Latn-arz_Arab.yaml │ │ │ │ │ │ ├── flores_eng_Latn-bam_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-ban_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-bem_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-cjk_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-dik_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-dyu_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-ewe_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-fon_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-fra_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-fuv_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-gaz_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-hau_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-ibo_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-kab_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-kam_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-kbp_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-kea_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-kik_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-kin_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-kmb_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-knc_Arab.yaml │ │ │ │ │ │ ├── flores_eng_Latn-knc_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-kon_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-lin_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-lua_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-lug_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-luo_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-mos_Latn.yaml │ │ │ │ │ │ ├── 
flores_eng_Latn-nso_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-nus_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-nya_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-plt_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-run_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-sag_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-sna_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-som_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-sot_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-ssw_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-sun_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-swh_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-taq_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-taq_Tfng.yaml │ │ │ │ │ │ ├── flores_eng_Latn-tir_Ethi.yaml │ │ │ │ │ │ ├── flores_eng_Latn-tsn_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-tso_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-tum_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-twi_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-tzm_Tfng.yaml │ │ │ │ │ │ ├── flores_eng_Latn-umb_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-wol_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-xho_Latn.yaml │ │ │ │ │ │ ├── flores_eng_Latn-yor_Latn.yaml │ │ │ │ │ │ └── flores_eng_Latn-zul_Latn.yaml │ │ │ │ │ └── flores │ │ │ │ └── prompt_3/ │ │ │ │ ├── african-english/ │ │ │ │ │ ├── flores │ │ │ │ │ ├── flores_ace_Arab-eng_Latn.yaml │ │ │ │ │ ├── flores_ace_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_acq_Arab-eng_Latn.yaml │ │ │ │ │ ├── flores_aeb_Arab-eng_Latn.yaml │ │ │ │ │ ├── flores_afr_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_aka_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_amh_Ethi-eng_Latn.yaml │ │ │ │ │ ├── flores_ary_Arab-eng_Latn.yaml │ │ │ │ │ ├── flores_arz_Arab-eng_Latn.yaml │ │ │ │ │ ├── flores_bam_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_ban_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_bem_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_cjk_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_dik_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_dyu_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_ewe_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_fon_Latn-eng_Latn.yaml │ │ │ │ │ ├── 
flores_fra_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_fuv_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_gaz_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_hau_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_ibo_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_kab_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_kam_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_kbp_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_kea_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_kik_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_kin_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_kmb_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_knc_Arab-eng_Latn.yaml │ │ │ │ │ ├── flores_knc_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_kon_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_lin_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_lua_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_lug_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_luo_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_mos_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_nso_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_nus_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_nya_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_plt_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_run_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_sag_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_sna_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_som_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_sot_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_ssw_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_sun_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_swh_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_taq_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_taq_Tfng-eng_Latn.yaml │ │ │ │ │ ├── flores_tir_Ethi-eng_Latn.yaml │ │ │ │ │ ├── flores_tsn_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_tso_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_tum_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_twi_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_tzm_Tfng-eng_Latn.yaml │ │ │ │ │ ├── flores_umb_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_wol_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_xho_Latn-eng_Latn.yaml │ │ │ │ │ ├── flores_yor_Latn-eng_Latn.yaml │ │ │ │ │ └── flores_zul_Latn-eng_Latn.yaml │ │ │ │ ├── english-african/ │ │ 
│ │ │ ├── flores │ │ │ │ │ ├── flores_eng_Latn-ace_Arab.yaml │ │ │ │ │ ├── flores_eng_Latn-ace_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-acq_Arab.yaml │ │ │ │ │ ├── flores_eng_Latn-aeb_Arab.yaml │ │ │ │ │ ├── flores_eng_Latn-afr_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-aka_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-amh_Ethi.yaml │ │ │ │ │ ├── flores_eng_Latn-ary_Arab.yaml │ │ │ │ │ ├── flores_eng_Latn-arz_Arab.yaml │ │ │ │ │ ├── flores_eng_Latn-bam_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-ban_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-bem_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-cjk_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-dik_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-dyu_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-ewe_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-fon_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-fra_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-fuv_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-gaz_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-hau_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-ibo_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-kab_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-kam_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-kbp_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-kea_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-kik_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-kin_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-kmb_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-knc_Arab.yaml │ │ │ │ │ ├── flores_eng_Latn-knc_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-kon_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-lin_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-lua_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-lug_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-luo_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-mos_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-nso_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-nus_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-nya_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-plt_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-run_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-sag_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-sna_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-som_Latn.yaml │ 
│ │ │ │ ├── flores_eng_Latn-sot_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-ssw_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-sun_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-swh_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-taq_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-taq_Tfng.yaml │ │ │ │ │ ├── flores_eng_Latn-tir_Ethi.yaml │ │ │ │ │ ├── flores_eng_Latn-tsn_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-tso_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-tum_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-twi_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-tzm_Tfng.yaml │ │ │ │ │ ├── flores_eng_Latn-umb_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-wol_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-xho_Latn.yaml │ │ │ │ │ ├── flores_eng_Latn-yor_Latn.yaml │ │ │ │ │ └── flores_eng_Latn-zul_Latn.yaml │ │ │ │ └── flores │ │ │ ├── injongointent/ │ │ │ │ ├── README.md │ │ │ │ ├── gen_utils.py │ │ │ │ ├── injongointent.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── injongointent │ │ │ │ │ ├── injongointent_amh.yaml │ │ │ │ │ ├── injongointent_eng.yaml │ │ │ │ │ ├── injongointent_ewe.yaml │ │ │ │ │ ├── injongointent_hau.yaml │ │ │ │ │ ├── injongointent_ibo.yaml │ │ │ │ │ ├── injongointent_kin.yaml │ │ │ │ │ ├── injongointent_lin.yaml │ │ │ │ │ ├── injongointent_lug.yaml │ │ │ │ │ ├── injongointent_orm.yaml │ │ │ │ │ ├── injongointent_sna.yaml │ │ │ │ │ ├── injongointent_sot.yaml │ │ │ │ │ ├── injongointent_swa.yaml │ │ │ │ │ ├── injongointent_twi.yaml │ │ │ │ │ ├── injongointent_wol.yaml │ │ │ │ │ ├── injongointent_xho.yaml │ │ │ │ │ ├── injongointent_yor.yaml │ │ │ │ │ ├── injongointent_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── injongointent │ │ │ │ │ ├── injongointent_amh.yaml │ │ │ │ │ ├── injongointent_eng.yaml │ │ │ │ │ ├── injongointent_ewe.yaml │ │ │ │ │ ├── injongointent_hau.yaml │ │ │ │ │ ├── injongointent_ibo.yaml │ │ │ │ │ ├── injongointent_kin.yaml │ │ │ │ │ ├── injongointent_lin.yaml │ │ │ │ │ ├── injongointent_lug.yaml │ │ │ │ │ ├── injongointent_orm.yaml │ │ │ │ │ ├── injongointent_sna.yaml │ │ │ │ │ 
├── injongointent_sot.yaml │ │ │ │ │ ├── injongointent_swa.yaml │ │ │ │ │ ├── injongointent_twi.yaml │ │ │ │ │ ├── injongointent_wol.yaml │ │ │ │ │ ├── injongointent_xho.yaml │ │ │ │ │ ├── injongointent_yor.yaml │ │ │ │ │ ├── injongointent_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── injongointent │ │ │ │ │ ├── injongointent_amh.yaml │ │ │ │ │ ├── injongointent_eng.yaml │ │ │ │ │ ├── injongointent_ewe.yaml │ │ │ │ │ ├── injongointent_hau.yaml │ │ │ │ │ ├── injongointent_ibo.yaml │ │ │ │ │ ├── injongointent_kin.yaml │ │ │ │ │ ├── injongointent_lin.yaml │ │ │ │ │ ├── injongointent_lug.yaml │ │ │ │ │ ├── injongointent_orm.yaml │ │ │ │ │ ├── injongointent_sna.yaml │ │ │ │ │ ├── injongointent_sot.yaml │ │ │ │ │ ├── injongointent_swa.yaml │ │ │ │ │ ├── injongointent_twi.yaml │ │ │ │ │ ├── injongointent_wol.yaml │ │ │ │ │ ├── injongointent_xho.yaml │ │ │ │ │ ├── injongointent_yor.yaml │ │ │ │ │ ├── injongointent_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── injongointent │ │ │ │ │ ├── injongointent_amh.yaml │ │ │ │ │ ├── injongointent_eng.yaml │ │ │ │ │ ├── injongointent_ewe.yaml │ │ │ │ │ ├── injongointent_hau.yaml │ │ │ │ │ ├── injongointent_ibo.yaml │ │ │ │ │ ├── injongointent_kin.yaml │ │ │ │ │ ├── injongointent_lin.yaml │ │ │ │ │ ├── injongointent_lug.yaml │ │ │ │ │ ├── injongointent_orm.yaml │ │ │ │ │ ├── injongointent_sna.yaml │ │ │ │ │ ├── injongointent_sot.yaml │ │ │ │ │ ├── injongointent_swa.yaml │ │ │ │ │ ├── injongointent_twi.yaml │ │ │ │ │ ├── injongointent_wol.yaml │ │ │ │ │ ├── injongointent_xho.yaml │ │ │ │ │ ├── injongointent_yor.yaml │ │ │ │ │ ├── injongointent_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── prompt_5/ │ │ │ │ ├── injongointent │ │ │ │ ├── injongointent_amh.yaml │ │ │ │ ├── injongointent_eng.yaml │ │ │ │ ├── injongointent_ewe.yaml │ │ │ │ ├── injongointent_hau.yaml │ │ │ │ ├── injongointent_ibo.yaml │ │ │ │ ├── injongointent_kin.yaml │ │ │ │ ├── injongointent_lin.yaml │ │ │ │ ├── 
injongointent_lug.yaml │ │ │ │ ├── injongointent_orm.yaml │ │ │ │ ├── injongointent_sna.yaml │ │ │ │ ├── injongointent_sot.yaml │ │ │ │ ├── injongointent_swa.yaml │ │ │ │ ├── injongointent_twi.yaml │ │ │ │ ├── injongointent_wol.yaml │ │ │ │ ├── injongointent_xho.yaml │ │ │ │ ├── injongointent_yor.yaml │ │ │ │ ├── injongointent_zul.yaml │ │ │ │ └── utils.py │ │ │ ├── mafand/ │ │ │ │ ├── README.md │ │ │ │ ├── gen_utils.py │ │ │ │ ├── mafand.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── african-english/ │ │ │ │ │ │ ├── mafand │ │ │ │ │ │ ├── mafand_amh-en.yaml │ │ │ │ │ │ ├── mafand_bam-fr.yaml │ │ │ │ │ │ ├── mafand_bbj-fr.yaml │ │ │ │ │ │ ├── mafand_ewe-fr.yaml │ │ │ │ │ │ ├── mafand_fon-fr.yaml │ │ │ │ │ │ ├── mafand_hau-en.yaml │ │ │ │ │ │ ├── mafand_ibo-en.yaml │ │ │ │ │ │ ├── mafand_kin-en.yaml │ │ │ │ │ │ ├── mafand_lug-en.yaml │ │ │ │ │ │ ├── mafand_luo-en.yaml │ │ │ │ │ │ ├── mafand_mos-fr.yaml │ │ │ │ │ │ ├── mafand_nya-en.yaml │ │ │ │ │ │ ├── mafand_pcm-en.yaml │ │ │ │ │ │ ├── mafand_sna-en.yaml │ │ │ │ │ │ ├── mafand_swa-en.yaml │ │ │ │ │ │ ├── mafand_tsn-en.yaml │ │ │ │ │ │ ├── mafand_twi-en.yaml │ │ │ │ │ │ ├── mafand_wol-fr.yaml │ │ │ │ │ │ ├── mafand_xho-en.yaml │ │ │ │ │ │ ├── mafand_yor-en.yaml │ │ │ │ │ │ ├── mafand_zul-en.yaml │ │ │ │ │ │ └── utils.py │ │ │ │ │ └── english-african/ │ │ │ │ │ ├── mafand │ │ │ │ │ ├── mafand_en-amh.yaml │ │ │ │ │ ├── mafand_en-hau.yaml │ │ │ │ │ ├── mafand_en-ibo.yaml │ │ │ │ │ ├── mafand_en-kin.yaml │ │ │ │ │ ├── mafand_en-lug.yaml │ │ │ │ │ ├── mafand_en-luo.yaml │ │ │ │ │ ├── mafand_en-nya.yaml │ │ │ │ │ ├── mafand_en-pcm.yaml │ │ │ │ │ ├── mafand_en-sna.yaml │ │ │ │ │ ├── mafand_en-swa.yaml │ │ │ │ │ ├── mafand_en-tsn.yaml │ │ │ │ │ ├── mafand_en-twi.yaml │ │ │ │ │ ├── mafand_en-xho.yaml │ │ │ │ │ ├── mafand_en-yor.yaml │ │ │ │ │ ├── mafand_en-zul.yaml │ │ │ │ │ ├── mafand_fr-bam.yaml │ │ │ │ │ ├── mafand_fr-bbj.yaml │ │ │ │ │ ├── mafand_fr-ewe.yaml │ │ │ │ │ ├── mafand_fr-fon.yaml │ │ │ │ │ ├── mafand_fr-mos.yaml │ │ 
│ │ │ ├── mafand_fr-wol.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── african-english/ │ │ │ │ │ │ ├── mafand │ │ │ │ │ │ ├── mafand_amh-en.yaml │ │ │ │ │ │ ├── mafand_bam-fr.yaml │ │ │ │ │ │ ├── mafand_bbj-fr.yaml │ │ │ │ │ │ ├── mafand_ewe-fr.yaml │ │ │ │ │ │ ├── mafand_fon-fr.yaml │ │ │ │ │ │ ├── mafand_hau-en.yaml │ │ │ │ │ │ ├── mafand_ibo-en.yaml │ │ │ │ │ │ ├── mafand_kin-en.yaml │ │ │ │ │ │ ├── mafand_lug-en.yaml │ │ │ │ │ │ ├── mafand_luo-en.yaml │ │ │ │ │ │ ├── mafand_mos-fr.yaml │ │ │ │ │ │ ├── mafand_nya-en.yaml │ │ │ │ │ │ ├── mafand_pcm-en.yaml │ │ │ │ │ │ ├── mafand_sna-en.yaml │ │ │ │ │ │ ├── mafand_swa-en.yaml │ │ │ │ │ │ ├── mafand_tsn-en.yaml │ │ │ │ │ │ ├── mafand_twi-en.yaml │ │ │ │ │ │ ├── mafand_wol-fr.yaml │ │ │ │ │ │ ├── mafand_xho-en.yaml │ │ │ │ │ │ ├── mafand_yor-en.yaml │ │ │ │ │ │ ├── mafand_zul-en.yaml │ │ │ │ │ │ └── utils.py │ │ │ │ │ └── english-african/ │ │ │ │ │ ├── mafand │ │ │ │ │ ├── mafand_en-amh.yaml │ │ │ │ │ ├── mafand_en-hau.yaml │ │ │ │ │ ├── mafand_en-ibo.yaml │ │ │ │ │ ├── mafand_en-kin.yaml │ │ │ │ │ ├── mafand_en-lug.yaml │ │ │ │ │ ├── mafand_en-luo.yaml │ │ │ │ │ ├── mafand_en-nya.yaml │ │ │ │ │ ├── mafand_en-pcm.yaml │ │ │ │ │ ├── mafand_en-sna.yaml │ │ │ │ │ ├── mafand_en-swa.yaml │ │ │ │ │ ├── mafand_en-tsn.yaml │ │ │ │ │ ├── mafand_en-twi.yaml │ │ │ │ │ ├── mafand_en-xho.yaml │ │ │ │ │ ├── mafand_en-yor.yaml │ │ │ │ │ ├── mafand_en-zul.yaml │ │ │ │ │ ├── mafand_fr-bam.yaml │ │ │ │ │ ├── mafand_fr-bbj.yaml │ │ │ │ │ ├── mafand_fr-ewe.yaml │ │ │ │ │ ├── mafand_fr-fon.yaml │ │ │ │ │ ├── mafand_fr-mos.yaml │ │ │ │ │ ├── mafand_fr-wol.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── prompt_3/ │ │ │ │ ├── african-english/ │ │ │ │ │ ├── mafand │ │ │ │ │ ├── mafand_amh-en.yaml │ │ │ │ │ ├── mafand_bam-fr.yaml │ │ │ │ │ ├── mafand_bbj-fr.yaml │ │ │ │ │ ├── mafand_ewe-fr.yaml │ │ │ │ │ ├── mafand_fon-fr.yaml │ │ │ │ │ ├── mafand_hau-en.yaml │ │ │ │ │ ├── mafand_ibo-en.yaml │ │ │ │ │ ├── mafand_kin-en.yaml │ │ │ │ 
│ ├── mafand_lug-en.yaml │ │ │ │ │ ├── mafand_luo-en.yaml │ │ │ │ │ ├── mafand_mos-fr.yaml │ │ │ │ │ ├── mafand_nya-en.yaml │ │ │ │ │ ├── mafand_pcm-en.yaml │ │ │ │ │ ├── mafand_sna-en.yaml │ │ │ │ │ ├── mafand_swa-en.yaml │ │ │ │ │ ├── mafand_tsn-en.yaml │ │ │ │ │ ├── mafand_twi-en.yaml │ │ │ │ │ ├── mafand_wol-fr.yaml │ │ │ │ │ ├── mafand_xho-en.yaml │ │ │ │ │ ├── mafand_yor-en.yaml │ │ │ │ │ ├── mafand_zul-en.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── english-african/ │ │ │ │ ├── mafand │ │ │ │ ├── mafand_en-amh.yaml │ │ │ │ ├── mafand_en-hau.yaml │ │ │ │ ├── mafand_en-ibo.yaml │ │ │ │ ├── mafand_en-kin.yaml │ │ │ │ ├── mafand_en-lug.yaml │ │ │ │ ├── mafand_en-luo.yaml │ │ │ │ ├── mafand_en-nya.yaml │ │ │ │ ├── mafand_en-pcm.yaml │ │ │ │ ├── mafand_en-sna.yaml │ │ │ │ ├── mafand_en-swa.yaml │ │ │ │ ├── mafand_en-tsn.yaml │ │ │ │ ├── mafand_en-twi.yaml │ │ │ │ ├── mafand_en-xho.yaml │ │ │ │ ├── mafand_en-yor.yaml │ │ │ │ ├── mafand_en-zul.yaml │ │ │ │ ├── mafand_fr-bam.yaml │ │ │ │ ├── mafand_fr-bbj.yaml │ │ │ │ ├── mafand_fr-ewe.yaml │ │ │ │ ├── mafand_fr-fon.yaml │ │ │ │ ├── mafand_fr-mos.yaml │ │ │ │ ├── mafand_fr-wol.yaml │ │ │ │ └── utils.py │ │ │ ├── masakhaner/ │ │ │ │ ├── README.md │ │ │ │ ├── gen_utils.py │ │ │ │ ├── masakhaner.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── masakhaner │ │ │ │ │ ├── masakhaner_am.yaml │ │ │ │ │ ├── masakhaner_bbj.yaml │ │ │ │ │ ├── masakhaner_bm.yaml │ │ │ │ │ ├── masakhaner_ee.yaml │ │ │ │ │ ├── masakhaner_ha.yaml │ │ │ │ │ ├── masakhaner_ig.yaml │ │ │ │ │ ├── masakhaner_lg.yaml │ │ │ │ │ ├── masakhaner_luo.yaml │ │ │ │ │ ├── masakhaner_mos.yaml │ │ │ │ │ ├── masakhaner_ny.yaml │ │ │ │ │ ├── masakhaner_pcm.yaml │ │ │ │ │ ├── masakhaner_rw.yaml │ │ │ │ │ ├── masakhaner_sn.yaml │ │ │ │ │ ├── masakhaner_sw.yaml │ │ │ │ │ ├── masakhaner_tn.yaml │ │ │ │ │ ├── masakhaner_tw.yaml │ │ │ │ │ ├── masakhaner_wo.yaml │ │ │ │ │ ├── masakhaner_xh.yaml │ │ │ │ │ ├── masakhaner_yo.yaml │ │ │ │ │ ├── masakhaner_zu.yaml │ │ │ │ │ └── utils.py │ │ 
│ │ ├── prompt_2/ │ │ │ │ │ ├── masakhaner │ │ │ │ │ ├── masakhaner_am.yaml │ │ │ │ │ ├── masakhaner_bbj.yaml │ │ │ │ │ ├── masakhaner_bm.yaml │ │ │ │ │ ├── masakhaner_ee.yaml │ │ │ │ │ ├── masakhaner_ha.yaml │ │ │ │ │ ├── masakhaner_ig.yaml │ │ │ │ │ ├── masakhaner_lg.yaml │ │ │ │ │ ├── masakhaner_luo.yaml │ │ │ │ │ ├── masakhaner_mos.yaml │ │ │ │ │ ├── masakhaner_ny.yaml │ │ │ │ │ ├── masakhaner_pcm.yaml │ │ │ │ │ ├── masakhaner_rw.yaml │ │ │ │ │ ├── masakhaner_sn.yaml │ │ │ │ │ ├── masakhaner_sw.yaml │ │ │ │ │ ├── masakhaner_tn.yaml │ │ │ │ │ ├── masakhaner_tw.yaml │ │ │ │ │ ├── masakhaner_wo.yaml │ │ │ │ │ ├── masakhaner_xh.yaml │ │ │ │ │ ├── masakhaner_yo.yaml │ │ │ │ │ ├── masakhaner_zu.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── masakhaner │ │ │ │ │ ├── masakhaner_am.yaml │ │ │ │ │ ├── masakhaner_bbj.yaml │ │ │ │ │ ├── masakhaner_bm.yaml │ │ │ │ │ ├── masakhaner_ee.yaml │ │ │ │ │ ├── masakhaner_ha.yaml │ │ │ │ │ ├── masakhaner_ig.yaml │ │ │ │ │ ├── masakhaner_lg.yaml │ │ │ │ │ ├── masakhaner_luo.yaml │ │ │ │ │ ├── masakhaner_mos.yaml │ │ │ │ │ ├── masakhaner_ny.yaml │ │ │ │ │ ├── masakhaner_pcm.yaml │ │ │ │ │ ├── masakhaner_rw.yaml │ │ │ │ │ ├── masakhaner_sn.yaml │ │ │ │ │ ├── masakhaner_sw.yaml │ │ │ │ │ ├── masakhaner_tn.yaml │ │ │ │ │ ├── masakhaner_tw.yaml │ │ │ │ │ ├── masakhaner_wo.yaml │ │ │ │ │ ├── masakhaner_xh.yaml │ │ │ │ │ ├── masakhaner_yo.yaml │ │ │ │ │ ├── masakhaner_zu.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── masakhaner │ │ │ │ │ ├── masakhaner_am.yaml │ │ │ │ │ ├── masakhaner_bbj.yaml │ │ │ │ │ ├── masakhaner_bm.yaml │ │ │ │ │ ├── masakhaner_ee.yaml │ │ │ │ │ ├── masakhaner_ha.yaml │ │ │ │ │ ├── masakhaner_ig.yaml │ │ │ │ │ ├── masakhaner_lg.yaml │ │ │ │ │ ├── masakhaner_luo.yaml │ │ │ │ │ ├── masakhaner_mos.yaml │ │ │ │ │ ├── masakhaner_ny.yaml │ │ │ │ │ ├── masakhaner_pcm.yaml │ │ │ │ │ ├── masakhaner_rw.yaml │ │ │ │ │ ├── masakhaner_sn.yaml │ │ │ │ │ ├── masakhaner_sw.yaml │ │ │ │ │ ├── 
masakhaner_tn.yaml │ │ │ │ │ ├── masakhaner_tw.yaml │ │ │ │ │ ├── masakhaner_wo.yaml │ │ │ │ │ ├── masakhaner_xh.yaml │ │ │ │ │ ├── masakhaner_yo.yaml │ │ │ │ │ ├── masakhaner_zu.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── prompt_5/ │ │ │ │ ├── masakhaner │ │ │ │ ├── masakhaner_am.yaml │ │ │ │ ├── masakhaner_bbj.yaml │ │ │ │ ├── masakhaner_bm.yaml │ │ │ │ ├── masakhaner_ee.yaml │ │ │ │ ├── masakhaner_ha.yaml │ │ │ │ ├── masakhaner_ig.yaml │ │ │ │ ├── masakhaner_lg.yaml │ │ │ │ ├── masakhaner_luo.yaml │ │ │ │ ├── masakhaner_mos.yaml │ │ │ │ ├── masakhaner_ny.yaml │ │ │ │ ├── masakhaner_pcm.yaml │ │ │ │ ├── masakhaner_rw.yaml │ │ │ │ ├── masakhaner_sn.yaml │ │ │ │ ├── masakhaner_sw.yaml │ │ │ │ ├── masakhaner_tn.yaml │ │ │ │ ├── masakhaner_tw.yaml │ │ │ │ ├── masakhaner_wo.yaml │ │ │ │ ├── masakhaner_xh.yaml │ │ │ │ ├── masakhaner_yo.yaml │ │ │ │ ├── masakhaner_zu.yaml │ │ │ │ └── utils.py │ │ │ ├── masakhanews/ │ │ │ │ ├── README.md │ │ │ │ ├── masakhanews.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── masakhanews │ │ │ │ │ ├── masakhanews_amh.yaml │ │ │ │ │ ├── masakhanews_eng.yaml │ │ │ │ │ ├── masakhanews_fra.yaml │ │ │ │ │ ├── masakhanews_hau.yaml │ │ │ │ │ ├── masakhanews_ibo.yaml │ │ │ │ │ ├── masakhanews_lin.yaml │ │ │ │ │ ├── masakhanews_lug.yaml │ │ │ │ │ ├── masakhanews_orm.yaml │ │ │ │ │ ├── masakhanews_pcm.yaml │ │ │ │ │ ├── masakhanews_run.yaml │ │ │ │ │ ├── masakhanews_sna.yaml │ │ │ │ │ ├── masakhanews_som.yaml │ │ │ │ │ ├── masakhanews_swa.yaml │ │ │ │ │ ├── masakhanews_tir.yaml │ │ │ │ │ ├── masakhanews_xho.yaml │ │ │ │ │ ├── masakhanews_yor.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── masakhanews │ │ │ │ │ ├── masakhanews_amh.yaml │ │ │ │ │ ├── masakhanews_eng.yaml │ │ │ │ │ ├── masakhanews_fra.yaml │ │ │ │ │ ├── masakhanews_hau.yaml │ │ │ │ │ ├── masakhanews_ibo.yaml │ │ │ │ │ ├── masakhanews_lin.yaml │ │ │ │ │ ├── masakhanews_lug.yaml │ │ │ │ │ ├── masakhanews_orm.yaml │ │ │ │ │ ├── masakhanews_pcm.yaml │ │ │ │ │ ├── masakhanews_run.yaml 
│ │ │ │ │ ├── masakhanews_sna.yaml │ │ │ │ │ ├── masakhanews_som.yaml │ │ │ │ │ ├── masakhanews_swa.yaml │ │ │ │ │ ├── masakhanews_tir.yaml │ │ │ │ │ ├── masakhanews_xho.yaml │ │ │ │ │ ├── masakhanews_yor.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── masakhanews │ │ │ │ │ ├── masakhanews_amh.yaml │ │ │ │ │ ├── masakhanews_eng.yaml │ │ │ │ │ ├── masakhanews_fra.yaml │ │ │ │ │ ├── masakhanews_hau.yaml │ │ │ │ │ ├── masakhanews_ibo.yaml │ │ │ │ │ ├── masakhanews_lin.yaml │ │ │ │ │ ├── masakhanews_lug.yaml │ │ │ │ │ ├── masakhanews_orm.yaml │ │ │ │ │ ├── masakhanews_pcm.yaml │ │ │ │ │ ├── masakhanews_run.yaml │ │ │ │ │ ├── masakhanews_sna.yaml │ │ │ │ │ ├── masakhanews_som.yaml │ │ │ │ │ ├── masakhanews_swa.yaml │ │ │ │ │ ├── masakhanews_tir.yaml │ │ │ │ │ ├── masakhanews_xho.yaml │ │ │ │ │ ├── masakhanews_yor.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── masakhanews │ │ │ │ │ ├── masakhanews_amh.yaml │ │ │ │ │ ├── masakhanews_eng.yaml │ │ │ │ │ ├── masakhanews_fra.yaml │ │ │ │ │ ├── masakhanews_hau.yaml │ │ │ │ │ ├── masakhanews_ibo.yaml │ │ │ │ │ ├── masakhanews_lin.yaml │ │ │ │ │ ├── masakhanews_lug.yaml │ │ │ │ │ ├── masakhanews_orm.yaml │ │ │ │ │ ├── masakhanews_pcm.yaml │ │ │ │ │ ├── masakhanews_run.yaml │ │ │ │ │ ├── masakhanews_sna.yaml │ │ │ │ │ ├── masakhanews_som.yaml │ │ │ │ │ ├── masakhanews_swa.yaml │ │ │ │ │ ├── masakhanews_tir.yaml │ │ │ │ │ ├── masakhanews_xho.yaml │ │ │ │ │ ├── masakhanews_yor.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_5/ │ │ │ │ │ ├── masakhanews │ │ │ │ │ ├── masakhanews_amh.yaml │ │ │ │ │ ├── masakhanews_eng.yaml │ │ │ │ │ ├── masakhanews_fra.yaml │ │ │ │ │ ├── masakhanews_hau.yaml │ │ │ │ │ ├── masakhanews_ibo.yaml │ │ │ │ │ ├── masakhanews_lin.yaml │ │ │ │ │ ├── masakhanews_lug.yaml │ │ │ │ │ ├── masakhanews_orm.yaml │ │ │ │ │ ├── masakhanews_pcm.yaml │ │ │ │ │ ├── masakhanews_run.yaml │ │ │ │ │ ├── masakhanews_sna.yaml │ │ │ │ │ ├── masakhanews_som.yaml │ │ │ │ │ ├── masakhanews_swa.yaml │ 
│ │ │ │ ├── masakhanews_tir.yaml │ │ │ │ │ ├── masakhanews_xho.yaml │ │ │ │ │ ├── masakhanews_yor.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── utils.py │ │ │ ├── masakhapos/ │ │ │ │ ├── README.md │ │ │ │ ├── gen_utils.py │ │ │ │ ├── masakhapos.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── masakhapos_bam.yaml │ │ │ │ │ ├── masakhapos_bbj.yaml │ │ │ │ │ ├── masakhapos_ewe.yaml │ │ │ │ │ ├── masakhapos_fon.yaml │ │ │ │ │ ├── masakhapos_hau.yaml │ │ │ │ │ ├── masakhapos_ibo.yaml │ │ │ │ │ ├── masakhapos_kin.yaml │ │ │ │ │ ├── masakhapos_lug.yaml │ │ │ │ │ ├── masakhapos_luo.yaml │ │ │ │ │ ├── masakhapos_mos.yaml │ │ │ │ │ ├── masakhapos_nya.yaml │ │ │ │ │ ├── masakhapos_pcm.yaml │ │ │ │ │ ├── masakhapos_sna.yaml │ │ │ │ │ ├── masakhapos_swa.yaml │ │ │ │ │ ├── masakhapos_tsn.yaml │ │ │ │ │ ├── masakhapos_twi.yaml │ │ │ │ │ ├── masakhapos_wol.yaml │ │ │ │ │ ├── masakhapos_xho.yaml │ │ │ │ │ ├── masakhapos_yaml │ │ │ │ │ ├── masakhapos_yor.yaml │ │ │ │ │ ├── masakhapos_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── masakhapos_bam.yaml │ │ │ │ │ ├── masakhapos_bbj.yaml │ │ │ │ │ ├── masakhapos_ewe.yaml │ │ │ │ │ ├── masakhapos_fon.yaml │ │ │ │ │ ├── masakhapos_hau.yaml │ │ │ │ │ ├── masakhapos_ibo.yaml │ │ │ │ │ ├── masakhapos_kin.yaml │ │ │ │ │ ├── masakhapos_lug.yaml │ │ │ │ │ ├── masakhapos_luo.yaml │ │ │ │ │ ├── masakhapos_mos.yaml │ │ │ │ │ ├── masakhapos_nya.yaml │ │ │ │ │ ├── masakhapos_pcm.yaml │ │ │ │ │ ├── masakhapos_sna.yaml │ │ │ │ │ ├── masakhapos_swa.yaml │ │ │ │ │ ├── masakhapos_tsn.yaml │ │ │ │ │ ├── masakhapos_twi.yaml │ │ │ │ │ ├── masakhapos_wol.yaml │ │ │ │ │ ├── masakhapos_xho.yaml │ │ │ │ │ ├── masakhapos_yaml │ │ │ │ │ ├── masakhapos_yor.yaml │ │ │ │ │ ├── masakhapos_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── masakhapos_bam.yaml │ │ │ │ │ ├── masakhapos_bbj.yaml │ │ │ │ │ ├── masakhapos_ewe.yaml │ │ │ │ │ ├── masakhapos_fon.yaml │ │ │ │ │ ├── masakhapos_hau.yaml │ │ │ │ │ ├── masakhapos_ibo.yaml │ │ │ │ │ ├── 
masakhapos_kin.yaml │ │ │ │ │ ├── masakhapos_lug.yaml │ │ │ │ │ ├── masakhapos_luo.yaml │ │ │ │ │ ├── masakhapos_mos.yaml │ │ │ │ │ ├── masakhapos_nya.yaml │ │ │ │ │ ├── masakhapos_pcm.yaml │ │ │ │ │ ├── masakhapos_sna.yaml │ │ │ │ │ ├── masakhapos_swa.yaml │ │ │ │ │ ├── masakhapos_tsn.yaml │ │ │ │ │ ├── masakhapos_twi.yaml │ │ │ │ │ ├── masakhapos_wol.yaml │ │ │ │ │ ├── masakhapos_xho.yaml │ │ │ │ │ ├── masakhapos_yaml │ │ │ │ │ ├── masakhapos_yor.yaml │ │ │ │ │ ├── masakhapos_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── masakhapos_bam.yaml │ │ │ │ │ ├── masakhapos_bbj.yaml │ │ │ │ │ ├── masakhapos_ewe.yaml │ │ │ │ │ ├── masakhapos_fon.yaml │ │ │ │ │ ├── masakhapos_hau.yaml │ │ │ │ │ ├── masakhapos_ibo.yaml │ │ │ │ │ ├── masakhapos_kin.yaml │ │ │ │ │ ├── masakhapos_lug.yaml │ │ │ │ │ ├── masakhapos_luo.yaml │ │ │ │ │ ├── masakhapos_mos.yaml │ │ │ │ │ ├── masakhapos_nya.yaml │ │ │ │ │ ├── masakhapos_pcm.yaml │ │ │ │ │ ├── masakhapos_sna.yaml │ │ │ │ │ ├── masakhapos_swa.yaml │ │ │ │ │ ├── masakhapos_tsn.yaml │ │ │ │ │ ├── masakhapos_twi.yaml │ │ │ │ │ ├── masakhapos_wol.yaml │ │ │ │ │ ├── masakhapos_xho.yaml │ │ │ │ │ ├── masakhapos_yaml │ │ │ │ │ ├── masakhapos_yor.yaml │ │ │ │ │ ├── masakhapos_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_5/ │ │ │ │ │ ├── masakhapos_bam.yaml │ │ │ │ │ ├── masakhapos_bbj.yaml │ │ │ │ │ ├── masakhapos_ewe.yaml │ │ │ │ │ ├── masakhapos_fon.yaml │ │ │ │ │ ├── masakhapos_hau.yaml │ │ │ │ │ ├── masakhapos_ibo.yaml │ │ │ │ │ ├── masakhapos_kin.yaml │ │ │ │ │ ├── masakhapos_lug.yaml │ │ │ │ │ ├── masakhapos_luo.yaml │ │ │ │ │ ├── masakhapos_mos.yaml │ │ │ │ │ ├── masakhapos_nya.yaml │ │ │ │ │ ├── masakhapos_pcm.yaml │ │ │ │ │ ├── masakhapos_sna.yaml │ │ │ │ │ ├── masakhapos_swa.yaml │ │ │ │ │ ├── masakhapos_tsn.yaml │ │ │ │ │ ├── masakhapos_twi.yaml │ │ │ │ │ ├── masakhapos_wol.yaml │ │ │ │ │ ├── masakhapos_xho.yaml │ │ │ │ │ ├── masakhapos_yaml │ │ │ │ │ ├── masakhapos_yor.yaml │ │ │ │ │ ├── 
masakhapos_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── utils.py │ │ │ ├── naijarc/ │ │ │ │ ├── README.md │ │ │ │ ├── naijarc.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── naijarc │ │ │ │ │ ├── naijarc_hau.yaml │ │ │ │ │ ├── naijarc_ibo.yaml │ │ │ │ │ └── naijarc_yor.yaml │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── naijarc │ │ │ │ │ ├── naijarc_hau.yaml │ │ │ │ │ ├── naijarc_ibo.yaml │ │ │ │ │ └── naijarc_yor.yaml │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── naijarc │ │ │ │ │ ├── naijarc_hau.yaml │ │ │ │ │ ├── naijarc_ibo.yaml │ │ │ │ │ └── naijarc_yor.yaml │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── naijarc │ │ │ │ │ ├── naijarc_hau.yaml │ │ │ │ │ ├── naijarc_ibo.yaml │ │ │ │ │ └── naijarc_yor.yaml │ │ │ │ ├── prompt_5/ │ │ │ │ │ ├── naijarc │ │ │ │ │ ├── naijarc_hau.yaml │ │ │ │ │ ├── naijarc_ibo.yaml │ │ │ │ │ └── naijarc_yor.yaml │ │ │ │ └── utils.py │ │ │ ├── nollysenti/ │ │ │ │ ├── README.md │ │ │ │ ├── nollysenti.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── nollysenti │ │ │ │ │ ├── nollysenti_eng.yaml │ │ │ │ │ ├── nollysenti_hau.yaml │ │ │ │ │ ├── nollysenti_ibo.yaml │ │ │ │ │ ├── nollysenti_pcm.yaml │ │ │ │ │ ├── nollysenti_yor.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── nollysenti │ │ │ │ │ ├── nollysenti_eng.yaml │ │ │ │ │ ├── nollysenti_hau.yaml │ │ │ │ │ ├── nollysenti_ibo.yaml │ │ │ │ │ ├── nollysenti_pcm.yaml │ │ │ │ │ ├── nollysenti_yor.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── nollysenti │ │ │ │ │ ├── nollysenti_eng.yaml │ │ │ │ │ ├── nollysenti_hau.yaml │ │ │ │ │ ├── nollysenti_ibo.yaml │ │ │ │ │ ├── nollysenti_pcm.yaml │ │ │ │ │ ├── nollysenti_yor.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── nollysenti │ │ │ │ │ ├── nollysenti_eng.yaml │ │ │ │ │ ├── nollysenti_hau.yaml │ │ │ │ │ ├── nollysenti_ibo.yaml │ │ │ │ │ ├── nollysenti_pcm.yaml │ │ │ │ │ ├── nollysenti_yor.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── prompt_5/ │ │ │ │ ├── nollysenti │ │ │ │ ├── nollysenti_eng.yaml │ │ │ │ ├── nollysenti_hau.yaml │ │ │ │ ├── 
nollysenti_ibo.yaml │ │ │ │ ├── nollysenti_pcm.yaml │ │ │ │ ├── nollysenti_yor.yaml │ │ │ │ └── utils.py │ │ │ ├── ntrex/ │ │ │ │ ├── README.md │ │ │ │ ├── gen_utils.py │ │ │ │ ├── ntrex.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── african-english/ │ │ │ │ │ │ ├── ntrex │ │ │ │ │ │ ├── ntrex_afr_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_amh_Ethi-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_arb_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_bem_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_ewe_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_fra_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_hau_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_ibo_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_kin_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_mey_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_mlg_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_msa_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_nde_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_nso_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_nya_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_orm_Ethi-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_shi_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_sna_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_som_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_ssw_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_swa_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_tam_Taml-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_tel_Telu-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_tir_Ethi-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_ton_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_tsn_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_urd_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_ven_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_wol_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_xho_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_yor_Latn-eng_Latn.yaml │ │ │ │ │ │ └── ntrex_zul_Latn-eng_Latn.yaml │ │ │ │ │ └── english-african/ │ │ │ │ │ ├── ntrex │ │ │ │ │ ├── ntrex_eng_Latn-afr_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-amh_Ethi.yaml │ │ │ │ │ ├── ntrex_eng_Latn-arb_Arab.yaml │ │ │ │ │ ├── ntrex_eng_Latn-bem_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-ewe_Latn.yaml │ │ │ │ │ ├── 
ntrex_eng_Latn-fra_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-hau_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-ibo_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-kin_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-mey_Arab.yaml │ │ │ │ │ ├── ntrex_eng_Latn-mlg_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-msa_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-nde_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-nso_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-nya_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-orm_Ethi.yaml │ │ │ │ │ ├── ntrex_eng_Latn-shi_Arab.yaml │ │ │ │ │ ├── ntrex_eng_Latn-sna_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-som_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-ssw_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-swa_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-tam_Taml.yaml │ │ │ │ │ ├── ntrex_eng_Latn-tel_Telu.yaml │ │ │ │ │ ├── ntrex_eng_Latn-tir_Ethi.yaml │ │ │ │ │ ├── ntrex_eng_Latn-ton_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-tsn_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-urd_Arab.yaml │ │ │ │ │ ├── ntrex_eng_Latn-ven_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-wol_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-xho_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-yor_Latn.yaml │ │ │ │ │ └── ntrex_eng_Latn-zul_Latn.yaml │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── african-english/ │ │ │ │ │ │ ├── ntrex │ │ │ │ │ │ ├── ntrex_afr_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_amh_Ethi-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_arb_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_bem_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_ewe_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_fra_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_hau_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_ibo_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_kin_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_mey_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_mlg_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_msa_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_nde_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_nso_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_nya_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_orm_Ethi-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_shi_Arab-eng_Latn.yaml │ │ │ │ │ │ 
├── ntrex_sna_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_som_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_ssw_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_swa_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_tam_Taml-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_tel_Telu-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_tir_Ethi-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_ton_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_tsn_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_urd_Arab-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_ven_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_wol_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_xho_Latn-eng_Latn.yaml │ │ │ │ │ │ ├── ntrex_yor_Latn-eng_Latn.yaml │ │ │ │ │ │ └── ntrex_zul_Latn-eng_Latn.yaml │ │ │ │ │ └── english-african/ │ │ │ │ │ ├── ntrex │ │ │ │ │ ├── ntrex_eng_Latn-afr_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-amh_Ethi.yaml │ │ │ │ │ ├── ntrex_eng_Latn-arb_Arab.yaml │ │ │ │ │ ├── ntrex_eng_Latn-bem_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-ewe_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-fra_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-hau_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-ibo_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-kin_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-mey_Arab.yaml │ │ │ │ │ ├── ntrex_eng_Latn-mlg_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-msa_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-nde_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-nso_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-nya_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-orm_Ethi.yaml │ │ │ │ │ ├── ntrex_eng_Latn-shi_Arab.yaml │ │ │ │ │ ├── ntrex_eng_Latn-sna_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-som_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-ssw_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-swa_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-tam_Taml.yaml │ │ │ │ │ ├── ntrex_eng_Latn-tel_Telu.yaml │ │ │ │ │ ├── ntrex_eng_Latn-tir_Ethi.yaml │ │ │ │ │ ├── ntrex_eng_Latn-ton_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-tsn_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-urd_Arab.yaml │ │ │ │ │ ├── ntrex_eng_Latn-ven_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-wol_Latn.yaml │ │ │ │ │ ├── 
ntrex_eng_Latn-xho_Latn.yaml │ │ │ │ │ ├── ntrex_eng_Latn-yor_Latn.yaml │ │ │ │ │ └── ntrex_eng_Latn-zul_Latn.yaml │ │ │ │ └── prompt_3/ │ │ │ │ ├── african-english/ │ │ │ │ │ ├── ntrex │ │ │ │ │ ├── ntrex_afr_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_amh_Ethi-eng_Latn.yaml │ │ │ │ │ ├── ntrex_arb_Arab-eng_Latn.yaml │ │ │ │ │ ├── ntrex_bem_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_ewe_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_fra_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_hau_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_ibo_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_kin_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_mey_Arab-eng_Latn.yaml │ │ │ │ │ ├── ntrex_mlg_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_msa_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_nde_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_nso_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_nya_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_orm_Ethi-eng_Latn.yaml │ │ │ │ │ ├── ntrex_shi_Arab-eng_Latn.yaml │ │ │ │ │ ├── ntrex_sna_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_som_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_ssw_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_swa_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_tam_Taml-eng_Latn.yaml │ │ │ │ │ ├── ntrex_tel_Telu-eng_Latn.yaml │ │ │ │ │ ├── ntrex_tir_Ethi-eng_Latn.yaml │ │ │ │ │ ├── ntrex_ton_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_tsn_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_urd_Arab-eng_Latn.yaml │ │ │ │ │ ├── ntrex_ven_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_wol_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_xho_Latn-eng_Latn.yaml │ │ │ │ │ ├── ntrex_yor_Latn-eng_Latn.yaml │ │ │ │ │ └── ntrex_zul_Latn-eng_Latn.yaml │ │ │ │ └── english-african/ │ │ │ │ ├── ntrex │ │ │ │ ├── ntrex_eng_Latn-afr_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-amh_Ethi.yaml │ │ │ │ ├── ntrex_eng_Latn-arb_Arab.yaml │ │ │ │ ├── ntrex_eng_Latn-bem_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-ewe_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-fra_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-hau_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-ibo_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-kin_Latn.yaml │ │ │ │ ├── 
ntrex_eng_Latn-mey_Arab.yaml │ │ │ │ ├── ntrex_eng_Latn-mlg_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-msa_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-nde_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-nso_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-nya_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-orm_Ethi.yaml │ │ │ │ ├── ntrex_eng_Latn-shi_Arab.yaml │ │ │ │ ├── ntrex_eng_Latn-sna_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-som_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-ssw_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-swa_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-tam_Taml.yaml │ │ │ │ ├── ntrex_eng_Latn-tel_Telu.yaml │ │ │ │ ├── ntrex_eng_Latn-tir_Ethi.yaml │ │ │ │ ├── ntrex_eng_Latn-ton_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-tsn_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-urd_Arab.yaml │ │ │ │ ├── ntrex_eng_Latn-ven_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-wol_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-xho_Latn.yaml │ │ │ │ ├── ntrex_eng_Latn-yor_Latn.yaml │ │ │ │ └── ntrex_eng_Latn-zul_Latn.yaml │ │ │ ├── openai_mmlu/ │ │ │ │ ├── README.md │ │ │ │ ├── openai_mmlu.yaml │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── openai_mmlu │ │ │ │ │ ├── openai_mmlu_ara.yaml │ │ │ │ │ ├── openai_mmlu_swa.yaml │ │ │ │ │ └── openai_mmlu_yor.yaml │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── openai_mmlu │ │ │ │ │ ├── openai_mmlu_ara.yaml │ │ │ │ │ ├── openai_mmlu_swa.yaml │ │ │ │ │ └── openai_mmlu_yor.yaml │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── openai_mmlu │ │ │ │ │ ├── openai_mmlu_ara.yaml │ │ │ │ │ ├── openai_mmlu_swa.yaml │ │ │ │ │ └── openai_mmlu_yor.yaml │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── openai_mmlu │ │ │ │ │ ├── openai_mmlu_ara.yaml │ │ │ │ │ ├── openai_mmlu_swa.yaml │ │ │ │ │ └── openai_mmlu_yor.yaml │ │ │ │ ├── prompt_5/ │ │ │ │ │ ├── openai_mmlu │ │ │ │ │ ├── openai_mmlu_ara.yaml │ │ │ │ │ ├── openai_mmlu_swa.yaml │ │ │ │ │ └── openai_mmlu_yor.yaml │ │ │ │ └── utils.py │ │ │ ├── salt/ │ │ │ │ ├── README.md │ │ │ │ ├── gen_utils.py │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── salt │ │ │ │ │ ├── salt_ach-eng.yaml │ │ │ │ │ ├── salt_eng-ach.yaml │ │ │ │ │ ├── salt_eng-ibo.yaml │ │ │ │ 
│ ├── salt_eng-lgg.yaml │ │ │ │ │ ├── salt_eng-lug.yaml │ │ │ │ │ ├── salt_eng-nyn.yaml │ │ │ │ │ ├── salt_eng-swa.yaml │ │ │ │ │ ├── salt_eng-teo.yaml │ │ │ │ │ ├── salt_ibo-eng.yaml │ │ │ │ │ ├── salt_lgg-eng.yaml │ │ │ │ │ ├── salt_lug-eng.yaml │ │ │ │ │ ├── salt_nyn-eng.yaml │ │ │ │ │ ├── salt_swa-eng.yaml │ │ │ │ │ └── salt_teo-eng.yaml │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── salt │ │ │ │ │ ├── salt_ach-eng.yaml │ │ │ │ │ ├── salt_eng-ach.yaml │ │ │ │ │ ├── salt_eng-ibo.yaml │ │ │ │ │ ├── salt_eng-lgg.yaml │ │ │ │ │ ├── salt_eng-lug.yaml │ │ │ │ │ ├── salt_eng-nyn.yaml │ │ │ │ │ ├── salt_eng-swa.yaml │ │ │ │ │ ├── salt_eng-teo.yaml │ │ │ │ │ ├── salt_ibo-eng.yaml │ │ │ │ │ ├── salt_lgg-eng.yaml │ │ │ │ │ ├── salt_lug-eng.yaml │ │ │ │ │ ├── salt_nyn-eng.yaml │ │ │ │ │ ├── salt_swa-eng.yaml │ │ │ │ │ └── salt_teo-eng.yaml │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── salt │ │ │ │ │ ├── salt_ach-eng.yaml │ │ │ │ │ ├── salt_eng-ach.yaml │ │ │ │ │ ├── salt_eng-ibo.yaml │ │ │ │ │ ├── salt_eng-lgg.yaml │ │ │ │ │ ├── salt_eng-lug.yaml │ │ │ │ │ ├── salt_eng-nyn.yaml │ │ │ │ │ ├── salt_eng-swa.yaml │ │ │ │ │ ├── salt_eng-teo.yaml │ │ │ │ │ ├── salt_ibo-eng.yaml │ │ │ │ │ ├── salt_lgg-eng.yaml │ │ │ │ │ ├── salt_lug-eng.yaml │ │ │ │ │ ├── salt_nyn-eng.yaml │ │ │ │ │ ├── salt_swa-eng.yaml │ │ │ │ │ └── salt_teo-eng.yaml │ │ │ │ └── salt.yaml │ │ │ ├── sample_run_scripts/ │ │ │ │ ├── run_afrobench.sh │ │ │ │ └── run_afrobench_lite.sh │ │ │ ├── sib/ │ │ │ │ ├── README.md │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── sib │ │ │ │ │ ├── sib_aeb.yaml │ │ │ │ │ ├── sib_afr.yaml │ │ │ │ │ ├── sib_aka.yaml │ │ │ │ │ ├── sib_amh.yaml │ │ │ │ │ ├── sib_ary.yaml │ │ │ │ │ ├── sib_arz.yaml │ │ │ │ │ ├── sib_bam.yaml │ │ │ │ │ ├── sib_bem.yaml │ │ │ │ │ ├── sib_cjk.yaml │ │ │ │ │ ├── sib_dik.yaml │ │ │ │ │ ├── sib_dyu.yaml │ │ │ │ │ ├── sib_eng.yaml │ │ │ │ │ ├── sib_ewe.yaml │ │ │ │ │ ├── sib_fon.yaml │ │ │ │ │ ├── sib_fra.yaml │ │ │ │ │ ├── sib_fuv.yaml │ │ │ │ │ ├── sib_gaz.yaml │ │ │ │ │ ├── sib_hau.yaml 
│ │ │ │ │ ├── sib_ibo.yaml │ │ │ │ │ ├── sib_kab.yaml │ │ │ │ │ ├── sib_kam.yaml │ │ │ │ │ ├── sib_kbp.yaml │ │ │ │ │ ├── sib_kea.yaml │ │ │ │ │ ├── sib_kik.yaml │ │ │ │ │ ├── sib_kin.yaml │ │ │ │ │ ├── sib_kmb.yaml │ │ │ │ │ ├── sib_knc.yaml │ │ │ │ │ ├── sib_kon.yaml │ │ │ │ │ ├── sib_lin.yaml │ │ │ │ │ ├── sib_lua.yaml │ │ │ │ │ ├── sib_lug.yaml │ │ │ │ │ ├── sib_luo.yaml │ │ │ │ │ ├── sib_mos.yaml │ │ │ │ │ ├── sib_nso.yaml │ │ │ │ │ ├── sib_nus.yaml │ │ │ │ │ ├── sib_nya.yaml │ │ │ │ │ ├── sib_plt.yaml │ │ │ │ │ ├── sib_por.yaml │ │ │ │ │ ├── sib_run.yaml │ │ │ │ │ ├── sib_sag.yaml │ │ │ │ │ ├── sib_sna.yaml │ │ │ │ │ ├── sib_som.yaml │ │ │ │ │ ├── sib_sot.yaml │ │ │ │ │ ├── sib_ssw.yaml │ │ │ │ │ ├── sib_swa.yaml │ │ │ │ │ ├── sib_taq.yaml │ │ │ │ │ ├── sib_tir.yaml │ │ │ │ │ ├── sib_tso.yaml │ │ │ │ │ ├── sib_tum.yaml │ │ │ │ │ ├── sib_twi.yaml │ │ │ │ │ ├── sib_tzm.yaml │ │ │ │ │ ├── sib_umb.yaml │ │ │ │ │ ├── sib_wol.yaml │ │ │ │ │ ├── sib_xho.yaml │ │ │ │ │ ├── sib_yor.yaml │ │ │ │ │ ├── sib_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── sib │ │ │ │ │ ├── sib_aeb.yaml │ │ │ │ │ ├── sib_afr.yaml │ │ │ │ │ ├── sib_aka.yaml │ │ │ │ │ ├── sib_amh.yaml │ │ │ │ │ ├── sib_ary.yaml │ │ │ │ │ ├── sib_arz.yaml │ │ │ │ │ ├── sib_bam.yaml │ │ │ │ │ ├── sib_bem.yaml │ │ │ │ │ ├── sib_cjk.yaml │ │ │ │ │ ├── sib_dik.yaml │ │ │ │ │ ├── sib_dyu.yaml │ │ │ │ │ ├── sib_eng.yaml │ │ │ │ │ ├── sib_ewe.yaml │ │ │ │ │ ├── sib_fon.yaml │ │ │ │ │ ├── sib_fra.yaml │ │ │ │ │ ├── sib_fuv.yaml │ │ │ │ │ ├── sib_gaz.yaml │ │ │ │ │ ├── sib_hau.yaml │ │ │ │ │ ├── sib_ibo.yaml │ │ │ │ │ ├── sib_kab.yaml │ │ │ │ │ ├── sib_kam.yaml │ │ │ │ │ ├── sib_kbp.yaml │ │ │ │ │ ├── sib_kea.yaml │ │ │ │ │ ├── sib_kik.yaml │ │ │ │ │ ├── sib_kin.yaml │ │ │ │ │ ├── sib_kmb.yaml │ │ │ │ │ ├── sib_knc.yaml │ │ │ │ │ ├── sib_kon.yaml │ │ │ │ │ ├── sib_lin.yaml │ │ │ │ │ ├── sib_lua.yaml │ │ │ │ │ ├── sib_lug.yaml │ │ │ │ │ ├── sib_luo.yaml │ │ │ │ │ ├── sib_mos.yaml │ │ │ │ │ ├── 
sib_nso.yaml │ │ │ │ │ ├── sib_nus.yaml │ │ │ │ │ ├── sib_nya.yaml │ │ │ │ │ ├── sib_plt.yaml │ │ │ │ │ ├── sib_por.yaml │ │ │ │ │ ├── sib_run.yaml │ │ │ │ │ ├── sib_sag.yaml │ │ │ │ │ ├── sib_sna.yaml │ │ │ │ │ ├── sib_som.yaml │ │ │ │ │ ├── sib_sot.yaml │ │ │ │ │ ├── sib_ssw.yaml │ │ │ │ │ ├── sib_swa.yaml │ │ │ │ │ ├── sib_taq.yaml │ │ │ │ │ ├── sib_tir.yaml │ │ │ │ │ ├── sib_tso.yaml │ │ │ │ │ ├── sib_tum.yaml │ │ │ │ │ ├── sib_twi.yaml │ │ │ │ │ ├── sib_tzm.yaml │ │ │ │ │ ├── sib_umb.yaml │ │ │ │ │ ├── sib_wol.yaml │ │ │ │ │ ├── sib_xho.yaml │ │ │ │ │ ├── sib_yor.yaml │ │ │ │ │ ├── sib_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── sib │ │ │ │ │ ├── sib_aeb.yaml │ │ │ │ │ ├── sib_afr.yaml │ │ │ │ │ ├── sib_aka.yaml │ │ │ │ │ ├── sib_amh.yaml │ │ │ │ │ ├── sib_ary.yaml │ │ │ │ │ ├── sib_arz.yaml │ │ │ │ │ ├── sib_bam.yaml │ │ │ │ │ ├── sib_bem.yaml │ │ │ │ │ ├── sib_cjk.yaml │ │ │ │ │ ├── sib_dik.yaml │ │ │ │ │ ├── sib_dyu.yaml │ │ │ │ │ ├── sib_eng.yaml │ │ │ │ │ ├── sib_ewe.yaml │ │ │ │ │ ├── sib_fon.yaml │ │ │ │ │ ├── sib_fra.yaml │ │ │ │ │ ├── sib_fuv.yaml │ │ │ │ │ ├── sib_gaz.yaml │ │ │ │ │ ├── sib_hau.yaml │ │ │ │ │ ├── sib_ibo.yaml │ │ │ │ │ ├── sib_kab.yaml │ │ │ │ │ ├── sib_kam.yaml │ │ │ │ │ ├── sib_kbp.yaml │ │ │ │ │ ├── sib_kea.yaml │ │ │ │ │ ├── sib_kik.yaml │ │ │ │ │ ├── sib_kin.yaml │ │ │ │ │ ├── sib_kmb.yaml │ │ │ │ │ ├── sib_knc.yaml │ │ │ │ │ ├── sib_kon.yaml │ │ │ │ │ ├── sib_lin.yaml │ │ │ │ │ ├── sib_lua.yaml │ │ │ │ │ ├── sib_lug.yaml │ │ │ │ │ ├── sib_luo.yaml │ │ │ │ │ ├── sib_mos.yaml │ │ │ │ │ ├── sib_nso.yaml │ │ │ │ │ ├── sib_nus.yaml │ │ │ │ │ ├── sib_nya.yaml │ │ │ │ │ ├── sib_plt.yaml │ │ │ │ │ ├── sib_por.yaml │ │ │ │ │ ├── sib_run.yaml │ │ │ │ │ ├── sib_sag.yaml │ │ │ │ │ ├── sib_sna.yaml │ │ │ │ │ ├── sib_som.yaml │ │ │ │ │ ├── sib_sot.yaml │ │ │ │ │ ├── sib_ssw.yaml │ │ │ │ │ ├── sib_swa.yaml │ │ │ │ │ ├── sib_taq.yaml │ │ │ │ │ ├── sib_tir.yaml │ │ │ │ │ ├── sib_tso.yaml │ │ │ │ │ ├── sib_tum.yaml │ │ │ 
│ │ ├── sib_twi.yaml │ │ │ │ │ ├── sib_tzm.yaml │ │ │ │ │ ├── sib_umb.yaml │ │ │ │ │ ├── sib_wol.yaml │ │ │ │ │ ├── sib_xho.yaml │ │ │ │ │ ├── sib_yor.yaml │ │ │ │ │ ├── sib_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── sib │ │ │ │ │ ├── sib_aeb.yaml │ │ │ │ │ ├── sib_afr.yaml │ │ │ │ │ ├── sib_aka.yaml │ │ │ │ │ ├── sib_amh.yaml │ │ │ │ │ ├── sib_ary.yaml │ │ │ │ │ ├── sib_arz.yaml │ │ │ │ │ ├── sib_bam.yaml │ │ │ │ │ ├── sib_bem.yaml │ │ │ │ │ ├── sib_cjk.yaml │ │ │ │ │ ├── sib_dik.yaml │ │ │ │ │ ├── sib_dyu.yaml │ │ │ │ │ ├── sib_eng.yaml │ │ │ │ │ ├── sib_ewe.yaml │ │ │ │ │ ├── sib_fon.yaml │ │ │ │ │ ├── sib_fra.yaml │ │ │ │ │ ├── sib_fuv.yaml │ │ │ │ │ ├── sib_gaz.yaml │ │ │ │ │ ├── sib_hau.yaml │ │ │ │ │ ├── sib_ibo.yaml │ │ │ │ │ ├── sib_kab.yaml │ │ │ │ │ ├── sib_kam.yaml │ │ │ │ │ ├── sib_kbp.yaml │ │ │ │ │ ├── sib_kea.yaml │ │ │ │ │ ├── sib_kik.yaml │ │ │ │ │ ├── sib_kin.yaml │ │ │ │ │ ├── sib_kmb.yaml │ │ │ │ │ ├── sib_knc.yaml │ │ │ │ │ ├── sib_kon.yaml │ │ │ │ │ ├── sib_lin.yaml │ │ │ │ │ ├── sib_lua.yaml │ │ │ │ │ ├── sib_lug.yaml │ │ │ │ │ ├── sib_luo.yaml │ │ │ │ │ ├── sib_mos.yaml │ │ │ │ │ ├── sib_nso.yaml │ │ │ │ │ ├── sib_nus.yaml │ │ │ │ │ ├── sib_nya.yaml │ │ │ │ │ ├── sib_plt.yaml │ │ │ │ │ ├── sib_por.yaml │ │ │ │ │ ├── sib_run.yaml │ │ │ │ │ ├── sib_sag.yaml │ │ │ │ │ ├── sib_sna.yaml │ │ │ │ │ ├── sib_som.yaml │ │ │ │ │ ├── sib_sot.yaml │ │ │ │ │ ├── sib_ssw.yaml │ │ │ │ │ ├── sib_swa.yaml │ │ │ │ │ ├── sib_taq.yaml │ │ │ │ │ ├── sib_tir.yaml │ │ │ │ │ ├── sib_tso.yaml │ │ │ │ │ ├── sib_tum.yaml │ │ │ │ │ ├── sib_twi.yaml │ │ │ │ │ ├── sib_tzm.yaml │ │ │ │ │ ├── sib_umb.yaml │ │ │ │ │ ├── sib_wol.yaml │ │ │ │ │ ├── sib_xho.yaml │ │ │ │ │ ├── sib_yor.yaml │ │ │ │ │ ├── sib_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_5/ │ │ │ │ │ ├── sib │ │ │ │ │ ├── sib_aeb.yaml │ │ │ │ │ ├── sib_afr.yaml │ │ │ │ │ ├── sib_aka.yaml │ │ │ │ │ ├── sib_amh.yaml │ │ │ │ │ ├── sib_ary.yaml │ │ │ │ │ ├── sib_arz.yaml │ │ │ │ │ ├── 
sib_bam.yaml │ │ │ │ │ ├── sib_bem.yaml │ │ │ │ │ ├── sib_cjk.yaml │ │ │ │ │ ├── sib_dik.yaml │ │ │ │ │ ├── sib_dyu.yaml │ │ │ │ │ ├── sib_eng.yaml │ │ │ │ │ ├── sib_ewe.yaml │ │ │ │ │ ├── sib_fon.yaml │ │ │ │ │ ├── sib_fra.yaml │ │ │ │ │ ├── sib_fuv.yaml │ │ │ │ │ ├── sib_gaz.yaml │ │ │ │ │ ├── sib_hau.yaml │ │ │ │ │ ├── sib_ibo.yaml │ │ │ │ │ ├── sib_kab.yaml │ │ │ │ │ ├── sib_kam.yaml │ │ │ │ │ ├── sib_kbp.yaml │ │ │ │ │ ├── sib_kea.yaml │ │ │ │ │ ├── sib_kik.yaml │ │ │ │ │ ├── sib_kin.yaml │ │ │ │ │ ├── sib_kmb.yaml │ │ │ │ │ ├── sib_knc.yaml │ │ │ │ │ ├── sib_kon.yaml │ │ │ │ │ ├── sib_lin.yaml │ │ │ │ │ ├── sib_lua.yaml │ │ │ │ │ ├── sib_lug.yaml │ │ │ │ │ ├── sib_luo.yaml │ │ │ │ │ ├── sib_mos.yaml │ │ │ │ │ ├── sib_nso.yaml │ │ │ │ │ ├── sib_nus.yaml │ │ │ │ │ ├── sib_nya.yaml │ │ │ │ │ ├── sib_plt.yaml │ │ │ │ │ ├── sib_por.yaml │ │ │ │ │ ├── sib_run.yaml │ │ │ │ │ ├── sib_sag.yaml │ │ │ │ │ ├── sib_sna.yaml │ │ │ │ │ ├── sib_som.yaml │ │ │ │ │ ├── sib_sot.yaml │ │ │ │ │ ├── sib_ssw.yaml │ │ │ │ │ ├── sib_swa.yaml │ │ │ │ │ ├── sib_taq.yaml │ │ │ │ │ ├── sib_tir.yaml │ │ │ │ │ ├── sib_tso.yaml │ │ │ │ │ ├── sib_tum.yaml │ │ │ │ │ ├── sib_twi.yaml │ │ │ │ │ ├── sib_tzm.yaml │ │ │ │ │ ├── sib_umb.yaml │ │ │ │ │ ├── sib_wol.yaml │ │ │ │ │ ├── sib_xho.yaml │ │ │ │ │ ├── sib_yor.yaml │ │ │ │ │ ├── sib_zul.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── sib.yaml │ │ │ │ └── utils.py │ │ │ ├── uhura-arc-easy/ │ │ │ │ ├── README.md │ │ │ │ ├── prompt_1/ │ │ │ │ │ ├── uhura-arc-easy │ │ │ │ │ ├── uhura-arc-easy_am.yaml │ │ │ │ │ ├── uhura-arc-easy_en.yaml │ │ │ │ │ ├── uhura-arc-easy_ha.yaml │ │ │ │ │ ├── uhura-arc-easy_nso.yaml │ │ │ │ │ ├── uhura-arc-easy_sw.yaml │ │ │ │ │ ├── uhura-arc-easy_yo.yaml │ │ │ │ │ ├── uhura-arc-easy_zu.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_2/ │ │ │ │ │ ├── uhura-arc-easy │ │ │ │ │ ├── uhura-arc-easy_am.yaml │ │ │ │ │ ├── uhura-arc-easy_en.yaml │ │ │ │ │ ├── uhura-arc-easy_ha.yaml │ │ │ │ │ ├── uhura-arc-easy_nso.yaml │ │ │ │ │ ├── 
uhura-arc-easy_sw.yaml │ │ │ │ │ ├── uhura-arc-easy_yo.yaml │ │ │ │ │ ├── uhura-arc-easy_zu.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_3/ │ │ │ │ │ ├── uhura-arc-easy │ │ │ │ │ ├── uhura-arc-easy_am.yaml │ │ │ │ │ ├── uhura-arc-easy_en.yaml │ │ │ │ │ ├── uhura-arc-easy_ha.yaml │ │ │ │ │ ├── uhura-arc-easy_nso.yaml │ │ │ │ │ ├── uhura-arc-easy_sw.yaml │ │ │ │ │ ├── uhura-arc-easy_yo.yaml │ │ │ │ │ ├── uhura-arc-easy_zu.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_4/ │ │ │ │ │ ├── uhura-arc-easy │ │ │ │ │ ├── uhura-arc-easy_am.yaml │ │ │ │ │ ├── uhura-arc-easy_en.yaml │ │ │ │ │ ├── uhura-arc-easy_ha.yaml │ │ │ │ │ ├── uhura-arc-easy_nso.yaml │ │ │ │ │ ├── uhura-arc-easy_sw.yaml │ │ │ │ │ ├── uhura-arc-easy_yo.yaml │ │ │ │ │ ├── uhura-arc-easy_zu.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── prompt_5/ │ │ │ │ │ ├── uhura-arc-easy │ │ │ │ │ ├── uhura-arc-easy_am.yaml │ │ │ │ │ ├── uhura-arc-easy_en.yaml │ │ │ │ │ ├── uhura-arc-easy_ha.yaml │ │ │ │ │ ├── uhura-arc-easy_nso.yaml │ │ │ │ │ ├── uhura-arc-easy_sw.yaml │ │ │ │ │ ├── uhura-arc-easy_yo.yaml │ │ │ │ │ ├── uhura-arc-easy_zu.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── uhura.yaml │ │ │ │ └── utils.py │ │ │ └── xlsum/ │ │ │ ├── README.md │ │ │ ├── prompt_1/ │ │ │ │ ├── utils.py │ │ │ │ ├── xlsum │ │ │ │ ├── xlsum_amharic.yaml │ │ │ │ ├── xlsum_arabic.yaml │ │ │ │ ├── xlsum_hausa.yaml │ │ │ │ ├── xlsum_igbo.yaml │ │ │ │ ├── xlsum_kirundi.yaml │ │ │ │ ├── xlsum_oromo.yaml │ │ │ │ ├── xlsum_pidgin.yaml │ │ │ │ ├── xlsum_somali.yaml │ │ │ │ ├── xlsum_swahili.yaml │ │ │ │ ├── xlsum_telugu.yaml │ │ │ │ ├── xlsum_tigrinya.yaml │ │ │ │ └── xlsum_yoruba.yaml │ │ │ ├── prompt_2/ │ │ │ │ ├── utils.py │ │ │ │ ├── xlsum │ │ │ │ ├── xlsum_amharic.yaml │ │ │ │ ├── xlsum_arabic.yaml │ │ │ │ ├── xlsum_hausa.yaml │ │ │ │ ├── xlsum_igbo.yaml │ │ │ │ ├── xlsum_kirundi.yaml │ │ │ │ ├── xlsum_oromo.yaml │ │ │ │ ├── xlsum_pidgin.yaml │ │ │ │ ├── xlsum_somali.yaml │ │ │ │ ├── xlsum_swahili.yaml │ │ │ │ ├── xlsum_telugu.yaml │ │ │ │ ├── 
xlsum_tigrinya.yaml │ │ │ │ └── xlsum_yoruba.yaml │ │ │ ├── prompt_3/ │ │ │ │ ├── utils.py │ │ │ │ ├── xlsum │ │ │ │ ├── xlsum_amharic.yaml │ │ │ │ ├── xlsum_arabic.yaml │ │ │ │ ├── xlsum_hausa.yaml │ │ │ │ ├── xlsum_igbo.yaml │ │ │ │ ├── xlsum_kirundi.yaml │ │ │ │ ├── xlsum_oromo.yaml │ │ │ │ ├── xlsum_pidgin.yaml │ │ │ │ ├── xlsum_somali.yaml │ │ │ │ ├── xlsum_swahili.yaml │ │ │ │ ├── xlsum_telugu.yaml │ │ │ │ ├── xlsum_tigrinya.yaml │ │ │ │ └── xlsum_yoruba.yaml │ │ │ ├── utils.py │ │ │ └── xlsum.yaml │ │ ├── agieval/ │ │ │ ├── README.md │ │ │ ├── agieval.yaml │ │ │ ├── agieval_cn.yaml │ │ │ ├── agieval_en.yaml │ │ │ ├── agieval_nous.yaml │ │ │ ├── aqua-rat.yaml │ │ │ ├── gaokao-biology.yaml │ │ │ ├── gaokao-chemistry.yaml │ │ │ ├── gaokao-chinese.yaml │ │ │ ├── gaokao-english.yaml │ │ │ ├── gaokao-geography.yaml │ │ │ ├── gaokao-history.yaml │ │ │ ├── gaokao-mathcloze.yaml │ │ │ ├── gaokao-mathqa.yaml │ │ │ ├── gaokao-physics.yaml │ │ │ ├── jec-qa-ca.yaml │ │ │ ├── jec-qa-kd.yaml │ │ │ ├── logiqa-en.yaml │ │ │ ├── logiqa-zh.yaml │ │ │ ├── lsat-ar.yaml │ │ │ ├── lsat-lr.yaml │ │ │ ├── lsat-rc.yaml │ │ │ ├── math.yaml │ │ │ ├── sat-en-without-passage.yaml │ │ │ ├── sat-en.yaml │ │ │ ├── sat-math.yaml │ │ │ └── utils.py │ │ ├── aime/ │ │ │ ├── README.md │ │ │ ├── aime.yaml │ │ │ ├── aime24.yaml │ │ │ ├── aime25.yaml │ │ │ └── utils.py │ │ ├── alghafa/ │ │ │ ├── copa_ar/ │ │ │ │ ├── README.md │ │ │ │ └── copa_ar.yaml │ │ │ └── piqa_ar/ │ │ │ ├── README.md │ │ │ └── piqa_ar.yaml │ │ ├── anli/ │ │ │ ├── README.md │ │ │ ├── anli_r1.yaml │ │ │ ├── anli_r2.yaml │ │ │ └── anli_r3.yaml │ │ ├── arab_culture/ │ │ │ ├── README.md │ │ │ ├── _arab_culture.yaml │ │ │ ├── _arab_culture_gulf.yaml │ │ │ ├── _arab_culture_levant.yaml │ │ │ ├── _arab_culture_nile_valley.yaml │ │ │ ├── _arab_culture_north_africa.yaml │ │ │ ├── _default_arab_culture_mcq_template_yaml │ │ │ ├── _generate_configs.py │ │ │ ├── arab_culture_algeria.yaml │ │ │ ├── arab_culture_egypt.yaml │ │ │ ├── 
arab_culture_jordan.yaml │ │ │ ├── arab_culture_ksa.yaml │ │ │ ├── arab_culture_lebanon.yaml │ │ │ ├── arab_culture_libya.yaml │ │ │ ├── arab_culture_morocco.yaml │ │ │ ├── arab_culture_palestine.yaml │ │ │ ├── arab_culture_sudan.yaml │ │ │ ├── arab_culture_syria.yaml │ │ │ ├── arab_culture_tunisia.yaml │ │ │ ├── arab_culture_uae.yaml │ │ │ ├── arab_culture_yemen.yaml │ │ │ ├── prompts.py │ │ │ └── utils_mcq.py │ │ ├── arab_culture_completion/ │ │ │ ├── README.md │ │ │ ├── _arab_culture_completion.yaml │ │ │ ├── _arab_culture_completion_gulf.yaml │ │ │ ├── _arab_culture_completion_levant.yaml │ │ │ ├── _arab_culture_completion_nile_valley.yaml │ │ │ ├── _arab_culture_completion_north_africa.yaml │ │ │ ├── _default_arab_culture_completion_template_yaml │ │ │ ├── _generate_configs.py │ │ │ ├── arab_culture_completion_algeria.yaml │ │ │ ├── arab_culture_completion_egypt.yaml │ │ │ ├── arab_culture_completion_jordan.yaml │ │ │ ├── arab_culture_completion_ksa.yaml │ │ │ ├── arab_culture_completion_lebanon.yaml │ │ │ ├── arab_culture_completion_libya.yaml │ │ │ ├── arab_culture_completion_morocco.yaml │ │ │ ├── arab_culture_completion_palestine.yaml │ │ │ ├── arab_culture_completion_sudan.yaml │ │ │ ├── arab_culture_completion_syria.yaml │ │ │ ├── arab_culture_completion_tunisia.yaml │ │ │ ├── arab_culture_completion_uae.yaml │ │ │ ├── arab_culture_completion_yemen.yaml │ │ │ ├── prompts.py │ │ │ └── utils_completion.py │ │ ├── arabic_leaderboard_complete/ │ │ │ ├── README.md │ │ │ ├── arabic_leaderboard_alghafa/ │ │ │ │ ├── arabic_leaderboard_alghafa.yaml │ │ │ │ ├── arabic_leaderboard_alghafa_mcq_exams_test_ar.yaml │ │ │ │ ├── arabic_leaderboard_alghafa_meta_ar_dialects.yaml │ │ │ │ ├── arabic_leaderboard_alghafa_meta_ar_msa.yaml │ │ │ │ ├── arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task.yaml │ │ │ │ ├── arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task.yaml │ │ │ │ ├── 
arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task.yaml │ │ │ │ ├── arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task.yaml │ │ │ │ ├── arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task.yaml │ │ │ │ ├── arabic_leaderboard_alghafa_multiple_choice_sentiment_task.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_exams/ │ │ │ │ ├── arabic_exams.yaml │ │ │ │ ├── arabic_leaderboard_arabic_exams.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mmlu/ │ │ │ │ ├── arabic_leaderboard_arabic_mmlu.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_abstract_algebra.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_anatomy.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_astronomy.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_business_ethics.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_clinical_knowledge.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_college_biology.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_college_chemistry.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_college_computer_science.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_college_mathematics.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_college_medicine.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_college_physics.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_computer_security.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_conceptual_physics.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_econometrics.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_electrical_engineering.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_elementary_mathematics.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_formal_logic.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_global_facts.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_biology.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_chemistry.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_computer_science.yaml │ │ │ │ ├── 
arabic_leaderboard_arabic_mmlu_high_school_european_history.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_geography.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_government_and_politics.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_macroeconomics.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_mathematics.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_microeconomics.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_physics.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_psychology.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_statistics.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_us_history.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_world_history.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_human_aging.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_human_sexuality.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_international_law.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_jurisprudence.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_logical_fallacies.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_machine_learning.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_management.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_marketing.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_medical_genetics.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_miscellaneous.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_moral_disputes.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_moral_scenarios.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_nutrition.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_philosophy.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_prehistory.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_professional_accounting.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_professional_law.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_professional_medicine.yaml │ │ │ │ ├── 
arabic_leaderboard_arabic_mmlu_professional_psychology.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_public_relations.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_security_studies.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_sociology.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_us_foreign_policy.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_virology.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_arc_challenge/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_arc_challenge.yaml │ │ │ │ ├── arabic_mt_arc_challenge.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_arc_easy/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_arc_easy.yaml │ │ │ │ ├── arabic_mt_arc_easy.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_boolq/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_boolq.yaml │ │ │ │ ├── arabic_mt_boolq.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_copa/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_copa.yaml │ │ │ │ ├── arabic_mt_copa.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_hellaswag/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_hellaswag.yaml │ │ │ │ ├── arabic_mt_hellaswag.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_mmlu/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_mmlu.yaml │ │ │ │ ├── arabic_mt_mmlu.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_openbook_qa/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_openbook_qa.yaml │ │ │ │ ├── arabic_mt_openbook_qa.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_piqa/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_piqa.yaml │ │ │ │ ├── arabic_mt_piqa.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_race/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_race.yaml │ │ │ │ ├── arabic_mt_race.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_sciq/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_sciq.yaml │ │ │ │ ├── 
arabic_mt_sciq.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_toxigen/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_toxigen.yaml │ │ │ │ ├── arabic_mt_toxigen.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_avca/ │ │ │ │ ├── arabic_leaderboard_acva.yaml │ │ │ │ ├── arabic_leaderboard_acva_Algeria.yaml │ │ │ │ ├── arabic_leaderboard_acva_Ancient_Egypt.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arab_Empire.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Architecture.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Art.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Astronomy.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Calligraphy.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Ceremony.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Clothing.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Culture.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Food.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Funeral.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Geography.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_History.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Language_Origin.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Literature.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Math.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Medicine.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Music.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Ornament.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Philosophy.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Physics_and_Chemistry.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Wedding.yaml │ │ │ │ ├── arabic_leaderboard_acva_Bahrain.yaml │ │ │ │ ├── arabic_leaderboard_acva_Comoros.yaml │ │ │ │ ├── arabic_leaderboard_acva_Egypt_modern.yaml │ │ │ │ ├── arabic_leaderboard_acva_InfluenceFromAncientEgypt.yaml │ │ │ │ ├── arabic_leaderboard_acva_InfluenceFromByzantium.yaml │ │ │ │ ├── arabic_leaderboard_acva_InfluenceFromChina.yaml │ │ │ │ ├── arabic_leaderboard_acva_InfluenceFromGreece.yaml │ │ │ │ 
├── arabic_leaderboard_acva_InfluenceFromIslam.yaml │ │ │ │ ├── arabic_leaderboard_acva_InfluenceFromPersia.yaml │ │ │ │ ├── arabic_leaderboard_acva_InfluenceFromRome.yaml │ │ │ │ ├── arabic_leaderboard_acva_Iraq.yaml │ │ │ │ ├── arabic_leaderboard_acva_Islam_Education.yaml │ │ │ │ ├── arabic_leaderboard_acva_Islam_branches_and_schools.yaml │ │ │ │ ├── arabic_leaderboard_acva_Islamic_law_system.yaml │ │ │ │ ├── arabic_leaderboard_acva_Jordan.yaml │ │ │ │ ├── arabic_leaderboard_acva_Kuwait.yaml │ │ │ │ ├── arabic_leaderboard_acva_Lebanon.yaml │ │ │ │ ├── arabic_leaderboard_acva_Libya.yaml │ │ │ │ ├── arabic_leaderboard_acva_Mauritania.yaml │ │ │ │ ├── arabic_leaderboard_acva_Mesopotamia_civilization.yaml │ │ │ │ ├── arabic_leaderboard_acva_Morocco.yaml │ │ │ │ ├── arabic_leaderboard_acva_Oman.yaml │ │ │ │ ├── arabic_leaderboard_acva_Palestine.yaml │ │ │ │ ├── arabic_leaderboard_acva_Qatar.yaml │ │ │ │ ├── arabic_leaderboard_acva_Saudi_Arabia.yaml │ │ │ │ ├── arabic_leaderboard_acva_Somalia.yaml │ │ │ │ ├── arabic_leaderboard_acva_Sudan.yaml │ │ │ │ ├── arabic_leaderboard_acva_Syria.yaml │ │ │ │ ├── arabic_leaderboard_acva_Tunisia.yaml │ │ │ │ ├── arabic_leaderboard_acva_United_Arab_Emirates.yaml │ │ │ │ ├── arabic_leaderboard_acva_Yemen.yaml │ │ │ │ ├── arabic_leaderboard_acva_communication.yaml │ │ │ │ ├── arabic_leaderboard_acva_computer_and_phone.yaml │ │ │ │ ├── arabic_leaderboard_acva_daily_life.yaml │ │ │ │ ├── arabic_leaderboard_acva_entertainment.yaml │ │ │ │ └── utils.py │ │ │ └── arabic_leaderboard_complete.yaml │ │ ├── arabic_leaderboard_light/ │ │ │ ├── README.md │ │ │ ├── arabic_leaderboard_alghafa_light/ │ │ │ │ ├── arabic_leaderboard_alghafa_light.yaml │ │ │ │ ├── arabic_leaderboard_alghafa_mcq_exams_test_ar_light.yaml │ │ │ │ ├── arabic_leaderboard_alghafa_meta_ar_dialects_light.yaml │ │ │ │ ├── arabic_leaderboard_alghafa_meta_ar_msa_light.yaml │ │ │ │ ├── arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task_light.yaml │ │ │ │ ├── 
arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task_light.yaml │ │ │ │ ├── arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task_light.yaml │ │ │ │ ├── arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task_light.yaml │ │ │ │ ├── arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task_light.yaml │ │ │ │ ├── arabic_leaderboard_alghafa_multiple_choice_sentiment_task_light.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_exams_light/ │ │ │ │ ├── arabic_exams_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_exams_light.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mmlu_light/ │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_abstract_algebra_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_anatomy_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_astronomy_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_business_ethics_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_clinical_knowledge_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_college_biology_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_college_chemistry_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_college_computer_science_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_college_mathematics_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_college_medicine_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_college_physics_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_computer_security_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_conceptual_physics_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_econometrics_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_electrical_engineering_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_elementary_mathematics_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_formal_logic_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_global_facts_light.yaml │ │ │ │ ├── 
arabic_leaderboard_arabic_mmlu_high_school_biology_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_chemistry_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_computer_science_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_european_history_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_geography_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_government_and_politics_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_macroeconomics_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_mathematics_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_microeconomics_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_physics_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_psychology_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_statistics_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_us_history_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_high_school_world_history_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_human_aging_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_human_sexuality_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_international_law_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_jurisprudence_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_logical_fallacies_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_machine_learning_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_management_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_marketing_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_medical_genetics_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_miscellaneous_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_moral_disputes_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_moral_scenarios_light.yaml │ │ │ │ ├── 
arabic_leaderboard_arabic_mmlu_nutrition_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_philosophy_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_prehistory_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_professional_accounting_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_professional_law_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_professional_medicine_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_professional_psychology_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_public_relations_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_security_studies_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_sociology_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_us_foreign_policy_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_virology_light.yaml │ │ │ │ ├── arabic_leaderboard_arabic_mmlu_world_religions_light.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_arc_challenge_light/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_arc_challenge_light.yaml │ │ │ │ ├── arabic_mt_arc_challenge_light.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_arc_easy_light/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_arc_easy_light.yaml │ │ │ │ ├── arabic_mt_arc_easy_light.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_boolq_light/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_boolq_light.yaml │ │ │ │ ├── arabic_mt_boolq_light.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_copa_light/ │ │ │ │ ├── arabic_mt_copa_light.yaml │ │ │ │ ├── arbic_leaderboard_arabic_mt_copa_light.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_hellaswag_light/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_hellaswag_light.yaml │ │ │ │ ├── arabic_mt_hellaswag_light.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_mmlu_light/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_mmlu_light.yaml │ │ │ │ ├── arabic_mt_mmlu_light.yaml │ │ │ │ └── utils.py │ │ │ ├── 
arabic_leaderboard_arabic_mt_openbook_qa_light/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_openbook_qa_light.yaml │ │ │ │ ├── arabic_mt_openbook_qa_light.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_piqa_light/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_piqa_light.yaml │ │ │ │ ├── arabic_mt_piqa_light.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_race_light/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_race_light.yaml │ │ │ │ ├── arabic_mt_race_light.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_sciq_light/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_sciq_light.yaml │ │ │ │ ├── arabic_mt_sciq_light.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_arabic_mt_toxigen_light/ │ │ │ │ ├── arabic_leaderboard_arabic_mt_toxigen_light.yaml │ │ │ │ ├── arabic_mt_toxigen_light.yaml │ │ │ │ └── utils.py │ │ │ ├── arabic_leaderboard_avca_light/ │ │ │ │ ├── arabic_leaderboard_acva_Algeria_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Ancient_Egypt_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arab_Empire_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Architecture_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Art_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Astronomy_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Calligraphy_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Ceremony_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Clothing_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Culture_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Food_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Funeral_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Geography_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_History_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Language_Origin_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Literature_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Math_light.yaml │ │ │ │ ├── 
arabic_leaderboard_acva_Arabic_Medicine_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Music_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Ornament_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Philosophy_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Physics_and_Chemistry_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Arabic_Wedding_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Bahrain_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Comoros_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Egypt_modern_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_InfluenceFromAncientEgypt_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_InfluenceFromByzantium_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_InfluenceFromChina_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_InfluenceFromGreece_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_InfluenceFromIslam_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_InfluenceFromPersia_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_InfluenceFromRome_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Iraq_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Islam_Education_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Islam_branches_and_schools_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Islamic_law_system_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Jordan_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Kuwait_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Lebanon_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Libya_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Mauritania_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Mesopotamia_civilization_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Morocco_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Oman_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Palestine_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Qatar_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Saudi_Arabia_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Somalia_light.yaml │ │ │ │ ├── 
arabic_leaderboard_acva_Sudan_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Syria_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Tunisia_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_United_Arab_Emirates_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_Yemen_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_communication_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_computer_and_phone_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_daily_life_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_entertainment_light.yaml │ │ │ │ ├── arabic_leaderboard_acva_light.yaml │ │ │ │ └── utils.py │ │ │ └── arabic_leaderboard_light.yaml │ │ ├── arabicmmlu/ │ │ │ ├── README.md │ │ │ ├── _arabicmmlu.yaml │ │ │ ├── _arabicmmlu_humanities.yaml │ │ │ ├── _arabicmmlu_language.yaml │ │ │ ├── _arabicmmlu_other.yaml │ │ │ ├── _arabicmmlu_social_science.yaml │ │ │ ├── _arabicmmlu_stem.yaml │ │ │ ├── _default_arabicmmlu_template_yaml │ │ │ ├── _generate_configs.py │ │ │ ├── arabicmmlu_accounting_university.yaml │ │ │ ├── arabicmmlu_arabic_language_general.yaml │ │ │ ├── arabicmmlu_arabic_language_grammar.yaml │ │ │ ├── arabicmmlu_arabic_language_high_school.yaml │ │ │ ├── arabicmmlu_arabic_language_middle_school.yaml │ │ │ ├── arabicmmlu_arabic_language_primary_school.yaml │ │ │ ├── arabicmmlu_biology_high_school.yaml │ │ │ ├── arabicmmlu_civics_high_school.yaml │ │ │ ├── arabicmmlu_civics_middle_school.yaml │ │ │ ├── arabicmmlu_computer_science_high_school.yaml │ │ │ ├── arabicmmlu_computer_science_middle_school.yaml │ │ │ ├── arabicmmlu_computer_science_primary_school.yaml │ │ │ ├── arabicmmlu_computer_science_university.yaml │ │ │ ├── arabicmmlu_driving_test.yaml │ │ │ ├── arabicmmlu_economics_high_school.yaml │ │ │ ├── arabicmmlu_economics_middle_school.yaml │ │ │ ├── arabicmmlu_economics_university.yaml │ │ │ ├── arabicmmlu_general_knowledge.yaml │ │ │ ├── arabicmmlu_general_knowledge_middle_school.yaml │ │ │ ├── arabicmmlu_general_knowledge_primary_school.yaml │ │ │ ├── 
arabicmmlu_geography_high_school.yaml │ │ │ ├── arabicmmlu_geography_middle_school.yaml │ │ │ ├── arabicmmlu_geography_primary_school.yaml │ │ │ ├── arabicmmlu_history_high_school.yaml │ │ │ ├── arabicmmlu_history_middle_school.yaml │ │ │ ├── arabicmmlu_history_primary_school.yaml │ │ │ ├── arabicmmlu_islamic_studies.yaml │ │ │ ├── arabicmmlu_islamic_studies_high_school.yaml │ │ │ ├── arabicmmlu_islamic_studies_middle_school.yaml │ │ │ ├── arabicmmlu_islamic_studies_primary_school.yaml │ │ │ ├── arabicmmlu_law_professional.yaml │ │ │ ├── arabicmmlu_management_university.yaml │ │ │ ├── arabicmmlu_math_primary_school.yaml │ │ │ ├── arabicmmlu_natural_science_middle_school.yaml │ │ │ ├── arabicmmlu_natural_science_primary_school.yaml │ │ │ ├── arabicmmlu_philosophy_high_school.yaml │ │ │ ├── arabicmmlu_physics_high_school.yaml │ │ │ ├── arabicmmlu_political_science_university.yaml │ │ │ ├── arabicmmlu_social_science_middle_school.yaml │ │ │ ├── arabicmmlu_social_science_primary_school.yaml │ │ │ └── utils.py │ │ ├── aradice/ │ │ │ ├── ArabicMMLU/ │ │ │ │ ├── EGY/ │ │ │ │ │ ├── AraDiCE_ArabicMMLU.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_high_humanities_history.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_high_humanities_islamic-studies.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_high_humanities_philosophy.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_high_language_arabic-language.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_high_social-science_civics.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_high_social-science_economics.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_high_social-science_geography.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_high_stem_biology.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_high_stem_computer-science.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_high_stem_physics.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_humanities_history.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_humanities_islamic-studies.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_language_arabic-language.yaml │ │ │ │ │ ├── 
AraDiCE_ArabicMMLU_middle_other_general-knowledge.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_social-science_civics.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_social-science_economics.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_social-science_geography.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_social-science_social-science.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_stem_computer-science.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_stem_natural-science.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_na_humanities_islamic-studies.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_na_language_arabic-language-general.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_na_language_arabic-language-grammar.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_na_other_driving-test.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_na_other_general-knowledge.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_humanities_history.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_humanities_islamic-studies.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_language_arabic-language.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_other_general-knowledge.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_social-science_geography.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_social-science_social-science.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_stem_computer-science.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_stem_math.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_stem_natural-science.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_prof_humanities_law.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_univ_other_management.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_univ_social-science_accounting.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_univ_social-science_economics.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_univ_social-science_political-science.yaml │ │ │ │ │ ├── AraDiCE_ArabicMMLU_univ_stem_computer-science.yaml │ │ │ │ │ ├── _default_template_yaml │ │ │ │ │ ├── metrics.py │ │ │ │ │ └── utils.py │ │ │ │ └── LEV/ │ │ │ │ ├── AraDiCE_ArabicMMLU.yaml │ │ │ │ ├── 
AraDiCE_ArabicMMLU_high_humanities_history.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_high_humanities_islamic-studies.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_high_humanities_philosophy.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_high_language_arabic-language.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_high_social-science_civics.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_high_social-science_economics.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_high_social-science_geography.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_high_stem_biology.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_high_stem_computer-science.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_high_stem_physics.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_humanities_history.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_humanities_islamic-studies.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_language_arabic-language.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_other_general-knowledge.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_social-science_civics.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_social-science_economics.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_social-science_geography.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_social-science_social-science.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_stem_computer-science.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_middle_stem_natural-science.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_na_humanities_islamic-studies.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_na_language_arabic-language-general.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_na_language_arabic-language-grammar.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_na_other_driving-test.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_na_other_general-knowledge.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_humanities_history.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_humanities_islamic-studies.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_language_arabic-language.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_other_general-knowledge.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_social-science_geography.yaml │ │ │ │ ├── 
AraDiCE_ArabicMMLU_primary_social-science_social-science.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_stem_computer-science.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_stem_math.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_primary_stem_natural-science.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_prof_humanities_law.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_univ_other_management.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_univ_social-science_accounting.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_univ_social-science_economics.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_univ_social-science_political-science.yaml │ │ │ │ ├── AraDiCE_ArabicMMLU_univ_stem_computer-science.yaml │ │ │ │ ├── _default_template_yaml │ │ │ │ ├── metrics.py │ │ │ │ └── utils.py │ │ │ ├── README.md │ │ │ ├── aradice.yaml │ │ │ ├── boolq/ │ │ │ │ ├── EGY/ │ │ │ │ │ ├── boolq_egy.yaml │ │ │ │ │ ├── metrics.py │ │ │ │ │ └── utils.py │ │ │ │ ├── ENG/ │ │ │ │ │ ├── boolq_eng.yaml │ │ │ │ │ ├── metrics.py │ │ │ │ │ └── utils.py │ │ │ │ ├── LEV/ │ │ │ │ │ ├── boolq_lev.yaml │ │ │ │ │ ├── metrics.py │ │ │ │ │ └── utils.py │ │ │ │ └── MSA/ │ │ │ │ ├── boolq_msa.yaml │ │ │ │ ├── metrics.py │ │ │ │ └── utils.py │ │ │ ├── cultural-benchmark/ │ │ │ │ ├── egypt.yaml │ │ │ │ ├── jordan.yaml │ │ │ │ ├── lebanon.yaml │ │ │ │ ├── metrics.py │ │ │ │ ├── palestine.yaml │ │ │ │ ├── qatar.yaml │ │ │ │ ├── syria.yaml │ │ │ │ └── utils.py │ │ │ ├── openbookqa/ │ │ │ │ ├── metrics.py │ │ │ │ ├── openbookqa_egy.yaml │ │ │ │ ├── openbookqa_eng.yaml │ │ │ │ ├── openbookqa_lev.yaml │ │ │ │ ├── openbookqa_msa.yaml │ │ │ │ └── utils.py │ │ │ ├── piqa/ │ │ │ │ ├── metrics.py │ │ │ │ ├── piqa_egy.yaml │ │ │ │ ├── piqa_eng.yaml │ │ │ │ ├── piqa_lev.yaml │ │ │ │ └── piqa_msa.yaml │ │ │ ├── truthfulqa_mcq/ │ │ │ │ ├── metrics.py │ │ │ │ ├── truthfulqa_mc1_egy.yaml │ │ │ │ ├── truthfulqa_mc1_eng.yaml │ │ │ │ ├── truthfulqa_mc1_lev.yaml │ │ │ │ └── truthfulqa_mc1_msa.yaml │ │ │ └── winogrande/ │ │ │ ├── metrics.py │ │ │ ├── utils.py │ │ │ ├── winogrande_egy.yaml │ │ │ ├── 
winogrande_eng.yaml │ │ │ ├── winogrande_lev.yaml │ │ │ └── winogrande_msa.yaml │ │ ├── arc/ │ │ │ ├── README.md │ │ │ ├── arc_challenge.yaml │ │ │ ├── arc_challenge_chat.yaml │ │ │ └── arc_easy.yaml │ │ ├── arc_mt/ │ │ │ ├── README.md │ │ │ ├── arc_challenge_mt_da.yaml │ │ │ ├── arc_challenge_mt_de.yaml │ │ │ ├── arc_challenge_mt_el.yaml │ │ │ ├── arc_challenge_mt_es.yaml │ │ │ ├── arc_challenge_mt_fi.yaml │ │ │ ├── arc_challenge_mt_hu.yaml │ │ │ ├── arc_challenge_mt_is.yaml │ │ │ ├── arc_challenge_mt_it.yaml │ │ │ ├── arc_challenge_mt_nb.yaml │ │ │ ├── arc_challenge_mt_pl.yaml │ │ │ ├── arc_challenge_mt_pt.yaml │ │ │ └── arc_challenge_mt_sv.yaml │ │ ├── arithmetic/ │ │ │ ├── README.md │ │ │ ├── arithmetic_1dc.yaml │ │ │ ├── arithmetic_2da.yaml │ │ │ ├── arithmetic_2dm.yaml │ │ │ ├── arithmetic_2ds.yaml │ │ │ ├── arithmetic_3da.yaml │ │ │ ├── arithmetic_3ds.yaml │ │ │ ├── arithmetic_4da.yaml │ │ │ ├── arithmetic_4ds.yaml │ │ │ ├── arithmetic_5da.yaml │ │ │ └── arithmetic_5ds.yaml │ │ ├── asdiv/ │ │ │ ├── README.md │ │ │ ├── asdiv-cot-llama.yaml │ │ │ └── default.yaml │ │ ├── babi/ │ │ │ ├── README.md │ │ │ └── babi.yaml │ │ ├── babilong/ │ │ │ ├── README.md │ │ │ ├── _babilong_common_yaml │ │ │ ├── babilong.yaml │ │ │ ├── babilong_longctx.yaml │ │ │ ├── babilong_qa1.yaml │ │ │ ├── babilong_qa10.yaml │ │ │ ├── babilong_qa11.yaml │ │ │ ├── babilong_qa12.yaml │ │ │ ├── babilong_qa13.yaml │ │ │ ├── babilong_qa14.yaml │ │ │ ├── babilong_qa15.yaml │ │ │ ├── babilong_qa16.yaml │ │ │ ├── babilong_qa17.yaml │ │ │ ├── babilong_qa18.yaml │ │ │ ├── babilong_qa19.yaml │ │ │ ├── babilong_qa2.yaml │ │ │ ├── babilong_qa20.yaml │ │ │ ├── babilong_qa3.yaml │ │ │ ├── babilong_qa4.yaml │ │ │ ├── babilong_qa5.yaml │ │ │ ├── babilong_qa6.yaml │ │ │ ├── babilong_qa7.yaml │ │ │ ├── babilong_qa8.yaml │ │ │ ├── babilong_qa9.yaml │ │ │ └── common_utils.py │ │ ├── bangla/ │ │ │ ├── README.md │ │ │ ├── bangla_boolqa.yaml │ │ │ ├── bangla_commonsenseqa.yaml │ │ │ ├── bangla_mmlu.yaml │ │ │ ├── 
bangla_openbookqa.yaml │ │ │ └── bangla_piqa.yaml │ │ ├── basque_bench/ │ │ │ ├── README.md │ │ │ ├── arc_eu_challenge.yaml │ │ │ ├── arc_eu_easy.yaml │ │ │ ├── basque_bench.yaml │ │ │ ├── flores_eu/ │ │ │ │ ├── _flores_common_yaml │ │ │ │ ├── create_yamls_flores_eu.py │ │ │ │ ├── flores_ca-eu.yaml │ │ │ │ ├── flores_de-eu.yaml │ │ │ │ ├── flores_en-eu.yaml │ │ │ │ ├── flores_es-eu.yaml │ │ │ │ ├── flores_eu-ca.yaml │ │ │ │ ├── flores_eu-de.yaml │ │ │ │ ├── flores_eu-en.yaml │ │ │ │ ├── flores_eu-es.yaml │ │ │ │ ├── flores_eu-fr.yaml │ │ │ │ ├── flores_eu-gl.yaml │ │ │ │ ├── flores_eu-it.yaml │ │ │ │ ├── flores_eu-pt.yaml │ │ │ │ ├── flores_eu.yaml │ │ │ │ ├── flores_fr-eu.yaml │ │ │ │ ├── flores_gl-eu.yaml │ │ │ │ ├── flores_it-eu.yaml │ │ │ │ └── flores_pt-eu.yaml │ │ │ ├── mgsm_direct_eu.yaml │ │ │ ├── mgsm_native_cot_eu.yaml │ │ │ ├── paws_eu.yaml │ │ │ ├── piqa_eu.yaml │ │ │ ├── utils.py │ │ │ ├── wnli_eu.yaml │ │ │ └── xcopa_eu.yaml │ │ ├── basqueglue/ │ │ │ ├── README.md │ │ │ ├── bec.yaml │ │ │ ├── bhtc.yaml │ │ │ ├── coref.yaml │ │ │ ├── qnli.yaml │ │ │ ├── utils.py │ │ │ ├── vaxx.yaml │ │ │ └── wic.yaml │ │ ├── bbh/ │ │ │ ├── README.md │ │ │ ├── _generate_configs.py │ │ │ ├── cot_fewshot/ │ │ │ │ ├── _bbh.yaml │ │ │ │ ├── _bbh_cot_fewshot.yaml │ │ │ │ ├── _cot_fewshot_template_yaml │ │ │ │ ├── boolean_expressions.yaml │ │ │ │ ├── causal_judgement.yaml │ │ │ │ ├── date_understanding.yaml │ │ │ │ ├── disambiguation_qa.yaml │ │ │ │ ├── dyck_languages.yaml │ │ │ │ ├── formal_fallacies.yaml │ │ │ │ ├── geometric_shapes.yaml │ │ │ │ ├── hyperbaton.yaml │ │ │ │ ├── logical_deduction_five_objects.yaml │ │ │ │ ├── logical_deduction_seven_objects.yaml │ │ │ │ ├── logical_deduction_three_objects.yaml │ │ │ │ ├── movie_recommendation.yaml │ │ │ │ ├── multistep_arithmetic_two.yaml │ │ │ │ ├── navigate.yaml │ │ │ │ ├── object_counting.yaml │ │ │ │ ├── penguins_in_a_table.yaml │ │ │ │ ├── reasoning_about_colored_objects.yaml │ │ │ │ ├── ruin_names.yaml │ │ │ │ ├── 
salient_translation_error_detection.yaml │ │ │ │ ├── snarks.yaml │ │ │ │ ├── sports_understanding.yaml │ │ │ │ ├── temporal_sequences.yaml │ │ │ │ ├── tracking_shuffled_objects_five_objects.yaml │ │ │ │ ├── tracking_shuffled_objects_seven_objects.yaml │ │ │ │ ├── tracking_shuffled_objects_three_objects.yaml │ │ │ │ ├── web_of_lies.yaml │ │ │ │ └── word_sorting.yaml │ │ │ ├── cot_zeroshot/ │ │ │ │ ├── _bbh_cot_zeroshot.yaml │ │ │ │ ├── _cot_zeroshot_template_yaml │ │ │ │ ├── boolean_expressions.yaml │ │ │ │ ├── causal_judgement.yaml │ │ │ │ ├── date_understanding.yaml │ │ │ │ ├── disambiguation_qa.yaml │ │ │ │ ├── dyck_languages.yaml │ │ │ │ ├── formal_fallacies.yaml │ │ │ │ ├── geometric_shapes.yaml │ │ │ │ ├── hyperbaton.yaml │ │ │ │ ├── logical_deduction_five_objects.yaml │ │ │ │ ├── logical_deduction_seven_objects.yaml │ │ │ │ ├── logical_deduction_three_objects.yaml │ │ │ │ ├── movie_recommendation.yaml │ │ │ │ ├── multistep_arithmetic_two.yaml │ │ │ │ ├── navigate.yaml │ │ │ │ ├── object_counting.yaml │ │ │ │ ├── penguins_in_a_table.yaml │ │ │ │ ├── reasoning_about_colored_objects.yaml │ │ │ │ ├── ruin_names.yaml │ │ │ │ ├── salient_translation_error_detection.yaml │ │ │ │ ├── snarks.yaml │ │ │ │ ├── sports_understanding.yaml │ │ │ │ ├── temporal_sequences.yaml │ │ │ │ ├── tracking_shuffled_objects_five_objects.yaml │ │ │ │ ├── tracking_shuffled_objects_seven_objects.yaml │ │ │ │ ├── tracking_shuffled_objects_three_objects.yaml │ │ │ │ ├── utils.py │ │ │ │ ├── web_of_lies.yaml │ │ │ │ └── word_sorting.yaml │ │ │ ├── fewshot/ │ │ │ │ ├── _bbh_fewshot.yaml │ │ │ │ ├── _fewshot_template_yaml │ │ │ │ ├── boolean_expressions.yaml │ │ │ │ ├── causal_judgement.yaml │ │ │ │ ├── date_understanding.yaml │ │ │ │ ├── disambiguation_qa.yaml │ │ │ │ ├── dyck_languages.yaml │ │ │ │ ├── formal_fallacies.yaml │ │ │ │ ├── geometric_shapes.yaml │ │ │ │ ├── hyperbaton.yaml │ │ │ │ ├── logical_deduction_five_objects.yaml │ │ │ │ ├── logical_deduction_seven_objects.yaml │ │ │ │ ├── 
logical_deduction_three_objects.yaml │ │ │ │ ├── movie_recommendation.yaml │ │ │ │ ├── multistep_arithmetic_two.yaml │ │ │ │ ├── navigate.yaml │ │ │ │ ├── object_counting.yaml │ │ │ │ ├── penguins_in_a_table.yaml │ │ │ │ ├── reasoning_about_colored_objects.yaml │ │ │ │ ├── ruin_names.yaml │ │ │ │ ├── salient_translation_error_detection.yaml │ │ │ │ ├── snarks.yaml │ │ │ │ ├── sports_understanding.yaml │ │ │ │ ├── temporal_sequences.yaml │ │ │ │ ├── tracking_shuffled_objects_five_objects.yaml │ │ │ │ ├── tracking_shuffled_objects_seven_objects.yaml │ │ │ │ ├── tracking_shuffled_objects_three_objects.yaml │ │ │ │ ├── web_of_lies.yaml │ │ │ │ └── word_sorting.yaml │ │ │ └── zeroshot/ │ │ │ ├── _bbh_zeroshot.yaml │ │ │ ├── _zeroshot_template_yaml │ │ │ ├── boolean_expressions.yaml │ │ │ ├── causal_judgement.yaml │ │ │ ├── date_understanding.yaml │ │ │ ├── disambiguation_qa.yaml │ │ │ ├── dyck_languages.yaml │ │ │ ├── formal_fallacies.yaml │ │ │ ├── geometric_shapes.yaml │ │ │ ├── hyperbaton.yaml │ │ │ ├── logical_deduction_five_objects.yaml │ │ │ ├── logical_deduction_seven_objects.yaml │ │ │ ├── logical_deduction_three_objects.yaml │ │ │ ├── movie_recommendation.yaml │ │ │ ├── multistep_arithmetic_two.yaml │ │ │ ├── navigate.yaml │ │ │ ├── object_counting.yaml │ │ │ ├── penguins_in_a_table.yaml │ │ │ ├── reasoning_about_colored_objects.yaml │ │ │ ├── ruin_names.yaml │ │ │ ├── salient_translation_error_detection.yaml │ │ │ ├── snarks.yaml │ │ │ ├── sports_understanding.yaml │ │ │ ├── temporal_sequences.yaml │ │ │ ├── tracking_shuffled_objects_five_objects.yaml │ │ │ ├── tracking_shuffled_objects_seven_objects.yaml │ │ │ ├── tracking_shuffled_objects_three_objects.yaml │ │ │ ├── utils.py │ │ │ ├── web_of_lies.yaml │ │ │ └── word_sorting.yaml │ │ ├── bbq/ │ │ │ ├── README.md │ │ │ ├── bbq_generate.yaml │ │ │ ├── bbq_generate_ambig.yaml │ │ │ ├── bbq_generate_disambig.yaml │ │ │ ├── bbq_multiple_choice.yaml │ │ │ ├── bbq_multiple_choice_ambig.yaml │ │ │ ├── 
bbq_multiple_choice_disambig.yaml │ │ │ └── utils.py │ │ ├── bear/ │ │ │ ├── README.md │ │ │ ├── bear.yaml │ │ │ └── bear_big.yaml │ │ ├── belebele/ │ │ │ ├── README.md │ │ │ ├── _belebele.yaml │ │ │ ├── _default_template_yaml │ │ │ ├── _generate_configs.py │ │ │ ├── belebele_acm_Arab.yaml │ │ │ ├── belebele_afr_Latn.yaml │ │ │ ├── belebele_als_Latn.yaml │ │ │ ├── belebele_amh_Ethi.yaml │ │ │ ├── belebele_apc_Arab.yaml │ │ │ ├── belebele_arb_Arab.yaml │ │ │ ├── belebele_arb_Latn.yaml │ │ │ ├── belebele_ars_Arab.yaml │ │ │ ├── belebele_ary_Arab.yaml │ │ │ ├── belebele_arz_Arab.yaml │ │ │ ├── belebele_asm_Beng.yaml │ │ │ ├── belebele_azj_Latn.yaml │ │ │ ├── belebele_bam_Latn.yaml │ │ │ ├── belebele_ben_Beng.yaml │ │ │ ├── belebele_ben_Latn.yaml │ │ │ ├── belebele_bod_Tibt.yaml │ │ │ ├── belebele_bul_Cyrl.yaml │ │ │ ├── belebele_cat_Latn.yaml │ │ │ ├── belebele_ceb_Latn.yaml │ │ │ ├── belebele_ces_Latn.yaml │ │ │ ├── belebele_ckb_Arab.yaml │ │ │ ├── belebele_dan_Latn.yaml │ │ │ ├── belebele_deu_Latn.yaml │ │ │ ├── belebele_ell_Grek.yaml │ │ │ ├── belebele_eng_Latn.yaml │ │ │ ├── belebele_est_Latn.yaml │ │ │ ├── belebele_eus_Latn.yaml │ │ │ ├── belebele_fin_Latn.yaml │ │ │ ├── belebele_fra_Latn.yaml │ │ │ ├── belebele_fuv_Latn.yaml │ │ │ ├── belebele_gaz_Latn.yaml │ │ │ ├── belebele_grn_Latn.yaml │ │ │ ├── belebele_guj_Gujr.yaml │ │ │ ├── belebele_hat_Latn.yaml │ │ │ ├── belebele_hau_Latn.yaml │ │ │ ├── belebele_heb_Hebr.yaml │ │ │ ├── belebele_hin_Deva.yaml │ │ │ ├── belebele_hin_Latn.yaml │ │ │ ├── belebele_hrv_Latn.yaml │ │ │ ├── belebele_hun_Latn.yaml │ │ │ ├── belebele_hye_Armn.yaml │ │ │ ├── belebele_ibo_Latn.yaml │ │ │ ├── belebele_ilo_Latn.yaml │ │ │ ├── belebele_ind_Latn.yaml │ │ │ ├── belebele_isl_Latn.yaml │ │ │ ├── belebele_ita_Latn.yaml │ │ │ ├── belebele_jav_Latn.yaml │ │ │ ├── belebele_jpn_Jpan.yaml │ │ │ ├── belebele_kac_Latn.yaml │ │ │ ├── belebele_kan_Knda.yaml │ │ │ ├── belebele_kat_Geor.yaml │ │ │ ├── belebele_kaz_Cyrl.yaml │ │ │ ├── 
belebele_kea_Latn.yaml │ │ │ ├── belebele_khk_Cyrl.yaml │ │ │ ├── belebele_khm_Khmr.yaml │ │ │ ├── belebele_kin_Latn.yaml │ │ │ ├── belebele_kir_Cyrl.yaml │ │ │ ├── belebele_kor_Hang.yaml │ │ │ ├── belebele_lao_Laoo.yaml │ │ │ ├── belebele_lin_Latn.yaml │ │ │ ├── belebele_lit_Latn.yaml │ │ │ ├── belebele_lug_Latn.yaml │ │ │ ├── belebele_luo_Latn.yaml │ │ │ ├── belebele_lvs_Latn.yaml │ │ │ ├── belebele_mal_Mlym.yaml │ │ │ ├── belebele_mar_Deva.yaml │ │ │ ├── belebele_mkd_Cyrl.yaml │ │ │ ├── belebele_mlt_Latn.yaml │ │ │ ├── belebele_mri_Latn.yaml │ │ │ ├── belebele_mya_Mymr.yaml │ │ │ ├── belebele_nld_Latn.yaml │ │ │ ├── belebele_nob_Latn.yaml │ │ │ ├── belebele_npi_Deva.yaml │ │ │ ├── belebele_npi_Latn.yaml │ │ │ ├── belebele_nso_Latn.yaml │ │ │ ├── belebele_nya_Latn.yaml │ │ │ ├── belebele_ory_Orya.yaml │ │ │ ├── belebele_pan_Guru.yaml │ │ │ ├── belebele_pbt_Arab.yaml │ │ │ ├── belebele_pes_Arab.yaml │ │ │ ├── belebele_plt_Latn.yaml │ │ │ ├── belebele_pol_Latn.yaml │ │ │ ├── belebele_por_Latn.yaml │ │ │ ├── belebele_ron_Latn.yaml │ │ │ ├── belebele_rus_Cyrl.yaml │ │ │ ├── belebele_shn_Mymr.yaml │ │ │ ├── belebele_sin_Latn.yaml │ │ │ ├── belebele_sin_Sinh.yaml │ │ │ ├── belebele_slk_Latn.yaml │ │ │ ├── belebele_slv_Latn.yaml │ │ │ ├── belebele_sna_Latn.yaml │ │ │ ├── belebele_snd_Arab.yaml │ │ │ ├── belebele_som_Latn.yaml │ │ │ ├── belebele_sot_Latn.yaml │ │ │ ├── belebele_spa_Latn.yaml │ │ │ ├── belebele_srp_Cyrl.yaml │ │ │ ├── belebele_ssw_Latn.yaml │ │ │ ├── belebele_sun_Latn.yaml │ │ │ ├── belebele_swe_Latn.yaml │ │ │ ├── belebele_swh_Latn.yaml │ │ │ ├── belebele_tam_Taml.yaml │ │ │ ├── belebele_tel_Telu.yaml │ │ │ ├── belebele_tgk_Cyrl.yaml │ │ │ ├── belebele_tgl_Latn.yaml │ │ │ ├── belebele_tha_Thai.yaml │ │ │ ├── belebele_tir_Ethi.yaml │ │ │ ├── belebele_tsn_Latn.yaml │ │ │ ├── belebele_tso_Latn.yaml │ │ │ ├── belebele_tur_Latn.yaml │ │ │ ├── belebele_ukr_Cyrl.yaml │ │ │ ├── belebele_urd_Arab.yaml │ │ │ ├── belebele_urd_Latn.yaml │ │ │ ├── 
belebele_uzn_Latn.yaml │ │ │ ├── belebele_vie_Latn.yaml │ │ │ ├── belebele_war_Latn.yaml │ │ │ ├── belebele_wol_Latn.yaml │ │ │ ├── belebele_xho_Latn.yaml │ │ │ ├── belebele_yor_Latn.yaml │ │ │ ├── belebele_zho_Hans.yaml │ │ │ ├── belebele_zho_Hant.yaml │ │ │ ├── belebele_zsm_Latn.yaml │ │ │ └── belebele_zul_Latn.yaml │ │ ├── benchmarks/ │ │ │ ├── README.md │ │ │ ├── flan/ │ │ │ │ ├── _held_in_template_yaml │ │ │ │ ├── flan_held_in.yaml │ │ │ │ └── flan_held_out.yaml │ │ │ ├── minerva_math.yaml │ │ │ ├── multimedqa/ │ │ │ │ ├── README.md │ │ │ │ └── multimedqa.yaml │ │ │ ├── openllm.yaml │ │ │ ├── pythia.yaml │ │ │ └── t0_eval.yaml │ │ ├── bertaqa/ │ │ │ ├── README.md │ │ │ ├── _bertaqa_template │ │ │ ├── bertaqa_en.yaml │ │ │ ├── bertaqa_en_mt_gemma-7b.yaml │ │ │ ├── bertaqa_en_mt_hitz.yaml │ │ │ ├── bertaqa_en_mt_itzuli.yaml │ │ │ ├── bertaqa_en_mt_latxa-13b-v1.1.yaml │ │ │ ├── bertaqa_en_mt_latxa-13b-v1.yaml │ │ │ ├── bertaqa_en_mt_latxa-70b-v1.1.yaml │ │ │ ├── bertaqa_en_mt_latxa-70b-v1.yaml │ │ │ ├── bertaqa_en_mt_latxa-7b-v1.1.yaml │ │ │ ├── bertaqa_en_mt_latxa-7b-v1.yaml │ │ │ ├── bertaqa_en_mt_llama-2-13b.yaml │ │ │ ├── bertaqa_en_mt_llama-2-70b.yaml │ │ │ ├── bertaqa_en_mt_llama-2-7b.yaml │ │ │ ├── bertaqa_en_mt_madlad.yaml │ │ │ ├── bertaqa_en_mt_nllb.yaml │ │ │ └── bertaqa_eu.yaml │ │ ├── bhs/ │ │ │ ├── README.md │ │ │ ├── _template_yaml │ │ │ ├── basque-DO-S_DO_V_AUX.yaml │ │ │ ├── basque-DO-S_IO_DO_V_AUX.yaml │ │ │ ├── basque-IO-IO_S_V_AUX.yaml │ │ │ ├── basque-IO-S_IO_DO_V_AUX.yaml │ │ │ ├── basque-S-IO_S_V_AUX.yaml │ │ │ ├── basque-S-S_DO_V_AUX.yaml │ │ │ ├── basque-S-S_IO_DO_V_AUX.yaml │ │ │ ├── basque-S-S_V_AUX.yaml │ │ │ ├── bhs_basque.yaml │ │ │ ├── bhs_hindi.yaml │ │ │ ├── bhs_swahili.yaml │ │ │ ├── hindi-S_O_V.yaml │ │ │ ├── hindi-S_PossPRN_O_V.yaml │ │ │ ├── hindi-S_PossPRN_PossN_O_V.yaml │ │ │ ├── hindi-S_ne_O_V.yaml │ │ │ ├── hindi-S_ne_PossPRN_O_V.yaml │ │ │ ├── hindi-S_ne_PossPRN_PossN_O_V.yaml │ │ │ ├── swahili-N_of_Poss_D_AP_V_ni_AN.yaml 
│ │ │ ├── swahili-N_of_Poss_D_AP_ni_AN.yaml │ │ │ ├── swahili-N_of_Poss_D_A_V.yaml │ │ │ ├── swahili-N_of_Poss_D_A_V1_V2.yaml │ │ │ ├── swahili-N_of_Poss_D_V.yaml │ │ │ ├── swahili-N_of_Poss_D_ni_A.yaml │ │ │ ├── swahili-N_of_Poss_V.yaml │ │ │ └── swahili-N_of_Poss_ni_A.yaml │ │ ├── bigbench/ │ │ │ ├── README.md │ │ │ ├── generate_tasks.py │ │ │ ├── generate_until/ │ │ │ │ ├── abstract_narrative_understanding.yaml │ │ │ │ ├── anachronisms.yaml │ │ │ │ ├── analogical_similarity.yaml │ │ │ │ ├── analytic_entailment.yaml │ │ │ │ ├── arithmetic.yaml │ │ │ │ ├── ascii_word_recognition.yaml │ │ │ │ ├── authorship_verification.yaml │ │ │ │ ├── auto_categorization.yaml │ │ │ │ ├── auto_debugging.yaml │ │ │ │ ├── bbq_lite_json.yaml │ │ │ │ ├── bridging_anaphora_resolution_barqa.yaml │ │ │ │ ├── causal_judgment.yaml │ │ │ │ ├── cause_and_effect.yaml │ │ │ │ ├── checkmate_in_one.yaml │ │ │ │ ├── chess_state_tracking.yaml │ │ │ │ ├── chinese_remainder_theorem.yaml │ │ │ │ ├── cifar10_classification.yaml │ │ │ │ ├── code_line_description.yaml │ │ │ │ ├── codenames.yaml │ │ │ │ ├── color.yaml │ │ │ │ ├── common_morpheme.yaml │ │ │ │ ├── conceptual_combinations.yaml │ │ │ │ ├── conlang_translation.yaml │ │ │ │ ├── contextual_parametric_knowledge_conflicts.yaml │ │ │ │ ├── crash_blossom.yaml │ │ │ │ ├── crass_ai.yaml │ │ │ │ ├── cryobiology_spanish.yaml │ │ │ │ ├── cryptonite.yaml │ │ │ │ ├── cs_algorithms.yaml │ │ │ │ ├── dark_humor_detection.yaml │ │ │ │ ├── date_understanding.yaml │ │ │ │ ├── disambiguation_qa.yaml │ │ │ │ ├── discourse_marker_prediction.yaml │ │ │ │ ├── disfl_qa.yaml │ │ │ │ ├── dyck_languages.yaml │ │ │ │ ├── elementary_math_qa.yaml │ │ │ │ ├── emoji_movie.yaml │ │ │ │ ├── emojis_emotion_prediction.yaml │ │ │ │ ├── empirical_judgments.yaml │ │ │ │ ├── english_proverbs.yaml │ │ │ │ ├── english_russian_proverbs.yaml │ │ │ │ ├── entailed_polarity.yaml │ │ │ │ ├── entailed_polarity_hindi.yaml │ │ │ │ ├── epistemic_reasoning.yaml │ │ │ │ ├── 
evaluating_information_essentiality.yaml │ │ │ │ ├── fact_checker.yaml │ │ │ │ ├── fantasy_reasoning.yaml │ │ │ │ ├── few_shot_nlg.yaml │ │ │ │ ├── figure_of_speech_detection.yaml │ │ │ │ ├── formal_fallacies_syllogisms_negation.yaml │ │ │ │ ├── gem.yaml │ │ │ │ ├── gender_inclusive_sentences_german.yaml │ │ │ │ ├── general_knowledge.yaml │ │ │ │ ├── geometric_shapes.yaml │ │ │ │ ├── goal_step_wikihow.yaml │ │ │ │ ├── gre_reading_comprehension.yaml │ │ │ │ ├── hhh_alignment.yaml │ │ │ │ ├── hindi_question_answering.yaml │ │ │ │ ├── hindu_knowledge.yaml │ │ │ │ ├── hinglish_toxicity.yaml │ │ │ │ ├── human_organs_senses.yaml │ │ │ │ ├── hyperbaton.yaml │ │ │ │ ├── identify_math_theorems.yaml │ │ │ │ ├── identify_odd_metaphor.yaml │ │ │ │ ├── implicatures.yaml │ │ │ │ ├── implicit_relations.yaml │ │ │ │ ├── intent_recognition.yaml │ │ │ │ ├── international_phonetic_alphabet_nli.yaml │ │ │ │ ├── international_phonetic_alphabet_transliterate.yaml │ │ │ │ ├── intersect_geometry.yaml │ │ │ │ ├── irony_identification.yaml │ │ │ │ ├── kanji_ascii.yaml │ │ │ │ ├── kannada.yaml │ │ │ │ ├── key_value_maps.yaml │ │ │ │ ├── known_unknowns.yaml │ │ │ │ ├── language_games.yaml │ │ │ │ ├── language_identification.yaml │ │ │ │ ├── linguistic_mappings.yaml │ │ │ │ ├── linguistics_puzzles.yaml │ │ │ │ ├── list_functions.yaml │ │ │ │ ├── logic_grid_puzzle.yaml │ │ │ │ ├── logical_args.yaml │ │ │ │ ├── logical_deduction.yaml │ │ │ │ ├── logical_fallacy_detection.yaml │ │ │ │ ├── logical_sequence.yaml │ │ │ │ ├── mathematical_induction.yaml │ │ │ │ ├── matrixshapes.yaml │ │ │ │ ├── metaphor_boolean.yaml │ │ │ │ ├── metaphor_understanding.yaml │ │ │ │ ├── minute_mysteries_qa.yaml │ │ │ │ ├── misconceptions.yaml │ │ │ │ ├── misconceptions_russian.yaml │ │ │ │ ├── mnist_ascii.yaml │ │ │ │ ├── modified_arithmetic.yaml │ │ │ │ ├── moral_permissibility.yaml │ │ │ │ ├── movie_dialog_same_or_different.yaml │ │ │ │ ├── movie_recommendation.yaml │ │ │ │ ├── mult_data_wrangling.yaml │ │ │ │ ├── 
multiemo.yaml │ │ │ │ ├── natural_instructions.yaml │ │ │ │ ├── navigate.yaml │ │ │ │ ├── nonsense_words_grammar.yaml │ │ │ │ ├── novel_concepts.yaml │ │ │ │ ├── object_counting.yaml │ │ │ │ ├── odd_one_out.yaml │ │ │ │ ├── operators.yaml │ │ │ │ ├── paragraph_segmentation.yaml │ │ │ │ ├── parsinlu_qa.yaml │ │ │ │ ├── parsinlu_reading_comprehension.yaml │ │ │ │ ├── penguins_in_a_table.yaml │ │ │ │ ├── periodic_elements.yaml │ │ │ │ ├── persian_idioms.yaml │ │ │ │ ├── phrase_relatedness.yaml │ │ │ │ ├── physical_intuition.yaml │ │ │ │ ├── physics.yaml │ │ │ │ ├── physics_questions.yaml │ │ │ │ ├── play_dialog_same_or_different.yaml │ │ │ │ ├── polish_sequence_labeling.yaml │ │ │ │ ├── presuppositions_as_nli.yaml │ │ │ │ ├── qa_wikidata.yaml │ │ │ │ ├── question_selection.yaml │ │ │ │ ├── real_or_fake_text.yaml │ │ │ │ ├── reasoning_about_colored_objects.yaml │ │ │ │ ├── repeat_copy_logic.yaml │ │ │ │ ├── rephrase.yaml │ │ │ │ ├── riddle_sense.yaml │ │ │ │ ├── ruin_names.yaml │ │ │ │ ├── salient_translation_error_detection.yaml │ │ │ │ ├── scientific_press_release.yaml │ │ │ │ ├── semantic_parsing_in_context_sparc.yaml │ │ │ │ ├── semantic_parsing_spider.yaml │ │ │ │ ├── sentence_ambiguity.yaml │ │ │ │ ├── similarities_abstraction.yaml │ │ │ │ ├── simp_turing_concept.yaml │ │ │ │ ├── simple_arithmetic_json.yaml │ │ │ │ ├── simple_arithmetic_json_multiple_choice.yaml │ │ │ │ ├── simple_arithmetic_json_subtasks.yaml │ │ │ │ ├── simple_arithmetic_multiple_targets_json.yaml │ │ │ │ ├── simple_ethical_questions.yaml │ │ │ │ ├── simple_text_editing.yaml │ │ │ │ ├── snarks.yaml │ │ │ │ ├── social_iqa.yaml │ │ │ │ ├── social_support.yaml │ │ │ │ ├── sports_understanding.yaml │ │ │ │ ├── strange_stories.yaml │ │ │ │ ├── strategyqa.yaml │ │ │ │ ├── sufficient_information.yaml │ │ │ │ ├── suicide_risk.yaml │ │ │ │ ├── swahili_english_proverbs.yaml │ │ │ │ ├── swedish_to_german_proverbs.yaml │ │ │ │ ├── symbol_interpretation.yaml │ │ │ │ ├── temporal_sequences.yaml │ │ │ │ ├── 
tense.yaml │ │ │ │ ├── timedial.yaml │ │ │ │ ├── topical_chat.yaml │ │ │ │ ├── tracking_shuffled_objects.yaml │ │ │ │ ├── understanding_fables.yaml │ │ │ │ ├── undo_permutation.yaml │ │ │ │ ├── unit_conversion.yaml │ │ │ │ ├── unit_interpretation.yaml │ │ │ │ ├── unnatural_in_context_learning.yaml │ │ │ │ ├── vitaminc_fact_verification.yaml │ │ │ │ ├── what_is_the_tao.yaml │ │ │ │ ├── which_wiki_edit.yaml │ │ │ │ ├── winowhy.yaml │ │ │ │ ├── word_sorting.yaml │ │ │ │ └── word_unscrambling.yaml │ │ │ ├── generate_until_template_yaml │ │ │ ├── multiple_choice/ │ │ │ │ ├── abstract_narrative_understanding.yaml │ │ │ │ ├── anachronisms.yaml │ │ │ │ ├── analogical_similarity.yaml │ │ │ │ ├── analytic_entailment.yaml │ │ │ │ ├── arithmetic.yaml │ │ │ │ ├── authorship_verification.yaml │ │ │ │ ├── bbq_lite_json.yaml │ │ │ │ ├── causal_judgment.yaml │ │ │ │ ├── cause_and_effect.yaml │ │ │ │ ├── checkmate_in_one.yaml │ │ │ │ ├── cifar10_classification.yaml │ │ │ │ ├── code_line_description.yaml │ │ │ │ ├── color.yaml │ │ │ │ ├── common_morpheme.yaml │ │ │ │ ├── conceptual_combinations.yaml │ │ │ │ ├── contextual_parametric_knowledge_conflicts.yaml │ │ │ │ ├── crash_blossom.yaml │ │ │ │ ├── crass_ai.yaml │ │ │ │ ├── cryobiology_spanish.yaml │ │ │ │ ├── cs_algorithms.yaml │ │ │ │ ├── dark_humor_detection.yaml │ │ │ │ ├── date_understanding.yaml │ │ │ │ ├── disambiguation_qa.yaml │ │ │ │ ├── discourse_marker_prediction.yaml │ │ │ │ ├── dyck_languages.yaml │ │ │ │ ├── elementary_math_qa.yaml │ │ │ │ ├── emoji_movie.yaml │ │ │ │ ├── emojis_emotion_prediction.yaml │ │ │ │ ├── empirical_judgments.yaml │ │ │ │ ├── english_proverbs.yaml │ │ │ │ ├── english_russian_proverbs.yaml │ │ │ │ ├── entailed_polarity.yaml │ │ │ │ ├── entailed_polarity_hindi.yaml │ │ │ │ ├── epistemic_reasoning.yaml │ │ │ │ ├── evaluating_information_essentiality.yaml │ │ │ │ ├── fact_checker.yaml │ │ │ │ ├── fantasy_reasoning.yaml │ │ │ │ ├── figure_of_speech_detection.yaml │ │ │ │ ├── 
formal_fallacies_syllogisms_negation.yaml │ │ │ │ ├── general_knowledge.yaml │ │ │ │ ├── geometric_shapes.yaml │ │ │ │ ├── goal_step_wikihow.yaml │ │ │ │ ├── gre_reading_comprehension.yaml │ │ │ │ ├── hhh_alignment.yaml │ │ │ │ ├── hindu_knowledge.yaml │ │ │ │ ├── hinglish_toxicity.yaml │ │ │ │ ├── human_organs_senses.yaml │ │ │ │ ├── hyperbaton.yaml │ │ │ │ ├── identify_math_theorems.yaml │ │ │ │ ├── identify_odd_metaphor.yaml │ │ │ │ ├── implicatures.yaml │ │ │ │ ├── implicit_relations.yaml │ │ │ │ ├── intent_recognition.yaml │ │ │ │ ├── international_phonetic_alphabet_nli.yaml │ │ │ │ ├── intersect_geometry.yaml │ │ │ │ ├── irony_identification.yaml │ │ │ │ ├── kanji_ascii.yaml │ │ │ │ ├── kannada.yaml │ │ │ │ ├── key_value_maps.yaml │ │ │ │ ├── known_unknowns.yaml │ │ │ │ ├── language_identification.yaml │ │ │ │ ├── logic_grid_puzzle.yaml │ │ │ │ ├── logical_args.yaml │ │ │ │ ├── logical_deduction.yaml │ │ │ │ ├── logical_fallacy_detection.yaml │ │ │ │ ├── logical_sequence.yaml │ │ │ │ ├── mathematical_induction.yaml │ │ │ │ ├── metaphor_boolean.yaml │ │ │ │ ├── metaphor_understanding.yaml │ │ │ │ ├── misconceptions.yaml │ │ │ │ ├── misconceptions_russian.yaml │ │ │ │ ├── mnist_ascii.yaml │ │ │ │ ├── moral_permissibility.yaml │ │ │ │ ├── movie_dialog_same_or_different.yaml │ │ │ │ ├── movie_recommendation.yaml │ │ │ │ ├── multiemo.yaml │ │ │ │ ├── navigate.yaml │ │ │ │ ├── nonsense_words_grammar.yaml │ │ │ │ ├── novel_concepts.yaml │ │ │ │ ├── odd_one_out.yaml │ │ │ │ ├── parsinlu_qa.yaml │ │ │ │ ├── penguins_in_a_table.yaml │ │ │ │ ├── periodic_elements.yaml │ │ │ │ ├── persian_idioms.yaml │ │ │ │ ├── phrase_relatedness.yaml │ │ │ │ ├── physical_intuition.yaml │ │ │ │ ├── physics.yaml │ │ │ │ ├── play_dialog_same_or_different.yaml │ │ │ │ ├── presuppositions_as_nli.yaml │ │ │ │ ├── question_selection.yaml │ │ │ │ ├── real_or_fake_text.yaml │ │ │ │ ├── reasoning_about_colored_objects.yaml │ │ │ │ ├── riddle_sense.yaml │ │ │ │ ├── ruin_names.yaml │ │ │ │ ├── 
salient_translation_error_detection.yaml │ │ │ │ ├── sentence_ambiguity.yaml │ │ │ │ ├── similarities_abstraction.yaml │ │ │ │ ├── simple_ethical_questions.yaml │ │ │ │ ├── snarks.yaml │ │ │ │ ├── social_iqa.yaml │ │ │ │ ├── social_support.yaml │ │ │ │ ├── sports_understanding.yaml │ │ │ │ ├── strange_stories.yaml │ │ │ │ ├── strategyqa.yaml │ │ │ │ ├── suicide_risk.yaml │ │ │ │ ├── swahili_english_proverbs.yaml │ │ │ │ ├── swedish_to_german_proverbs.yaml │ │ │ │ ├── symbol_interpretation.yaml │ │ │ │ ├── temporal_sequences.yaml │ │ │ │ ├── timedial.yaml │ │ │ │ ├── tracking_shuffled_objects.yaml │ │ │ │ ├── understanding_fables.yaml │ │ │ │ ├── undo_permutation.yaml │ │ │ │ ├── unit_conversion.yaml │ │ │ │ ├── unit_interpretation.yaml │ │ │ │ ├── vitaminc_fact_verification.yaml │ │ │ │ ├── what_is_the_tao.yaml │ │ │ │ ├── which_wiki_edit.yaml │ │ │ │ └── winowhy.yaml │ │ │ ├── multiple_choice_template_a_yaml │ │ │ ├── multiple_choice_template_b_yaml │ │ │ └── push_bigbench_dataset.py │ │ ├── blimp/ │ │ │ ├── README.md │ │ │ ├── _blimp.yaml │ │ │ ├── _template_yaml │ │ │ ├── adjunct_island.yaml │ │ │ ├── anaphor_gender_agreement.yaml │ │ │ ├── anaphor_number_agreement.yaml │ │ │ ├── animate_subject_passive.yaml │ │ │ ├── animate_subject_trans.yaml │ │ │ ├── causative.yaml │ │ │ ├── complex_NP_island.yaml │ │ │ ├── coordinate_structure_constraint_complex_left_branch.yaml │ │ │ ├── coordinate_structure_constraint_object_extraction.yaml │ │ │ ├── determiner_noun_agreement_1.yaml │ │ │ ├── determiner_noun_agreement_2.yaml │ │ │ ├── determiner_noun_agreement_irregular_1.yaml │ │ │ ├── determiner_noun_agreement_irregular_2.yaml │ │ │ ├── determiner_noun_agreement_with_adj_2.yaml │ │ │ ├── determiner_noun_agreement_with_adj_irregular_1.yaml │ │ │ ├── determiner_noun_agreement_with_adj_irregular_2.yaml │ │ │ ├── determiner_noun_agreement_with_adjective_1.yaml │ │ │ ├── distractor_agreement_relational_noun.yaml │ │ │ ├── distractor_agreement_relative_clause.yaml │ │ │ ├── 
drop_argument.yaml │ │ │ ├── ellipsis_n_bar_1.yaml │ │ │ ├── ellipsis_n_bar_2.yaml │ │ │ ├── existential_there_object_raising.yaml │ │ │ ├── existential_there_quantifiers_1.yaml │ │ │ ├── existential_there_quantifiers_2.yaml │ │ │ ├── existential_there_subject_raising.yaml │ │ │ ├── expletive_it_object_raising.yaml │ │ │ ├── generate_configs.py │ │ │ ├── inchoative.yaml │ │ │ ├── intransitive.yaml │ │ │ ├── irregular_past_participle_adjectives.yaml │ │ │ ├── irregular_past_participle_verbs.yaml │ │ │ ├── irregular_plural_subject_verb_agreement_1.yaml │ │ │ ├── irregular_plural_subject_verb_agreement_2.yaml │ │ │ ├── left_branch_island_echo_question.yaml │ │ │ ├── left_branch_island_simple_question.yaml │ │ │ ├── matrix_question_npi_licensor_present.yaml │ │ │ ├── npi_present_1.yaml │ │ │ ├── npi_present_2.yaml │ │ │ ├── only_npi_licensor_present.yaml │ │ │ ├── only_npi_scope.yaml │ │ │ ├── passive_1.yaml │ │ │ ├── passive_2.yaml │ │ │ ├── principle_A_c_command.yaml │ │ │ ├── principle_A_case_1.yaml │ │ │ ├── principle_A_case_2.yaml │ │ │ ├── principle_A_domain_1.yaml │ │ │ ├── principle_A_domain_2.yaml │ │ │ ├── principle_A_domain_3.yaml │ │ │ ├── principle_A_reconstruction.yaml │ │ │ ├── regular_plural_subject_verb_agreement_1.yaml │ │ │ ├── regular_plural_subject_verb_agreement_2.yaml │ │ │ ├── sentential_negation_npi_licensor_present.yaml │ │ │ ├── sentential_negation_npi_scope.yaml │ │ │ ├── sentential_subject_island.yaml │ │ │ ├── superlative_quantifiers_1.yaml │ │ │ ├── superlative_quantifiers_2.yaml │ │ │ ├── tough_vs_raising_1.yaml │ │ │ ├── tough_vs_raising_2.yaml │ │ │ ├── transitive.yaml │ │ │ ├── wh_island.yaml │ │ │ ├── wh_questions_object_gap.yaml │ │ │ ├── wh_questions_subject_gap.yaml │ │ │ ├── wh_questions_subject_gap_long_distance.yaml │ │ │ ├── wh_vs_that_no_gap.yaml │ │ │ ├── wh_vs_that_no_gap_long_distance.yaml │ │ │ ├── wh_vs_that_with_gap.yaml │ │ │ └── wh_vs_that_with_gap_long_distance.yaml │ │ ├── blimp_nl/ │ │ │ ├── README.md │ │ │ ├── 
_template_yaml │ │ │ ├── adpositional_phrases__argument_r_extraction.yaml │ │ │ ├── adpositional_phrases__argument_scrambling.yaml │ │ │ ├── adverbial_modification__position_proform.yaml │ │ │ ├── adverbial_modification__position_type.yaml │ │ │ ├── anaphor_agreement__number.yaml │ │ │ ├── anaphor_agreement__person.yaml │ │ │ ├── argument_structure__argument_number_ditransitive.yaml │ │ │ ├── argument_structure__argument_number_in_transitive.yaml │ │ │ ├── argument_structure__ditransitive_nomdat_1.yaml │ │ │ ├── argument_structure__ditransitive_nomdat_2.yaml │ │ │ ├── argument_structure__ditransitive_nomdat_3.yaml │ │ │ ├── argument_structure__intransitive_unaccusative_1.yaml │ │ │ ├── argument_structure__intransitive_unaccusative_2.yaml │ │ │ ├── argument_structure__intransitive_unaccusative_3.yaml │ │ │ ├── auxiliaries__order_1.yaml │ │ │ ├── auxiliaries__order_2.yaml │ │ │ ├── auxiliaries__perfect.yaml │ │ │ ├── auxiliaries__semi_aspectual_1.yaml │ │ │ ├── auxiliaries__semi_aspectual_2.yaml │ │ │ ├── binding_principle_a__c_command.yaml │ │ │ ├── binding_principle_a__monomorphemic.yaml │ │ │ ├── blimp_nl_group.yaml │ │ │ ├── complementive__ditransitive.yaml │ │ │ ├── complementive__intransitive.yaml │ │ │ ├── complementive__position_adverb.yaml │ │ │ ├── complementive__position_verb.yaml │ │ │ ├── complementive__transitive.yaml │ │ │ ├── crossing_dependencies__cross_dependency.yaml │ │ │ ├── determiners__geen_expletive.yaml │ │ │ ├── determiners__geen_scrambling_1.yaml │ │ │ ├── determiners__geen_scrambling_2.yaml │ │ │ ├── determiners__negative_polarity.yaml │ │ │ ├── extraposition__adjectival_adverbial.yaml │ │ │ ├── extraposition__adjectival_supplementive.yaml │ │ │ ├── extraposition__argument_nominal.yaml │ │ │ ├── finite_argument_clause__complementizer.yaml │ │ │ ├── finite_argument_clause__perception_dat.yaml │ │ │ ├── finite_argument_clause__perception_of.yaml │ │ │ ├── finite_argument_clause__position.yaml │ │ │ ├── finite_argument_clause__sluicing_1.yaml 
│ │ │ ├── finite_argument_clause__sluicing_2.yaml │ │ │ ├── infinitival_argument_clause__bare_verb_cluster.yaml │ │ │ ├── infinitival_argument_clause__bare_verb_type_1.yaml │ │ │ ├── infinitival_argument_clause__bare_verb_type_2.yaml │ │ │ ├── infinitival_argument_clause__bare_verb_type_3.yaml │ │ │ ├── infinitival_argument_clause__om_te.yaml │ │ │ ├── infinitival_argument_clause__te_om_te_difference_1.yaml │ │ │ ├── infinitival_argument_clause__te_om_te_difference_2.yaml │ │ │ ├── infinitival_argument_clause__te_transparant_split.yaml │ │ │ ├── infinitival_argument_clause__verb_type.yaml │ │ │ ├── nominalization__type_inf_1.yaml │ │ │ ├── nominalization__type_inf_2.yaml │ │ │ ├── parasitic_gaps__scrambling.yaml │ │ │ ├── parasitic_gaps__structure_type_1.yaml │ │ │ ├── parasitic_gaps__structure_type_2.yaml │ │ │ ├── parasitic_gaps__structure_type_3.yaml │ │ │ ├── passive__aci.yaml │ │ │ ├── passive__ditransitive_1.yaml │ │ │ ├── passive__ditransitive_2.yaml │ │ │ ├── passive__impersonal.yaml │ │ │ ├── quantifiers__universal_difference_agreement_plural.yaml │ │ │ ├── quantifiers__universal_difference_agreement_singular.yaml │ │ │ ├── r_words__adverbial.yaml │ │ │ ├── r_words__weak_proform.yaml │ │ │ ├── relativization__island.yaml │ │ │ ├── relativization__pied_piping.yaml │ │ │ ├── relativization__resumptive_prolepsis.yaml │ │ │ ├── topicalization__island.yaml │ │ │ ├── topicalization__question_similarity_1.yaml │ │ │ ├── topicalization__question_similarity_2.yaml │ │ │ ├── topicalization__resumptive_prolepsis.yaml │ │ │ ├── verb_second__order_embedded.yaml │ │ │ ├── verb_second__order_main.yaml │ │ │ ├── wh_movement__filler_effect_gap.yaml │ │ │ ├── wh_movement__filler_effect_no_gap.yaml │ │ │ ├── wh_movement__hierarchy.yaml │ │ │ ├── wh_movement__question_formation.yaml │ │ │ ├── wh_movement__stranding_1.yaml │ │ │ ├── wh_movement__stranding_2.yaml │ │ │ ├── wh_movement_restrictions__bridge_verb_1.yaml │ │ │ ├── wh_movement_restrictions__bridge_verb_2.yaml │ │ │ 
├── wh_movement_restrictions__island_1.yaml │ │ │ ├── wh_movement_restrictions__island_2.yaml │ │ │ ├── wh_movement_restrictions__resumptive_prolepsis.yaml │ │ │ └── wh_movement_restrictions__superiority.yaml │ │ ├── c4/ │ │ │ ├── README.md │ │ │ ├── c4.yaml │ │ │ └── preprocess_c4.py │ │ ├── cabbq/ │ │ │ ├── README.md │ │ │ ├── _cabbq_common_yaml │ │ │ ├── cabbq.yaml │ │ │ ├── cabbq_age.yaml │ │ │ ├── cabbq_disability_status.yaml │ │ │ ├── cabbq_gender.yaml │ │ │ ├── cabbq_lgbtqia.yaml │ │ │ ├── cabbq_nationality.yaml │ │ │ ├── cabbq_physical_appearance.yaml │ │ │ ├── cabbq_race_ethnicity.yaml │ │ │ ├── cabbq_religion.yaml │ │ │ ├── cabbq_ses.yaml │ │ │ ├── cabbq_spanish_region.yaml │ │ │ └── utils.py │ │ ├── careqa/ │ │ │ ├── README.md │ │ │ ├── careqa_en.yaml │ │ │ ├── careqa_es.yaml │ │ │ ├── careqa_open.yaml │ │ │ ├── careqa_open_perplexity.yaml │ │ │ ├── utils.py │ │ │ ├── utils_open.py │ │ │ └── utils_perplexity.py │ │ ├── catalan_bench/ │ │ │ ├── README.md │ │ │ ├── _arc_ca_common_yaml │ │ │ ├── _cabreu_common_yaml │ │ │ ├── arc_ca_challenge.yaml │ │ │ ├── arc_ca_easy.yaml │ │ │ ├── cabreu_abstractive.yaml │ │ │ ├── cabreu_extractive.yaml │ │ │ ├── cabreu_extreme.yaml │ │ │ ├── catalan_bench.yaml │ │ │ ├── catalanqa.yaml │ │ │ ├── catcola.yaml │ │ │ ├── cocoteros_va.yaml │ │ │ ├── copa_ca.yaml │ │ │ ├── coqcat.yaml │ │ │ ├── flores_ca/ │ │ │ │ ├── _flores_common_yaml │ │ │ │ ├── create_yamls_flores_ca.py │ │ │ │ ├── flores_ca-de.yaml │ │ │ │ ├── flores_ca-en.yaml │ │ │ │ ├── flores_ca-es.yaml │ │ │ │ ├── flores_ca-eu.yaml │ │ │ │ ├── flores_ca-fr.yaml │ │ │ │ ├── flores_ca-gl.yaml │ │ │ │ ├── flores_ca-it.yaml │ │ │ │ ├── flores_ca-pt.yaml │ │ │ │ ├── flores_ca.yaml │ │ │ │ ├── flores_de-ca.yaml │ │ │ │ ├── flores_en-ca.yaml │ │ │ │ ├── flores_es-ca.yaml │ │ │ │ ├── flores_eu-ca.yaml │ │ │ │ ├── flores_fr-ca.yaml │ │ │ │ ├── flores_gl-ca.yaml │ │ │ │ ├── flores_it-ca.yaml │ │ │ │ └── flores_pt-ca.yaml │ │ │ ├── mgsm_direct_ca.yaml │ │ │ ├── 
openbookqa_ca.yaml │ │ │ ├── parafraseja.yaml │ │ │ ├── paws_ca.yaml │ │ │ ├── phrases_va/ │ │ │ │ ├── _phrases_va_common │ │ │ │ ├── phrases_ca-va.yaml │ │ │ │ └── phrases_va-ca.yaml │ │ │ ├── piqa_ca.yaml │ │ │ ├── siqa_ca.yaml │ │ │ ├── teca.yaml │ │ │ ├── truthfulqa_va/ │ │ │ │ ├── truthfulqa_va.yaml │ │ │ │ └── utils.py │ │ │ ├── utils.py │ │ │ ├── wnli_ca.yaml │ │ │ ├── xnli_ca.yaml │ │ │ ├── xnli_va.yaml │ │ │ ├── xquad_ca.yaml │ │ │ └── xstorycloze_ca.yaml │ │ ├── ceval/ │ │ │ ├── README.md │ │ │ ├── _ceval-valid.yaml │ │ │ ├── _default_ceval_yaml │ │ │ ├── _generate_configs.py │ │ │ ├── ceval-valid_accountant.yaml │ │ │ ├── ceval-valid_advanced_mathematics.yaml │ │ │ ├── ceval-valid_art_studies.yaml │ │ │ ├── ceval-valid_basic_medicine.yaml │ │ │ ├── ceval-valid_business_administration.yaml │ │ │ ├── ceval-valid_chinese_language_and_literature.yaml │ │ │ ├── ceval-valid_civil_servant.yaml │ │ │ ├── ceval-valid_clinical_medicine.yaml │ │ │ ├── ceval-valid_college_chemistry.yaml │ │ │ ├── ceval-valid_college_economics.yaml │ │ │ ├── ceval-valid_college_physics.yaml │ │ │ ├── ceval-valid_college_programming.yaml │ │ │ ├── ceval-valid_computer_architecture.yaml │ │ │ ├── ceval-valid_computer_network.yaml │ │ │ ├── ceval-valid_discrete_mathematics.yaml │ │ │ ├── ceval-valid_education_science.yaml │ │ │ ├── ceval-valid_electrical_engineer.yaml │ │ │ ├── ceval-valid_environmental_impact_assessment_engineer.yaml │ │ │ ├── ceval-valid_fire_engineer.yaml │ │ │ ├── ceval-valid_high_school_biology.yaml │ │ │ ├── ceval-valid_high_school_chemistry.yaml │ │ │ ├── ceval-valid_high_school_chinese.yaml │ │ │ ├── ceval-valid_high_school_geography.yaml │ │ │ ├── ceval-valid_high_school_history.yaml │ │ │ ├── ceval-valid_high_school_mathematics.yaml │ │ │ ├── ceval-valid_high_school_physics.yaml │ │ │ ├── ceval-valid_high_school_politics.yaml │ │ │ ├── ceval-valid_ideological_and_moral_cultivation.yaml │ │ │ ├── ceval-valid_law.yaml │ │ │ ├── 
ceval-valid_legal_professional.yaml │ │ │ ├── ceval-valid_logic.yaml │ │ │ ├── ceval-valid_mao_zedong_thought.yaml │ │ │ ├── ceval-valid_marxism.yaml │ │ │ ├── ceval-valid_metrology_engineer.yaml │ │ │ ├── ceval-valid_middle_school_biology.yaml │ │ │ ├── ceval-valid_middle_school_chemistry.yaml │ │ │ ├── ceval-valid_middle_school_geography.yaml │ │ │ ├── ceval-valid_middle_school_history.yaml │ │ │ ├── ceval-valid_middle_school_mathematics.yaml │ │ │ ├── ceval-valid_middle_school_physics.yaml │ │ │ ├── ceval-valid_middle_school_politics.yaml │ │ │ ├── ceval-valid_modern_chinese_history.yaml │ │ │ ├── ceval-valid_operating_system.yaml │ │ │ ├── ceval-valid_physician.yaml │ │ │ ├── ceval-valid_plant_protection.yaml │ │ │ ├── ceval-valid_probability_and_statistics.yaml │ │ │ ├── ceval-valid_professional_tour_guide.yaml │ │ │ ├── ceval-valid_sports_science.yaml │ │ │ ├── ceval-valid_tax_accountant.yaml │ │ │ ├── ceval-valid_teacher_qualification.yaml │ │ │ ├── ceval-valid_urban_and_rural_planner.yaml │ │ │ └── ceval-valid_veterinary_medicine.yaml │ │ ├── chartqa/ │ │ │ ├── README.md │ │ │ ├── chartqa.yaml │ │ │ ├── chartqa_llama.yaml │ │ │ ├── chartqa_llama_90.yaml │ │ │ └── utils.py │ │ ├── click/ │ │ │ ├── README.md │ │ │ ├── click.yaml │ │ │ ├── click_cul/ │ │ │ │ ├── _click_cul.yaml │ │ │ │ ├── _default_click_cul_yaml │ │ │ │ ├── click_cul_economy.yaml │ │ │ │ ├── click_cul_geography.yaml │ │ │ │ ├── click_cul_history.yaml │ │ │ │ ├── click_cul_kpop.yaml │ │ │ │ ├── click_cul_law.yaml │ │ │ │ ├── click_cul_politics.yaml │ │ │ │ ├── click_cul_society.yaml │ │ │ │ ├── click_cul_tradition.yaml │ │ │ │ └── utils.py │ │ │ └── click_lang/ │ │ │ ├── _click_lang.yaml │ │ │ ├── _default_click_lang_yaml │ │ │ ├── click_lang_function.yaml │ │ │ ├── click_lang_grammar.yaml │ │ │ ├── click_lang_text.yaml │ │ │ └── utils.py │ │ ├── cmmlu/ │ │ │ ├── README.md │ │ │ ├── _cmmlu.yaml │ │ │ ├── _default_template_yaml │ │ │ ├── _generate_configs.py │ │ │ ├── cmmlu_agronomy.yaml │ │ │ 
├── cmmlu_anatomy.yaml │ │ │ ├── cmmlu_ancient_chinese.yaml │ │ │ ├── cmmlu_arts.yaml │ │ │ ├── cmmlu_astronomy.yaml │ │ │ ├── cmmlu_business_ethics.yaml │ │ │ ├── cmmlu_chinese_civil_service_exam.yaml │ │ │ ├── cmmlu_chinese_driving_rule.yaml │ │ │ ├── cmmlu_chinese_food_culture.yaml │ │ │ ├── cmmlu_chinese_foreign_policy.yaml │ │ │ ├── cmmlu_chinese_history.yaml │ │ │ ├── cmmlu_chinese_literature.yaml │ │ │ ├── cmmlu_chinese_teacher_qualification.yaml │ │ │ ├── cmmlu_clinical_knowledge.yaml │ │ │ ├── cmmlu_college_actuarial_science.yaml │ │ │ ├── cmmlu_college_education.yaml │ │ │ ├── cmmlu_college_engineering_hydrology.yaml │ │ │ ├── cmmlu_college_law.yaml │ │ │ ├── cmmlu_college_mathematics.yaml │ │ │ ├── cmmlu_college_medical_statistics.yaml │ │ │ ├── cmmlu_college_medicine.yaml │ │ │ ├── cmmlu_computer_science.yaml │ │ │ ├── cmmlu_computer_security.yaml │ │ │ ├── cmmlu_conceptual_physics.yaml │ │ │ ├── cmmlu_construction_project_management.yaml │ │ │ ├── cmmlu_economics.yaml │ │ │ ├── cmmlu_education.yaml │ │ │ ├── cmmlu_electrical_engineering.yaml │ │ │ ├── cmmlu_elementary_chinese.yaml │ │ │ ├── cmmlu_elementary_commonsense.yaml │ │ │ ├── cmmlu_elementary_information_and_technology.yaml │ │ │ ├── cmmlu_elementary_mathematics.yaml │ │ │ ├── cmmlu_ethnology.yaml │ │ │ ├── cmmlu_food_science.yaml │ │ │ ├── cmmlu_genetics.yaml │ │ │ ├── cmmlu_global_facts.yaml │ │ │ ├── cmmlu_high_school_biology.yaml │ │ │ ├── cmmlu_high_school_chemistry.yaml │ │ │ ├── cmmlu_high_school_geography.yaml │ │ │ ├── cmmlu_high_school_mathematics.yaml │ │ │ ├── cmmlu_high_school_physics.yaml │ │ │ ├── cmmlu_high_school_politics.yaml │ │ │ ├── cmmlu_human_sexuality.yaml │ │ │ ├── cmmlu_international_law.yaml │ │ │ ├── cmmlu_journalism.yaml │ │ │ ├── cmmlu_jurisprudence.yaml │ │ │ ├── cmmlu_legal_and_moral_basis.yaml │ │ │ ├── cmmlu_logical.yaml │ │ │ ├── cmmlu_machine_learning.yaml │ │ │ ├── cmmlu_management.yaml │ │ │ ├── cmmlu_marketing.yaml │ │ │ ├── cmmlu_marxist_theory.yaml │ │ │ 
├── cmmlu_modern_chinese.yaml │ │ │ ├── cmmlu_nutrition.yaml │ │ │ ├── cmmlu_philosophy.yaml │ │ │ ├── cmmlu_professional_accounting.yaml │ │ │ ├── cmmlu_professional_law.yaml │ │ │ ├── cmmlu_professional_medicine.yaml │ │ │ ├── cmmlu_professional_psychology.yaml │ │ │ ├── cmmlu_public_relations.yaml │ │ │ ├── cmmlu_security_study.yaml │ │ │ ├── cmmlu_sociology.yaml │ │ │ ├── cmmlu_sports_science.yaml │ │ │ ├── cmmlu_traditional_chinese_medicine.yaml │ │ │ ├── cmmlu_virology.yaml │ │ │ ├── cmmlu_world_history.yaml │ │ │ └── cmmlu_world_religions.yaml │ │ ├── cnn_dailymail/ │ │ │ ├── README.md │ │ │ ├── cnn_dailymail.yaml │ │ │ └── utils.py │ │ ├── code_x_glue/ │ │ │ └── code-text/ │ │ │ ├── README.md │ │ │ ├── _codexglue.yaml │ │ │ ├── _default_template_yaml │ │ │ ├── bleu.py │ │ │ ├── go.yaml │ │ │ ├── java.yaml │ │ │ ├── javascript.yaml │ │ │ ├── php.yaml │ │ │ ├── python.yaml │ │ │ ├── ruby.yaml │ │ │ └── utils.py │ │ ├── common_voice/ │ │ │ ├── common_voice_en.yaml │ │ │ └── utils.py │ │ ├── commonsense_qa/ │ │ │ ├── README.md │ │ │ └── default.yaml │ │ ├── copal_id/ │ │ │ ├── README.md │ │ │ ├── colloquial.yaml │ │ │ ├── standard.yaml │ │ │ └── utils.py │ │ ├── coqa/ │ │ │ ├── README.md │ │ │ ├── default.yaml │ │ │ └── utils.py │ │ ├── crows_pairs/ │ │ │ ├── README.md │ │ │ ├── crows_pairs_english.yaml │ │ │ ├── crows_pairs_english_age.yaml │ │ │ ├── crows_pairs_english_autre.yaml │ │ │ ├── crows_pairs_english_disability.yaml │ │ │ ├── crows_pairs_english_gender.yaml │ │ │ ├── crows_pairs_english_nationality.yaml │ │ │ ├── crows_pairs_english_physical_appearance.yaml │ │ │ ├── crows_pairs_english_race_color.yaml │ │ │ ├── crows_pairs_english_religion.yaml │ │ │ ├── crows_pairs_english_sexual_orientation.yaml │ │ │ ├── crows_pairs_english_socioeconomic.yaml │ │ │ ├── crows_pairs_french.yaml │ │ │ ├── crows_pairs_french_age.yaml │ │ │ ├── crows_pairs_french_autre.yaml │ │ │ ├── crows_pairs_french_disability.yaml │ │ │ ├── crows_pairs_french_gender.yaml │ │ │ ├── 
crows_pairs_french_nationality.yaml │ │ │ ├── crows_pairs_french_physical_appearance.yaml │ │ │ ├── crows_pairs_french_race_color.yaml │ │ │ ├── crows_pairs_french_religion.yaml │ │ │ ├── crows_pairs_french_sexual_orientation.yaml │ │ │ ├── crows_pairs_french_socioeconomic.yaml │ │ │ └── utils.py │ │ ├── csatqa/ │ │ │ ├── _csatqa.yaml │ │ │ ├── _default_csatqa_yaml │ │ │ ├── _generate_configs.py │ │ │ ├── csatqa_gr.yaml │ │ │ ├── csatqa_li.yaml │ │ │ ├── csatqa_rch.yaml │ │ │ ├── csatqa_rcs.yaml │ │ │ ├── csatqa_rcss.yaml │ │ │ ├── csatqa_wr.yaml │ │ │ └── utils.py │ │ ├── darija_bench/ │ │ │ ├── README.md │ │ │ ├── darija_sentiment/ │ │ │ │ ├── README.md │ │ │ │ ├── darija_sentiment.yaml │ │ │ │ ├── darija_sentiment_electrom.yaml │ │ │ │ ├── darija_sentiment_mac.yaml │ │ │ │ ├── darija_sentiment_msac.yaml │ │ │ │ ├── darija_sentiment_msda.yaml │ │ │ │ ├── darija_sentiment_myc.yaml │ │ │ │ ├── default_darija_sentiment_template_yaml │ │ │ │ └── utils.py │ │ │ ├── darija_summarization/ │ │ │ │ ├── README.md │ │ │ │ ├── summarization.yaml │ │ │ │ ├── summarization_common_yaml │ │ │ │ ├── summarization_darija.yaml │ │ │ │ └── utils.py │ │ │ ├── darija_translation/ │ │ │ │ ├── README.md │ │ │ │ ├── doda_common_yaml │ │ │ │ ├── doda_translation_all.yaml │ │ │ │ ├── doda_translation_darija.yaml │ │ │ │ ├── doda_translation_dr_en.yaml │ │ │ │ ├── doda_translation_dr_fr.yaml │ │ │ │ ├── doda_translation_dr_msa.yaml │ │ │ │ ├── doda_translation_en_dr.yaml │ │ │ │ ├── doda_translation_fr_dr.yaml │ │ │ │ ├── doda_translation_msa_dr.yaml │ │ │ │ ├── flores_common_yaml │ │ │ │ ├── flores_translation_all.yaml │ │ │ │ ├── flores_translation_darija.yaml │ │ │ │ ├── flores_translation_dr_en.yaml │ │ │ │ ├── flores_translation_dr_fr.yaml │ │ │ │ ├── flores_translation_dr_msa.yaml │ │ │ │ ├── flores_translation_en_dr.yaml │ │ │ │ ├── flores_translation_fr_dr.yaml │ │ │ │ ├── flores_translation_msa_dr.yaml │ │ │ │ ├── madar_common_yaml │ │ │ │ ├── madar_translation_all.yaml │ │ │ │ ├── 
madar_translation_darija.yaml │ │ │ │ ├── madar_translation_dr_msa.yaml │ │ │ │ ├── madar_translation_msa_dr.yaml │ │ │ │ ├── seed_common_yaml │ │ │ │ ├── seed_translation_all.yaml │ │ │ │ ├── seed_translation_darija.yaml │ │ │ │ ├── seed_translation_dr_en.yaml │ │ │ │ ├── seed_translation_en_dr.yaml │ │ │ │ ├── translation_common_yaml │ │ │ │ ├── translation_darija.yaml │ │ │ │ └── utils.py │ │ │ └── darija_transliteration/ │ │ │ ├── README.md │ │ │ ├── translation_ar_dr.yaml │ │ │ ├── translation_dr_ar.yaml │ │ │ ├── transliteration_all.yaml │ │ │ ├── transliteration_common_yaml │ │ │ ├── transliteration_darija.yaml │ │ │ └── utils.py │ │ ├── darijahellaswag/ │ │ │ ├── README.md │ │ │ ├── darijahellaswag.yaml │ │ │ └── utils.py │ │ ├── darijammlu/ │ │ │ ├── README.md │ │ │ ├── _darijammlu.yaml │ │ │ ├── _darijammlu_ar_mmlu.yaml │ │ │ ├── _darijammlu_mmlu.yaml │ │ │ ├── _default_darijammlu_template_yaml │ │ │ ├── _generate_configs.py │ │ │ ├── darijammlu_accounting.yaml │ │ │ ├── darijammlu_arabic_language.yaml │ │ │ ├── darijammlu_arabic_language_(general).yaml │ │ │ ├── darijammlu_arabic_language_(grammar).yaml │ │ │ ├── darijammlu_biology.yaml │ │ │ ├── darijammlu_civics.yaml │ │ │ ├── darijammlu_computer_science.yaml │ │ │ ├── darijammlu_driving_test.yaml │ │ │ ├── darijammlu_economics.yaml │ │ │ ├── darijammlu_general_knowledge.yaml │ │ │ ├── darijammlu_geography.yaml │ │ │ ├── darijammlu_global_facts.yaml │ │ │ ├── darijammlu_high_school_european_history.yaml │ │ │ ├── darijammlu_high_school_geography.yaml │ │ │ ├── darijammlu_high_school_government_and_politics.yaml │ │ │ ├── darijammlu_high_school_psychology.yaml │ │ │ ├── darijammlu_high_school_statistics.yaml │ │ │ ├── darijammlu_high_school_world_history.yaml │ │ │ ├── darijammlu_history.yaml │ │ │ ├── darijammlu_human_aging.yaml │ │ │ ├── darijammlu_international_law.yaml │ │ │ ├── darijammlu_islamic_studies.yaml │ │ │ ├── darijammlu_jurisprudence.yaml │ │ │ ├── darijammlu_law.yaml │ │ │ ├── 
darijammlu_logical_fallacies.yaml │ │ │ ├── darijammlu_management.yaml │ │ │ ├── darijammlu_management_ar.yaml │ │ │ ├── darijammlu_marketing.yaml │ │ │ ├── darijammlu_math.yaml │ │ │ ├── darijammlu_moral_disputes.yaml │ │ │ ├── darijammlu_moral_scenarios.yaml │ │ │ ├── darijammlu_natural_science.yaml │ │ │ ├── darijammlu_nutrition.yaml │ │ │ ├── darijammlu_philosophy.yaml │ │ │ ├── darijammlu_philosophy_ar.yaml │ │ │ ├── darijammlu_physics.yaml │ │ │ ├── darijammlu_political_science.yaml │ │ │ ├── darijammlu_professional_law.yaml │ │ │ ├── darijammlu_professional_psychology.yaml │ │ │ ├── darijammlu_public_relations.yaml │ │ │ ├── darijammlu_security_studies.yaml │ │ │ ├── darijammlu_social_science.yaml │ │ │ ├── darijammlu_sociology.yaml │ │ │ ├── darijammlu_world_religions.yaml │ │ │ └── utils.py │ │ ├── discrim_eval/ │ │ │ ├── README.md │ │ │ ├── discrim_eval_explicit.yaml │ │ │ ├── discrim_eval_implicit.yaml │ │ │ └── utils.py │ │ ├── drop/ │ │ │ ├── README.md │ │ │ ├── default.yaml │ │ │ └── utils.py │ │ ├── e2lmc/ │ │ │ ├── mmlu_early_training/ │ │ │ │ ├── README.md │ │ │ │ ├── custom_metrics.py │ │ │ │ └── mmlu_early_training.yaml │ │ │ ├── noor/ │ │ │ │ ├── README.md │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── _noor.yaml │ │ │ │ ├── _noor_stem.yaml │ │ │ │ ├── _noor_template │ │ │ │ ├── noor_abstract_algebra.yaml │ │ │ │ ├── noor_college_computer_science.yaml │ │ │ │ ├── noor_college_mathematics.yaml │ │ │ │ ├── noor_college_physics.yaml │ │ │ │ ├── noor_conceptual_physics.yaml │ │ │ │ ├── noor_electrical_engineering.yaml │ │ │ │ ├── noor_elementary_mathematics.yaml │ │ │ │ ├── noor_high_school_computer_science.yaml │ │ │ │ ├── noor_high_school_mathematics.yaml │ │ │ │ ├── noor_high_school_physics.yaml │ │ │ │ ├── noor_high_school_statistics.yaml │ │ │ │ └── noor_machine_learning.yaml │ │ │ └── sciknoweval_mcqa/ │ │ │ ├── README.md │ │ │ ├── _sciknoweval_mcqa.yaml │ │ │ ├── _var5shots_template_yaml │ │ │ ├── sciknoweval_Biology.yaml │ │ │ ├── 
sciknoweval_Chemistry.yaml │ │ │ ├── sciknoweval_Material.yaml │ │ │ └── sciknoweval_Physics.yaml │ │ ├── egyhellaswag/ │ │ │ ├── README.md │ │ │ ├── egyhellaswag.yaml │ │ │ └── utils.py │ │ ├── egymmlu/ │ │ │ ├── README.md │ │ │ ├── _default_egymmlu_template_yaml │ │ │ ├── _egymmlu.yaml │ │ │ ├── _egymmlu_ar_mmlu.yaml │ │ │ ├── _egymmlu_mmlu.yaml │ │ │ ├── _generate_configs.py │ │ │ ├── egymmlu_accounting.yaml │ │ │ ├── egymmlu_arabic_language.yaml │ │ │ ├── egymmlu_arabic_language_(general).yaml │ │ │ ├── egymmlu_arabic_language_(grammar).yaml │ │ │ ├── egymmlu_biology.yaml │ │ │ ├── egymmlu_civics.yaml │ │ │ ├── egymmlu_computer_science.yaml │ │ │ ├── egymmlu_driving_test.yaml │ │ │ ├── egymmlu_economics.yaml │ │ │ ├── egymmlu_general_knowledge.yaml │ │ │ ├── egymmlu_geography.yaml │ │ │ ├── egymmlu_global_facts.yaml │ │ │ ├── egymmlu_high_school_european_history.yaml │ │ │ ├── egymmlu_high_school_geography.yaml │ │ │ ├── egymmlu_high_school_government_and_politics.yaml │ │ │ ├── egymmlu_high_school_psychology.yaml │ │ │ ├── egymmlu_high_school_statistics.yaml │ │ │ ├── egymmlu_high_school_world_history.yaml │ │ │ ├── egymmlu_history.yaml │ │ │ ├── egymmlu_human_aging.yaml │ │ │ ├── egymmlu_international_law.yaml │ │ │ ├── egymmlu_islamic_studies.yaml │ │ │ ├── egymmlu_jurisprudence.yaml │ │ │ ├── egymmlu_law.yaml │ │ │ ├── egymmlu_logical_fallacies.yaml │ │ │ ├── egymmlu_management.yaml │ │ │ ├── egymmlu_management_ar.yaml │ │ │ ├── egymmlu_marketing.yaml │ │ │ ├── egymmlu_math.yaml │ │ │ ├── egymmlu_moral_disputes.yaml │ │ │ ├── egymmlu_moral_scenarios.yaml │ │ │ ├── egymmlu_natural_science.yaml │ │ │ ├── egymmlu_nutrition.yaml │ │ │ ├── egymmlu_philosophy.yaml │ │ │ ├── egymmlu_philosophy_ar.yaml │ │ │ ├── egymmlu_physics.yaml │ │ │ ├── egymmlu_political_science.yaml │ │ │ ├── egymmlu_professional_law.yaml │ │ │ ├── egymmlu_professional_psychology.yaml │ │ │ ├── egymmlu_public_relations.yaml │ │ │ ├── egymmlu_security_studies.yaml │ │ │ ├── 
egymmlu_social_science.yaml │ │ │ ├── egymmlu_sociology.yaml │ │ │ ├── egymmlu_world_religions.yaml │ │ │ └── utils.py │ │ ├── eq_bench/ │ │ │ ├── README.md │ │ │ ├── default.yaml │ │ │ ├── multilingual/ │ │ │ │ ├── eqbench_ca.yaml │ │ │ │ ├── eqbench_es.yaml │ │ │ │ └── utils.py │ │ │ └── utils.py │ │ ├── esbbq/ │ │ │ ├── README.md │ │ │ ├── _esbbq_common_yaml │ │ │ ├── esbbq.yaml │ │ │ ├── esbbq_age.yaml │ │ │ ├── esbbq_disability_status.yaml │ │ │ ├── esbbq_gender.yaml │ │ │ ├── esbbq_lgbtqia.yaml │ │ │ ├── esbbq_nationality.yaml │ │ │ ├── esbbq_physical_appearance.yaml │ │ │ ├── esbbq_race_ethnicity.yaml │ │ │ ├── esbbq_religion.yaml │ │ │ ├── esbbq_ses.yaml │ │ │ ├── esbbq_spanish_region.yaml │ │ │ └── utils.py │ │ ├── eus_exams/ │ │ │ ├── README.md │ │ │ ├── configs.py │ │ │ ├── eus_exams │ │ │ ├── eus_exams_es │ │ │ ├── eus_exams_es_ejadministrativo.yaml │ │ │ ├── eus_exams_es_ejauxiliar.yaml │ │ │ ├── eus_exams_es_ejsubalterno.yaml │ │ │ ├── eus_exams_es_ejtecnico.yaml │ │ │ ├── eus_exams_es_opeayuntamientovitoria.yaml │ │ │ ├── eus_exams_es_opebilbao.yaml │ │ │ ├── eus_exams_es_opeehuadmin.yaml │ │ │ ├── eus_exams_es_opeehuaux.yaml │ │ │ ├── eus_exams_es_opeehubiblio.yaml │ │ │ ├── eus_exams_es_opeehuderecho.yaml │ │ │ ├── eus_exams_es_opeehueconomicas.yaml │ │ │ ├── eus_exams_es_opeehuempresariales.yaml │ │ │ ├── eus_exams_es_opeehusubalterno.yaml │ │ │ ├── eus_exams_es_opeehutecnico.yaml │ │ │ ├── eus_exams_es_opeehutecnicob.yaml │ │ │ ├── eus_exams_es_opeosakiadmin.yaml │ │ │ ├── eus_exams_es_opeosakiaux.yaml │ │ │ ├── eus_exams_es_opeosakiauxenf.yaml │ │ │ ├── eus_exams_es_opeosakicelador.yaml │ │ │ ├── eus_exams_es_opeosakienf.yaml │ │ │ ├── eus_exams_es_opeosakijuridico.yaml │ │ │ ├── eus_exams_es_opeosakioperario.yaml │ │ │ ├── eus_exams_es_opeosakitecnico.yaml │ │ │ ├── eus_exams_es_opeosakivarios.yaml │ │ │ ├── eus_exams_es_osakidetza1c.yaml │ │ │ ├── eus_exams_es_osakidetza2c.yaml │ │ │ ├── eus_exams_es_osakidetza3c.yaml │ │ │ ├── 
eus_exams_es_osakidetza4c.yaml │ │ │ ├── eus_exams_es_osakidetza5c.yaml │ │ │ ├── eus_exams_es_osakidetza6c.yaml │ │ │ ├── eus_exams_es_osakidetza7c.yaml │ │ │ ├── eus_exams_es_osakidetza8c.yaml │ │ │ ├── eus_exams_es_osakidetza9c.yaml │ │ │ ├── eus_exams_eu │ │ │ ├── eus_exams_eu_ejadministrari.yaml │ │ │ ├── eus_exams_eu_ejlaguntza.yaml │ │ │ ├── eus_exams_eu_ejlaguntzaile.yaml │ │ │ ├── eus_exams_eu_ejteknikari.yaml │ │ │ ├── eus_exams_eu_opebilbaoeu.yaml │ │ │ ├── eus_exams_eu_opeehuadmineu.yaml │ │ │ ├── eus_exams_eu_opeehuauxeu.yaml │ │ │ ├── eus_exams_eu_opeehubiblioeu.yaml │ │ │ ├── eus_exams_eu_opeehuderechoeu.yaml │ │ │ ├── eus_exams_eu_opeehueconomicaseu.yaml │ │ │ ├── eus_exams_eu_opeehuempresarialeseu.yaml │ │ │ ├── eus_exams_eu_opeehusubalternoeu.yaml │ │ │ ├── eus_exams_eu_opeehutecnicoeu.yaml │ │ │ ├── eus_exams_eu_opeehuteknikarib.yaml │ │ │ ├── eus_exams_eu_opegasteizkoudala.yaml │ │ │ ├── eus_exams_eu_opeosakiadmineu.yaml │ │ │ ├── eus_exams_eu_opeosakiauxenfeu.yaml │ │ │ ├── eus_exams_eu_opeosakiauxeu.yaml │ │ │ ├── eus_exams_eu_opeosakiceladoreu.yaml │ │ │ ├── eus_exams_eu_opeosakienfeu.yaml │ │ │ ├── eus_exams_eu_opeosakioperarioeu.yaml │ │ │ ├── eus_exams_eu_opeosakitecnicoeu.yaml │ │ │ ├── eus_exams_eu_opeosakivarioseu.yaml │ │ │ ├── eus_exams_eu_osakidetza1e.yaml │ │ │ ├── eus_exams_eu_osakidetza2e.yaml │ │ │ ├── eus_exams_eu_osakidetza3e.yaml │ │ │ ├── eus_exams_eu_osakidetza5e.yaml │ │ │ ├── eus_exams_eu_osakidetza6e.yaml │ │ │ ├── eus_exams_eu_osakidetza7e.yaml │ │ │ └── utils.py │ │ ├── eus_proficiency/ │ │ │ ├── README.md │ │ │ └── eus_proficiency.yaml │ │ ├── eus_reading/ │ │ │ ├── README.md │ │ │ ├── eus_reading.yaml │ │ │ └── utils.py │ │ ├── eus_trivia/ │ │ │ ├── README.md │ │ │ ├── eus_trivia.yaml │ │ │ └── utils.py │ │ ├── evalita_llm/ │ │ │ ├── README.md │ │ │ ├── _at_template_yaml │ │ │ ├── _evalita-mp.yaml │ │ │ ├── _evalita-mp_at_task_p1.yaml │ │ │ ├── _evalita-mp_at_task_p2.yaml │ │ │ ├── _evalita-mp_at_task_p3.yaml │ │ │ 
├── _evalita-mp_at_task_p4.yaml │ │ │ ├── _evalita-mp_at_task_p5.yaml │ │ │ ├── _evalita-mp_at_task_p6.yaml │ │ │ ├── _evalita-mp_at_tasks.yaml │ │ │ ├── _evalita-mp_faq_p1.yaml │ │ │ ├── _evalita-mp_faq_p2.yaml │ │ │ ├── _evalita-mp_faq_p3.yaml │ │ │ ├── _evalita-mp_faq_p4.yaml │ │ │ ├── _evalita-mp_faq_p5.yaml │ │ │ ├── _evalita-mp_faq_p6.yaml │ │ │ ├── _evalita-mp_faq_tasks.yaml │ │ │ ├── _evalita-mp_gen.yaml │ │ │ ├── _evalita-mp_hs_p1.yaml │ │ │ ├── _evalita-mp_hs_p2.yaml │ │ │ ├── _evalita-mp_hs_p3.yaml │ │ │ ├── _evalita-mp_hs_p4.yaml │ │ │ ├── _evalita-mp_hs_p5.yaml │ │ │ ├── _evalita-mp_hs_p6.yaml │ │ │ ├── _evalita-mp_hs_task.yaml │ │ │ ├── _evalita-mp_ls_p1.yaml │ │ │ ├── _evalita-mp_ls_p2.yaml │ │ │ ├── _evalita-mp_ls_task.yaml │ │ │ ├── _evalita-mp_mc.yaml │ │ │ ├── _evalita-mp_ner-adg_group.yaml │ │ │ ├── _evalita-mp_ner-adg_group_p1.yaml │ │ │ ├── _evalita-mp_ner-adg_group_p2.yaml │ │ │ ├── _evalita-mp_ner-fic_group.yaml │ │ │ ├── _evalita-mp_ner-fic_group_p1.yaml │ │ │ ├── _evalita-mp_ner-fic_group_p2.yaml │ │ │ ├── _evalita-mp_ner-wn_group.yaml │ │ │ ├── _evalita-mp_ner-wn_group_p1.yaml │ │ │ ├── _evalita-mp_ner-wn_group_p2.yaml │ │ │ ├── _evalita-mp_ner_adg │ │ │ ├── _evalita-mp_ner_adg_p1.yaml │ │ │ ├── _evalita-mp_ner_adg_p2.yaml │ │ │ ├── _evalita-mp_ner_fic │ │ │ ├── _evalita-mp_ner_fic_p1.yaml │ │ │ ├── _evalita-mp_ner_fic_p2.yaml │ │ │ ├── _evalita-mp_ner_group.yaml │ │ │ ├── _evalita-mp_ner_wn │ │ │ ├── _evalita-mp_ner_wn_p1.yaml │ │ │ ├── _evalita-mp_ner_wn_p2.yaml │ │ │ ├── _evalita-mp_re_p1.yaml │ │ │ ├── _evalita-mp_re_p2.yaml │ │ │ ├── _evalita-mp_re_task.yaml │ │ │ ├── _evalita-mp_sa_p1.yaml │ │ │ ├── _evalita-mp_sa_p2.yaml │ │ │ ├── _evalita-mp_sa_p3.yaml │ │ │ ├── _evalita-mp_sa_p4.yaml │ │ │ ├── _evalita-mp_sa_p5.yaml │ │ │ ├── _evalita-mp_sa_p6.yaml │ │ │ ├── _evalita-mp_sa_tasks.yaml │ │ │ ├── _evalita-mp_sum_fp-small_p1.yaml │ │ │ ├── _evalita-mp_sum_fp-small_p2.yaml │ │ │ ├── _evalita-mp_sum_fp-small_task.yaml │ │ │ ├── 
_evalita-mp_sum_fp_p1.yaml │ │ │ ├── _evalita-mp_sum_fp_p2.yaml │ │ │ ├── _evalita-mp_sum_fp_task.yaml │ │ │ ├── _evalita-mp_te_p1.yaml │ │ │ ├── _evalita-mp_te_p2.yaml │ │ │ ├── _evalita-mp_te_p3.yaml │ │ │ ├── _evalita-mp_te_p4.yaml │ │ │ ├── _evalita-mp_te_p5.yaml │ │ │ ├── _evalita-mp_te_p6.yaml │ │ │ ├── _evalita-mp_te_tasks.yaml │ │ │ ├── _evalita-mp_wic_p1.yaml │ │ │ ├── _evalita-mp_wic_p2.yaml │ │ │ ├── _evalita-mp_wic_p3.yaml │ │ │ ├── _evalita-mp_wic_p4.yaml │ │ │ ├── _evalita-mp_wic_p5.yaml │ │ │ ├── _evalita-mp_wic_p6.yaml │ │ │ ├── _evalita-mp_wic_tasks.yaml │ │ │ ├── _faq_template_yaml │ │ │ ├── _hs_template_yaml │ │ │ ├── _ls_template_yaml │ │ │ ├── _ner_template_yaml │ │ │ ├── _re_template_yaml │ │ │ ├── _sa_template_v2_yaml │ │ │ ├── _sa_template_yaml │ │ │ ├── _sum_template_fp-small_yaml │ │ │ ├── _sum_template_fp_yaml │ │ │ ├── _sum_template_yaml │ │ │ ├── _te_template_yaml │ │ │ ├── _wic_template_yaml │ │ │ ├── metrics.py │ │ │ ├── sum_utils.py │ │ │ └── utils.py │ │ ├── fda/ │ │ │ ├── README.md │ │ │ ├── fda.yaml │ │ │ └── task.py │ │ ├── fld/ │ │ │ ├── README.md │ │ │ ├── fld_default.yaml │ │ │ ├── fld_logical_formula_default.yaml │ │ │ ├── fld_logical_formula_star.yaml │ │ │ └── fld_star.yaml │ │ ├── french_bench/ │ │ │ ├── README.md │ │ │ ├── _default_template_yaml │ │ │ ├── french_bench_arc_challenge.yaml │ │ │ ├── french_bench_boolqa.yaml │ │ │ ├── french_bench_fquadv2.yaml │ │ │ ├── french_bench_fquadv2_bool.yaml │ │ │ ├── french_bench_fquadv2_genq.yaml │ │ │ ├── french_bench_fquadv2_hasAns.yaml │ │ │ ├── french_bench_grammar.yaml │ │ │ ├── french_bench_hellaswag.yaml │ │ │ ├── french_bench_multifquad.yaml │ │ │ ├── french_bench_opus_perplexity.yaml │ │ │ ├── french_bench_orangesum_abstract.yaml │ │ │ ├── french_bench_orangesum_title.yaml │ │ │ ├── french_bench_reading_comp.yaml │ │ │ ├── french_bench_topic_based_nli.yaml │ │ │ ├── french_bench_trivia.yaml │ │ │ ├── french_bench_vocab.yaml │ │ │ ├── french_bench_wikitext_fr.yaml │ │ │ ├── 
french_bench_xnli.yaml │ │ │ ├── preprocess_wikitext.py │ │ │ └── utils.py │ │ ├── galician_bench/ │ │ │ ├── README.md │ │ │ ├── belebele_glg_Latn.yaml │ │ │ ├── flores_gl/ │ │ │ │ ├── _flores_common_yaml │ │ │ │ ├── create_yamls_flores_gl.py │ │ │ │ ├── flores_ca-gl.yaml │ │ │ │ ├── flores_de-gl.yaml │ │ │ │ ├── flores_en-gl.yaml │ │ │ │ ├── flores_es-gl.yaml │ │ │ │ ├── flores_eu-gl.yaml │ │ │ │ ├── flores_fr-gl.yaml │ │ │ │ ├── flores_gl-ca.yaml │ │ │ │ ├── flores_gl-de.yaml │ │ │ │ ├── flores_gl-en.yaml │ │ │ │ ├── flores_gl-es.yaml │ │ │ │ ├── flores_gl-eu.yaml │ │ │ │ ├── flores_gl-fr.yaml │ │ │ │ ├── flores_gl-it.yaml │ │ │ │ ├── flores_gl-pt.yaml │ │ │ │ ├── flores_gl.yaml │ │ │ │ ├── flores_it-gl.yaml │ │ │ │ └── flores_pt-gl.yaml │ │ │ ├── galcola.yaml │ │ │ ├── galician_bench.yaml │ │ │ ├── mgsm_direct_gl.yaml │ │ │ ├── openbookqa_gl.yaml │ │ │ ├── parafrases_gl.yaml │ │ │ ├── paws_gl.yaml │ │ │ ├── summarization_gl.yaml │ │ │ ├── truthfulqa_gl_gen.yaml │ │ │ ├── truthfulqa_gl_mc1.yaml │ │ │ ├── truthfulqa_gl_mc2.yaml │ │ │ ├── utils.py │ │ │ ├── xnli_gl.yaml │ │ │ └── xstorycloze_gl.yaml │ │ ├── glianorex/ │ │ │ ├── README.md │ │ │ ├── glianorex.yaml │ │ │ ├── glianorex_en.yaml │ │ │ ├── glianorex_fr.yaml │ │ │ └── preprocess_glianorex.py │ │ ├── global_mmlu/ │ │ │ ├── README.md │ │ │ ├── default/ │ │ │ │ ├── ar/ │ │ │ │ │ ├── _ar_template_yaml │ │ │ │ │ ├── _global_mmlu_ar.yaml │ │ │ │ │ ├── global_mmlu_ar_business.yaml │ │ │ │ │ ├── global_mmlu_ar_humanities.yaml │ │ │ │ │ ├── global_mmlu_ar_medical.yaml │ │ │ │ │ ├── global_mmlu_ar_other.yaml │ │ │ │ │ ├── global_mmlu_ar_social_sciences.yaml │ │ │ │ │ ├── global_mmlu_ar_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── bn/ │ │ │ │ │ ├── _bn_template_yaml │ │ │ │ │ ├── _global_mmlu_bn.yaml │ │ │ │ │ ├── global_mmlu_bn_business.yaml │ │ │ │ │ ├── global_mmlu_bn_humanities.yaml │ │ │ │ │ ├── global_mmlu_bn_medical.yaml │ │ │ │ │ ├── global_mmlu_bn_other.yaml │ │ │ │ │ ├── global_mmlu_bn_social_sciences.yaml 
│ │ │ │ │ ├── global_mmlu_bn_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── de/ │ │ │ │ │ ├── _de_template_yaml │ │ │ │ │ ├── _global_mmlu_de.yaml │ │ │ │ │ ├── global_mmlu_de_business.yaml │ │ │ │ │ ├── global_mmlu_de_humanities.yaml │ │ │ │ │ ├── global_mmlu_de_medical.yaml │ │ │ │ │ ├── global_mmlu_de_other.yaml │ │ │ │ │ ├── global_mmlu_de_social_sciences.yaml │ │ │ │ │ ├── global_mmlu_de_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── en/ │ │ │ │ │ ├── _en_template_yaml │ │ │ │ │ ├── _global_mmlu_en.yaml │ │ │ │ │ ├── global_mmlu_en_business.yaml │ │ │ │ │ ├── global_mmlu_en_humanities.yaml │ │ │ │ │ ├── global_mmlu_en_medical.yaml │ │ │ │ │ ├── global_mmlu_en_other.yaml │ │ │ │ │ ├── global_mmlu_en_social_sciences.yaml │ │ │ │ │ ├── global_mmlu_en_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── es/ │ │ │ │ │ ├── _es_template_yaml │ │ │ │ │ ├── _global_mmlu_es.yaml │ │ │ │ │ ├── global_mmlu_es_business.yaml │ │ │ │ │ ├── global_mmlu_es_humanities.yaml │ │ │ │ │ ├── global_mmlu_es_medical.yaml │ │ │ │ │ ├── global_mmlu_es_other.yaml │ │ │ │ │ ├── global_mmlu_es_social_sciences.yaml │ │ │ │ │ ├── global_mmlu_es_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── fr/ │ │ │ │ │ ├── _fr_template_yaml │ │ │ │ │ ├── _global_mmlu_fr.yaml │ │ │ │ │ ├── global_mmlu_fr_business.yaml │ │ │ │ │ ├── global_mmlu_fr_humanities.yaml │ │ │ │ │ ├── global_mmlu_fr_medical.yaml │ │ │ │ │ ├── global_mmlu_fr_other.yaml │ │ │ │ │ ├── global_mmlu_fr_social_sciences.yaml │ │ │ │ │ ├── global_mmlu_fr_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── hi/ │ │ │ │ │ ├── _global_mmlu_hi.yaml │ │ │ │ │ ├── _hi_template_yaml │ │ │ │ │ ├── global_mmlu_hi_business.yaml │ │ │ │ │ ├── global_mmlu_hi_humanities.yaml │ │ │ │ │ ├── global_mmlu_hi_medical.yaml │ │ │ │ │ ├── global_mmlu_hi_other.yaml │ │ │ │ │ ├── global_mmlu_hi_social_sciences.yaml │ │ │ │ │ ├── global_mmlu_hi_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── id/ │ │ │ │ │ ├── _global_mmlu_id.yaml │ │ │ │ │ ├── _id_template_yaml │ │ │ │ │ ├── 
global_mmlu_id_business.yaml │ │ │ │ │ ├── global_mmlu_id_humanities.yaml │ │ │ │ │ ├── global_mmlu_id_medical.yaml │ │ │ │ │ ├── global_mmlu_id_other.yaml │ │ │ │ │ ├── global_mmlu_id_social_sciences.yaml │ │ │ │ │ ├── global_mmlu_id_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── it/ │ │ │ │ │ ├── _global_mmlu_it.yaml │ │ │ │ │ ├── _it_template_yaml │ │ │ │ │ ├── global_mmlu_it_business.yaml │ │ │ │ │ ├── global_mmlu_it_humanities.yaml │ │ │ │ │ ├── global_mmlu_it_medical.yaml │ │ │ │ │ ├── global_mmlu_it_other.yaml │ │ │ │ │ ├── global_mmlu_it_social_sciences.yaml │ │ │ │ │ ├── global_mmlu_it_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── ja/ │ │ │ │ │ ├── _global_mmlu_ja.yaml │ │ │ │ │ ├── _ja_template_yaml │ │ │ │ │ ├── global_mmlu_ja_business.yaml │ │ │ │ │ ├── global_mmlu_ja_humanities.yaml │ │ │ │ │ ├── global_mmlu_ja_medical.yaml │ │ │ │ │ ├── global_mmlu_ja_other.yaml │ │ │ │ │ ├── global_mmlu_ja_social_sciences.yaml │ │ │ │ │ ├── global_mmlu_ja_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── ko/ │ │ │ │ │ ├── _global_mmlu_ko.yaml │ │ │ │ │ ├── _ko_template_yaml │ │ │ │ │ ├── global_mmlu_ko_business.yaml │ │ │ │ │ ├── global_mmlu_ko_humanities.yaml │ │ │ │ │ ├── global_mmlu_ko_medical.yaml │ │ │ │ │ ├── global_mmlu_ko_other.yaml │ │ │ │ │ ├── global_mmlu_ko_social_sciences.yaml │ │ │ │ │ ├── global_mmlu_ko_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── pt/ │ │ │ │ │ ├── _global_mmlu_pt.yaml │ │ │ │ │ ├── _pt_template_yaml │ │ │ │ │ ├── global_mmlu_pt_business.yaml │ │ │ │ │ ├── global_mmlu_pt_humanities.yaml │ │ │ │ │ ├── global_mmlu_pt_medical.yaml │ │ │ │ │ ├── global_mmlu_pt_other.yaml │ │ │ │ │ ├── global_mmlu_pt_social_sciences.yaml │ │ │ │ │ ├── global_mmlu_pt_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── sw/ │ │ │ │ │ ├── _global_mmlu_sw.yaml │ │ │ │ │ ├── _sw_template_yaml │ │ │ │ │ ├── global_mmlu_sw_business.yaml │ │ │ │ │ ├── global_mmlu_sw_humanities.yaml │ │ │ │ │ ├── global_mmlu_sw_medical.yaml │ │ │ │ │ ├── global_mmlu_sw_other.yaml │ │ │ │ │ ├── 
global_mmlu_sw_social_sciences.yaml │ │ │ │ │ ├── global_mmlu_sw_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── yo/ │ │ │ │ │ ├── _global_mmlu_yo.yaml │ │ │ │ │ ├── _yo_template_yaml │ │ │ │ │ ├── global_mmlu_yo_business.yaml │ │ │ │ │ ├── global_mmlu_yo_humanities.yaml │ │ │ │ │ ├── global_mmlu_yo_medical.yaml │ │ │ │ │ ├── global_mmlu_yo_other.yaml │ │ │ │ │ ├── global_mmlu_yo_social_sciences.yaml │ │ │ │ │ ├── global_mmlu_yo_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── zh/ │ │ │ │ ├── _global_mmlu_zh.yaml │ │ │ │ ├── _zh_template_yaml │ │ │ │ ├── global_mmlu_zh_business.yaml │ │ │ │ ├── global_mmlu_zh_humanities.yaml │ │ │ │ ├── global_mmlu_zh_medical.yaml │ │ │ │ ├── global_mmlu_zh_other.yaml │ │ │ │ ├── global_mmlu_zh_social_sciences.yaml │ │ │ │ ├── global_mmlu_zh_stem.yaml │ │ │ │ └── utils.py │ │ │ └── full/ │ │ │ ├── am/ │ │ │ │ ├── _am_template_yaml │ │ │ │ ├── _global_mmlu_full_am.yaml │ │ │ │ ├── _global_mmlu_full_am_humanities.yaml │ │ │ │ ├── _global_mmlu_full_am_other.yaml │ │ │ │ ├── _global_mmlu_full_am_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_am_stem.yaml │ │ │ │ ├── global_mmlu_full_am_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_am_anatomy.yaml │ │ │ │ ├── global_mmlu_full_am_astronomy.yaml │ │ │ │ ├── global_mmlu_full_am_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_am_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_am_college_biology.yaml │ │ │ │ ├── global_mmlu_full_am_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_am_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_am_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_am_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_am_college_physics.yaml │ │ │ │ ├── global_mmlu_full_am_computer_security.yaml │ │ │ │ ├── global_mmlu_full_am_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_am_econometrics.yaml │ │ │ │ ├── global_mmlu_full_am_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_am_elementary_mathematics.yaml │ │ │ │ ├── 
global_mmlu_full_am_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_am_global_facts.yaml │ │ │ │ ├── global_mmlu_full_am_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_am_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_am_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_am_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_am_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_am_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_am_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_am_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_am_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_am_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_am_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_am_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_am_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_am_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_am_human_aging.yaml │ │ │ │ ├── global_mmlu_full_am_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_am_international_law.yaml │ │ │ │ ├── global_mmlu_full_am_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_am_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_am_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_am_management.yaml │ │ │ │ ├── global_mmlu_full_am_marketing.yaml │ │ │ │ ├── global_mmlu_full_am_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_am_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_am_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_am_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_am_nutrition.yaml │ │ │ │ ├── global_mmlu_full_am_philosophy.yaml │ │ │ │ ├── global_mmlu_full_am_prehistory.yaml │ │ │ │ ├── global_mmlu_full_am_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_am_professional_law.yaml │ │ │ │ ├── global_mmlu_full_am_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_am_professional_psychology.yaml │ │ │ │ ├── 
global_mmlu_full_am_public_relations.yaml │ │ │ │ ├── global_mmlu_full_am_security_studies.yaml │ │ │ │ ├── global_mmlu_full_am_sociology.yaml │ │ │ │ ├── global_mmlu_full_am_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_am_virology.yaml │ │ │ │ ├── global_mmlu_full_am_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── ar/ │ │ │ │ ├── _ar_template_yaml │ │ │ │ ├── _global_mmlu_full_ar.yaml │ │ │ │ ├── _global_mmlu_full_ar_humanities.yaml │ │ │ │ ├── _global_mmlu_full_ar_other.yaml │ │ │ │ ├── _global_mmlu_full_ar_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_ar_stem.yaml │ │ │ │ ├── global_mmlu_full_ar_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_ar_anatomy.yaml │ │ │ │ ├── global_mmlu_full_ar_astronomy.yaml │ │ │ │ ├── global_mmlu_full_ar_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_ar_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_ar_college_biology.yaml │ │ │ │ ├── global_mmlu_full_ar_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ar_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ar_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ar_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_ar_college_physics.yaml │ │ │ │ ├── global_mmlu_full_ar_computer_security.yaml │ │ │ │ ├── global_mmlu_full_ar_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_ar_econometrics.yaml │ │ │ │ ├── global_mmlu_full_ar_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_ar_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ar_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_ar_global_facts.yaml │ │ │ │ ├── global_mmlu_full_ar_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_ar_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ar_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ar_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_ar_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_ar_high_school_government_and_politics.yaml │ │ │ │ ├── 
global_mmlu_full_ar_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_ar_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ar_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_ar_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_ar_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_ar_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_ar_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_ar_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_ar_human_aging.yaml │ │ │ │ ├── global_mmlu_full_ar_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_ar_international_law.yaml │ │ │ │ ├── global_mmlu_full_ar_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_ar_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_ar_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_ar_management.yaml │ │ │ │ ├── global_mmlu_full_ar_marketing.yaml │ │ │ │ ├── global_mmlu_full_ar_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_ar_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_ar_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_ar_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_ar_nutrition.yaml │ │ │ │ ├── global_mmlu_full_ar_philosophy.yaml │ │ │ │ ├── global_mmlu_full_ar_prehistory.yaml │ │ │ │ ├── global_mmlu_full_ar_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_ar_professional_law.yaml │ │ │ │ ├── global_mmlu_full_ar_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_ar_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_ar_public_relations.yaml │ │ │ │ ├── global_mmlu_full_ar_security_studies.yaml │ │ │ │ ├── global_mmlu_full_ar_sociology.yaml │ │ │ │ ├── global_mmlu_full_ar_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_ar_virology.yaml │ │ │ │ ├── global_mmlu_full_ar_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── bn/ │ │ │ │ ├── _bn_template_yaml │ │ │ │ ├── _global_mmlu_full_bn.yaml │ │ │ │ ├── _global_mmlu_full_bn_humanities.yaml │ │ │ │ ├── _global_mmlu_full_bn_other.yaml │ │ │ │ ├── 
_global_mmlu_full_bn_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_bn_stem.yaml │ │ │ │ ├── global_mmlu_full_bn_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_bn_anatomy.yaml │ │ │ │ ├── global_mmlu_full_bn_astronomy.yaml │ │ │ │ ├── global_mmlu_full_bn_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_bn_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_bn_college_biology.yaml │ │ │ │ ├── global_mmlu_full_bn_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_bn_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_bn_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_bn_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_bn_college_physics.yaml │ │ │ │ ├── global_mmlu_full_bn_computer_security.yaml │ │ │ │ ├── global_mmlu_full_bn_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_bn_econometrics.yaml │ │ │ │ ├── global_mmlu_full_bn_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_bn_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_bn_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_bn_global_facts.yaml │ │ │ │ ├── global_mmlu_full_bn_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_bn_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_bn_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_bn_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_bn_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_bn_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_bn_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_bn_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_bn_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_bn_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_bn_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_bn_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_bn_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_bn_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_bn_human_aging.yaml │ │ │ │ ├── 
global_mmlu_full_bn_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_bn_international_law.yaml │ │ │ │ ├── global_mmlu_full_bn_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_bn_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_bn_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_bn_management.yaml │ │ │ │ ├── global_mmlu_full_bn_marketing.yaml │ │ │ │ ├── global_mmlu_full_bn_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_bn_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_bn_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_bn_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_bn_nutrition.yaml │ │ │ │ ├── global_mmlu_full_bn_philosophy.yaml │ │ │ │ ├── global_mmlu_full_bn_prehistory.yaml │ │ │ │ ├── global_mmlu_full_bn_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_bn_professional_law.yaml │ │ │ │ ├── global_mmlu_full_bn_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_bn_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_bn_public_relations.yaml │ │ │ │ ├── global_mmlu_full_bn_security_studies.yaml │ │ │ │ ├── global_mmlu_full_bn_sociology.yaml │ │ │ │ ├── global_mmlu_full_bn_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_bn_virology.yaml │ │ │ │ ├── global_mmlu_full_bn_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── cs/ │ │ │ │ ├── _cs_template_yaml │ │ │ │ ├── _global_mmlu_full_cs.yaml │ │ │ │ ├── _global_mmlu_full_cs_humanities.yaml │ │ │ │ ├── _global_mmlu_full_cs_other.yaml │ │ │ │ ├── _global_mmlu_full_cs_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_cs_stem.yaml │ │ │ │ ├── global_mmlu_full_cs_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_cs_anatomy.yaml │ │ │ │ ├── global_mmlu_full_cs_astronomy.yaml │ │ │ │ ├── global_mmlu_full_cs_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_cs_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_cs_college_biology.yaml │ │ │ │ ├── global_mmlu_full_cs_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_cs_college_computer_science.yaml │ │ │ │ ├── 
global_mmlu_full_cs_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_cs_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_cs_college_physics.yaml │ │ │ │ ├── global_mmlu_full_cs_computer_security.yaml │ │ │ │ ├── global_mmlu_full_cs_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_cs_econometrics.yaml │ │ │ │ ├── global_mmlu_full_cs_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_cs_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_cs_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_cs_global_facts.yaml │ │ │ │ ├── global_mmlu_full_cs_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_cs_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_cs_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_cs_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_cs_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_cs_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_cs_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_cs_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_cs_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_cs_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_cs_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_cs_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_cs_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_cs_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_cs_human_aging.yaml │ │ │ │ ├── global_mmlu_full_cs_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_cs_international_law.yaml │ │ │ │ ├── global_mmlu_full_cs_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_cs_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_cs_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_cs_management.yaml │ │ │ │ ├── global_mmlu_full_cs_marketing.yaml │ │ │ │ ├── global_mmlu_full_cs_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_cs_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_cs_moral_disputes.yaml │ │ │ │ ├── 
global_mmlu_full_cs_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_cs_nutrition.yaml │ │ │ │ ├── global_mmlu_full_cs_philosophy.yaml │ │ │ │ ├── global_mmlu_full_cs_prehistory.yaml │ │ │ │ ├── global_mmlu_full_cs_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_cs_professional_law.yaml │ │ │ │ ├── global_mmlu_full_cs_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_cs_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_cs_public_relations.yaml │ │ │ │ ├── global_mmlu_full_cs_security_studies.yaml │ │ │ │ ├── global_mmlu_full_cs_sociology.yaml │ │ │ │ ├── global_mmlu_full_cs_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_cs_virology.yaml │ │ │ │ ├── global_mmlu_full_cs_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── de/ │ │ │ │ ├── _de_template_yaml │ │ │ │ ├── _global_mmlu_full_de.yaml │ │ │ │ ├── _global_mmlu_full_de_humanities.yaml │ │ │ │ ├── _global_mmlu_full_de_other.yaml │ │ │ │ ├── _global_mmlu_full_de_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_de_stem.yaml │ │ │ │ ├── global_mmlu_full_de_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_de_anatomy.yaml │ │ │ │ ├── global_mmlu_full_de_astronomy.yaml │ │ │ │ ├── global_mmlu_full_de_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_de_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_de_college_biology.yaml │ │ │ │ ├── global_mmlu_full_de_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_de_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_de_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_de_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_de_college_physics.yaml │ │ │ │ ├── global_mmlu_full_de_computer_security.yaml │ │ │ │ ├── global_mmlu_full_de_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_de_econometrics.yaml │ │ │ │ ├── global_mmlu_full_de_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_de_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_de_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_de_global_facts.yaml │ │ │ │ 
├── global_mmlu_full_de_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_de_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_de_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_de_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_de_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_de_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_de_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_de_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_de_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_de_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_de_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_de_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_de_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_de_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_de_human_aging.yaml │ │ │ │ ├── global_mmlu_full_de_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_de_international_law.yaml │ │ │ │ ├── global_mmlu_full_de_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_de_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_de_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_de_management.yaml │ │ │ │ ├── global_mmlu_full_de_marketing.yaml │ │ │ │ ├── global_mmlu_full_de_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_de_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_de_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_de_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_de_nutrition.yaml │ │ │ │ ├── global_mmlu_full_de_philosophy.yaml │ │ │ │ ├── global_mmlu_full_de_prehistory.yaml │ │ │ │ ├── global_mmlu_full_de_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_de_professional_law.yaml │ │ │ │ ├── global_mmlu_full_de_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_de_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_de_public_relations.yaml │ │ │ │ ├── global_mmlu_full_de_security_studies.yaml │ │ │ │ ├── 
global_mmlu_full_de_sociology.yaml │ │ │ │ ├── global_mmlu_full_de_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_de_virology.yaml │ │ │ │ ├── global_mmlu_full_de_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── el/ │ │ │ │ ├── _el_template_yaml │ │ │ │ ├── _global_mmlu_full_el.yaml │ │ │ │ ├── _global_mmlu_full_el_humanities.yaml │ │ │ │ ├── _global_mmlu_full_el_other.yaml │ │ │ │ ├── _global_mmlu_full_el_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_el_stem.yaml │ │ │ │ ├── global_mmlu_full_el_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_el_anatomy.yaml │ │ │ │ ├── global_mmlu_full_el_astronomy.yaml │ │ │ │ ├── global_mmlu_full_el_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_el_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_el_college_biology.yaml │ │ │ │ ├── global_mmlu_full_el_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_el_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_el_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_el_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_el_college_physics.yaml │ │ │ │ ├── global_mmlu_full_el_computer_security.yaml │ │ │ │ ├── global_mmlu_full_el_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_el_econometrics.yaml │ │ │ │ ├── global_mmlu_full_el_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_el_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_el_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_el_global_facts.yaml │ │ │ │ ├── global_mmlu_full_el_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_el_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_el_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_el_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_el_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_el_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_el_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_el_high_school_mathematics.yaml │ │ │ │ ├── 
global_mmlu_full_el_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_el_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_el_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_el_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_el_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_el_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_el_human_aging.yaml │ │ │ │ ├── global_mmlu_full_el_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_el_international_law.yaml │ │ │ │ ├── global_mmlu_full_el_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_el_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_el_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_el_management.yaml │ │ │ │ ├── global_mmlu_full_el_marketing.yaml │ │ │ │ ├── global_mmlu_full_el_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_el_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_el_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_el_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_el_nutrition.yaml │ │ │ │ ├── global_mmlu_full_el_philosophy.yaml │ │ │ │ ├── global_mmlu_full_el_prehistory.yaml │ │ │ │ ├── global_mmlu_full_el_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_el_professional_law.yaml │ │ │ │ ├── global_mmlu_full_el_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_el_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_el_public_relations.yaml │ │ │ │ ├── global_mmlu_full_el_security_studies.yaml │ │ │ │ ├── global_mmlu_full_el_sociology.yaml │ │ │ │ ├── global_mmlu_full_el_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_el_virology.yaml │ │ │ │ ├── global_mmlu_full_el_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── en/ │ │ │ │ ├── _en_template_yaml │ │ │ │ ├── _global_mmlu_full_en.yaml │ │ │ │ ├── _global_mmlu_full_en_humanities.yaml │ │ │ │ ├── _global_mmlu_full_en_other.yaml │ │ │ │ ├── _global_mmlu_full_en_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_en_stem.yaml │ │ │ │ ├── 
global_mmlu_full_en_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_en_anatomy.yaml │ │ │ │ ├── global_mmlu_full_en_astronomy.yaml │ │ │ │ ├── global_mmlu_full_en_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_en_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_en_college_biology.yaml │ │ │ │ ├── global_mmlu_full_en_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_en_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_en_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_en_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_en_college_physics.yaml │ │ │ │ ├── global_mmlu_full_en_computer_security.yaml │ │ │ │ ├── global_mmlu_full_en_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_en_econometrics.yaml │ │ │ │ ├── global_mmlu_full_en_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_en_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_en_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_en_global_facts.yaml │ │ │ │ ├── global_mmlu_full_en_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_en_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_en_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_en_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_en_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_en_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_en_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_en_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_en_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_en_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_en_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_en_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_en_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_en_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_en_human_aging.yaml │ │ │ │ ├── global_mmlu_full_en_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_en_international_law.yaml │ │ │ │ ├── 
global_mmlu_full_en_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_en_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_en_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_en_management.yaml │ │ │ │ ├── global_mmlu_full_en_marketing.yaml │ │ │ │ ├── global_mmlu_full_en_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_en_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_en_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_en_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_en_nutrition.yaml │ │ │ │ ├── global_mmlu_full_en_philosophy.yaml │ │ │ │ ├── global_mmlu_full_en_prehistory.yaml │ │ │ │ ├── global_mmlu_full_en_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_en_professional_law.yaml │ │ │ │ ├── global_mmlu_full_en_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_en_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_en_public_relations.yaml │ │ │ │ ├── global_mmlu_full_en_security_studies.yaml │ │ │ │ ├── global_mmlu_full_en_sociology.yaml │ │ │ │ ├── global_mmlu_full_en_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_en_virology.yaml │ │ │ │ ├── global_mmlu_full_en_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── es/ │ │ │ │ ├── _es_template_yaml │ │ │ │ ├── _global_mmlu_full_es.yaml │ │ │ │ ├── _global_mmlu_full_es_humanities.yaml │ │ │ │ ├── _global_mmlu_full_es_other.yaml │ │ │ │ ├── _global_mmlu_full_es_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_es_stem.yaml │ │ │ │ ├── global_mmlu_full_es_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_es_anatomy.yaml │ │ │ │ ├── global_mmlu_full_es_astronomy.yaml │ │ │ │ ├── global_mmlu_full_es_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_es_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_es_college_biology.yaml │ │ │ │ ├── global_mmlu_full_es_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_es_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_es_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_es_college_medicine.yaml │ │ │ │ ├── 
global_mmlu_full_es_college_physics.yaml │ │ │ │ ├── global_mmlu_full_es_computer_security.yaml │ │ │ │ ├── global_mmlu_full_es_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_es_econometrics.yaml │ │ │ │ ├── global_mmlu_full_es_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_es_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_es_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_es_global_facts.yaml │ │ │ │ ├── global_mmlu_full_es_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_es_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_es_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_es_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_es_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_es_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_es_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_es_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_es_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_es_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_es_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_es_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_es_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_es_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_es_human_aging.yaml │ │ │ │ ├── global_mmlu_full_es_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_es_international_law.yaml │ │ │ │ ├── global_mmlu_full_es_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_es_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_es_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_es_management.yaml │ │ │ │ ├── global_mmlu_full_es_marketing.yaml │ │ │ │ ├── global_mmlu_full_es_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_es_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_es_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_es_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_es_nutrition.yaml │ │ │ │ ├── 
global_mmlu_full_es_philosophy.yaml │ │ │ │ ├── global_mmlu_full_es_prehistory.yaml │ │ │ │ ├── global_mmlu_full_es_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_es_professional_law.yaml │ │ │ │ ├── global_mmlu_full_es_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_es_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_es_public_relations.yaml │ │ │ │ ├── global_mmlu_full_es_security_studies.yaml │ │ │ │ ├── global_mmlu_full_es_sociology.yaml │ │ │ │ ├── global_mmlu_full_es_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_es_virology.yaml │ │ │ │ ├── global_mmlu_full_es_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── fa/ │ │ │ │ ├── _fa_template_yaml │ │ │ │ ├── _global_mmlu_full_fa.yaml │ │ │ │ ├── _global_mmlu_full_fa_humanities.yaml │ │ │ │ ├── _global_mmlu_full_fa_other.yaml │ │ │ │ ├── _global_mmlu_full_fa_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_fa_stem.yaml │ │ │ │ ├── global_mmlu_full_fa_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_fa_anatomy.yaml │ │ │ │ ├── global_mmlu_full_fa_astronomy.yaml │ │ │ │ ├── global_mmlu_full_fa_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_fa_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_fa_college_biology.yaml │ │ │ │ ├── global_mmlu_full_fa_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_fa_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_fa_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_fa_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_fa_college_physics.yaml │ │ │ │ ├── global_mmlu_full_fa_computer_security.yaml │ │ │ │ ├── global_mmlu_full_fa_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_fa_econometrics.yaml │ │ │ │ ├── global_mmlu_full_fa_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_fa_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_fa_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_fa_global_facts.yaml │ │ │ │ ├── global_mmlu_full_fa_high_school_biology.yaml │ │ │ │ ├── 
global_mmlu_full_fa_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_fa_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_fa_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_fa_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_fa_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_fa_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_fa_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_fa_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_fa_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_fa_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_fa_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_fa_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_fa_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_fa_human_aging.yaml │ │ │ │ ├── global_mmlu_full_fa_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_fa_international_law.yaml │ │ │ │ ├── global_mmlu_full_fa_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_fa_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_fa_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_fa_management.yaml │ │ │ │ ├── global_mmlu_full_fa_marketing.yaml │ │ │ │ ├── global_mmlu_full_fa_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_fa_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_fa_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_fa_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_fa_nutrition.yaml │ │ │ │ ├── global_mmlu_full_fa_philosophy.yaml │ │ │ │ ├── global_mmlu_full_fa_prehistory.yaml │ │ │ │ ├── global_mmlu_full_fa_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_fa_professional_law.yaml │ │ │ │ ├── global_mmlu_full_fa_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_fa_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_fa_public_relations.yaml │ │ │ │ ├── global_mmlu_full_fa_security_studies.yaml │ │ │ │ ├── global_mmlu_full_fa_sociology.yaml │ │ │ │ ├── 
global_mmlu_full_fa_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_fa_virology.yaml │ │ │ │ ├── global_mmlu_full_fa_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── fil/ │ │ │ │ ├── _fil_template_yaml │ │ │ │ ├── _global_mmlu_full_fil.yaml │ │ │ │ ├── _global_mmlu_full_fil_humanities.yaml │ │ │ │ ├── _global_mmlu_full_fil_other.yaml │ │ │ │ ├── _global_mmlu_full_fil_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_fil_stem.yaml │ │ │ │ ├── global_mmlu_full_fil_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_fil_anatomy.yaml │ │ │ │ ├── global_mmlu_full_fil_astronomy.yaml │ │ │ │ ├── global_mmlu_full_fil_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_fil_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_fil_college_biology.yaml │ │ │ │ ├── global_mmlu_full_fil_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_fil_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_fil_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_fil_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_fil_college_physics.yaml │ │ │ │ ├── global_mmlu_full_fil_computer_security.yaml │ │ │ │ ├── global_mmlu_full_fil_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_fil_econometrics.yaml │ │ │ │ ├── global_mmlu_full_fil_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_fil_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_fil_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_fil_global_facts.yaml │ │ │ │ ├── global_mmlu_full_fil_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_fil_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_fil_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_fil_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_fil_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_fil_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_fil_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_fil_high_school_mathematics.yaml │ │ │ │ ├── 
global_mmlu_full_fil_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_fil_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_fil_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_fil_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_fil_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_fil_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_fil_human_aging.yaml │ │ │ │ ├── global_mmlu_full_fil_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_fil_international_law.yaml │ │ │ │ ├── global_mmlu_full_fil_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_fil_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_fil_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_fil_management.yaml │ │ │ │ ├── global_mmlu_full_fil_marketing.yaml │ │ │ │ ├── global_mmlu_full_fil_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_fil_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_fil_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_fil_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_fil_nutrition.yaml │ │ │ │ ├── global_mmlu_full_fil_philosophy.yaml │ │ │ │ ├── global_mmlu_full_fil_prehistory.yaml │ │ │ │ ├── global_mmlu_full_fil_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_fil_professional_law.yaml │ │ │ │ ├── global_mmlu_full_fil_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_fil_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_fil_public_relations.yaml │ │ │ │ ├── global_mmlu_full_fil_security_studies.yaml │ │ │ │ ├── global_mmlu_full_fil_sociology.yaml │ │ │ │ ├── global_mmlu_full_fil_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_fil_virology.yaml │ │ │ │ ├── global_mmlu_full_fil_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── fr/ │ │ │ │ ├── _fr_template_yaml │ │ │ │ ├── _global_mmlu_full_fr.yaml │ │ │ │ ├── _global_mmlu_full_fr_humanities.yaml │ │ │ │ ├── _global_mmlu_full_fr_other.yaml │ │ │ │ ├── _global_mmlu_full_fr_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_fr_stem.yaml │ │ │ │ ├── 
global_mmlu_full_fr_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_fr_anatomy.yaml │ │ │ │ ├── global_mmlu_full_fr_astronomy.yaml │ │ │ │ ├── global_mmlu_full_fr_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_fr_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_fr_college_biology.yaml │ │ │ │ ├── global_mmlu_full_fr_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_fr_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_fr_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_fr_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_fr_college_physics.yaml │ │ │ │ ├── global_mmlu_full_fr_computer_security.yaml │ │ │ │ ├── global_mmlu_full_fr_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_fr_econometrics.yaml │ │ │ │ ├── global_mmlu_full_fr_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_fr_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_fr_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_fr_global_facts.yaml │ │ │ │ ├── global_mmlu_full_fr_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_fr_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_fr_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_fr_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_fr_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_fr_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_fr_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_fr_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_fr_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_fr_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_fr_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_fr_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_fr_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_fr_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_fr_human_aging.yaml │ │ │ │ ├── global_mmlu_full_fr_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_fr_international_law.yaml │ │ │ │ ├── 
global_mmlu_full_fr_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_fr_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_fr_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_fr_management.yaml │ │ │ │ ├── global_mmlu_full_fr_marketing.yaml │ │ │ │ ├── global_mmlu_full_fr_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_fr_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_fr_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_fr_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_fr_nutrition.yaml │ │ │ │ ├── global_mmlu_full_fr_philosophy.yaml │ │ │ │ ├── global_mmlu_full_fr_prehistory.yaml │ │ │ │ ├── global_mmlu_full_fr_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_fr_professional_law.yaml │ │ │ │ ├── global_mmlu_full_fr_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_fr_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_fr_public_relations.yaml │ │ │ │ ├── global_mmlu_full_fr_security_studies.yaml │ │ │ │ ├── global_mmlu_full_fr_sociology.yaml │ │ │ │ ├── global_mmlu_full_fr_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_fr_virology.yaml │ │ │ │ ├── global_mmlu_full_fr_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── ha/ │ │ │ │ ├── _global_mmlu_full_ha.yaml │ │ │ │ ├── _global_mmlu_full_ha_humanities.yaml │ │ │ │ ├── _global_mmlu_full_ha_other.yaml │ │ │ │ ├── _global_mmlu_full_ha_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_ha_stem.yaml │ │ │ │ ├── _ha_template_yaml │ │ │ │ ├── global_mmlu_full_ha_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_ha_anatomy.yaml │ │ │ │ ├── global_mmlu_full_ha_astronomy.yaml │ │ │ │ ├── global_mmlu_full_ha_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_ha_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_ha_college_biology.yaml │ │ │ │ ├── global_mmlu_full_ha_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ha_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ha_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ha_college_medicine.yaml │ │ │ │ ├── 
global_mmlu_full_ha_college_physics.yaml │ │ │ │ ├── global_mmlu_full_ha_computer_security.yaml │ │ │ │ ├── global_mmlu_full_ha_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_ha_econometrics.yaml │ │ │ │ ├── global_mmlu_full_ha_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_ha_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ha_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_ha_global_facts.yaml │ │ │ │ ├── global_mmlu_full_ha_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_ha_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ha_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ha_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_ha_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_ha_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_ha_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_ha_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ha_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_ha_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_ha_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_ha_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_ha_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_ha_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_ha_human_aging.yaml │ │ │ │ ├── global_mmlu_full_ha_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_ha_international_law.yaml │ │ │ │ ├── global_mmlu_full_ha_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_ha_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_ha_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_ha_management.yaml │ │ │ │ ├── global_mmlu_full_ha_marketing.yaml │ │ │ │ ├── global_mmlu_full_ha_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_ha_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_ha_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_ha_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_ha_nutrition.yaml │ │ │ │ ├── 
global_mmlu_full_ha_philosophy.yaml │ │ │ │ ├── global_mmlu_full_ha_prehistory.yaml │ │ │ │ ├── global_mmlu_full_ha_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_ha_professional_law.yaml │ │ │ │ ├── global_mmlu_full_ha_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_ha_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_ha_public_relations.yaml │ │ │ │ ├── global_mmlu_full_ha_security_studies.yaml │ │ │ │ ├── global_mmlu_full_ha_sociology.yaml │ │ │ │ ├── global_mmlu_full_ha_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_ha_virology.yaml │ │ │ │ ├── global_mmlu_full_ha_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── he/ │ │ │ │ ├── _global_mmlu_full_he.yaml │ │ │ │ ├── _global_mmlu_full_he_humanities.yaml │ │ │ │ ├── _global_mmlu_full_he_other.yaml │ │ │ │ ├── _global_mmlu_full_he_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_he_stem.yaml │ │ │ │ ├── _he_template_yaml │ │ │ │ ├── global_mmlu_full_he_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_he_anatomy.yaml │ │ │ │ ├── global_mmlu_full_he_astronomy.yaml │ │ │ │ ├── global_mmlu_full_he_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_he_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_he_college_biology.yaml │ │ │ │ ├── global_mmlu_full_he_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_he_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_he_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_he_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_he_college_physics.yaml │ │ │ │ ├── global_mmlu_full_he_computer_security.yaml │ │ │ │ ├── global_mmlu_full_he_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_he_econometrics.yaml │ │ │ │ ├── global_mmlu_full_he_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_he_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_he_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_he_global_facts.yaml │ │ │ │ ├── global_mmlu_full_he_high_school_biology.yaml │ │ │ │ ├── 
global_mmlu_full_he_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_he_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_he_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_he_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_he_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_he_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_he_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_he_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_he_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_he_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_he_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_he_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_he_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_he_human_aging.yaml │ │ │ │ ├── global_mmlu_full_he_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_he_international_law.yaml │ │ │ │ ├── global_mmlu_full_he_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_he_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_he_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_he_management.yaml │ │ │ │ ├── global_mmlu_full_he_marketing.yaml │ │ │ │ ├── global_mmlu_full_he_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_he_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_he_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_he_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_he_nutrition.yaml │ │ │ │ ├── global_mmlu_full_he_philosophy.yaml │ │ │ │ ├── global_mmlu_full_he_prehistory.yaml │ │ │ │ ├── global_mmlu_full_he_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_he_professional_law.yaml │ │ │ │ ├── global_mmlu_full_he_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_he_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_he_public_relations.yaml │ │ │ │ ├── global_mmlu_full_he_security_studies.yaml │ │ │ │ ├── global_mmlu_full_he_sociology.yaml │ │ │ │ ├── 
global_mmlu_full_he_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_he_virology.yaml │ │ │ │ ├── global_mmlu_full_he_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── hi/ │ │ │ │ ├── _global_mmlu_full_hi.yaml │ │ │ │ ├── _global_mmlu_full_hi_humanities.yaml │ │ │ │ ├── _global_mmlu_full_hi_other.yaml │ │ │ │ ├── _global_mmlu_full_hi_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_hi_stem.yaml │ │ │ │ ├── _hi_template_yaml │ │ │ │ ├── global_mmlu_full_hi_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_hi_anatomy.yaml │ │ │ │ ├── global_mmlu_full_hi_astronomy.yaml │ │ │ │ ├── global_mmlu_full_hi_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_hi_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_hi_college_biology.yaml │ │ │ │ ├── global_mmlu_full_hi_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_hi_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_hi_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_hi_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_hi_college_physics.yaml │ │ │ │ ├── global_mmlu_full_hi_computer_security.yaml │ │ │ │ ├── global_mmlu_full_hi_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_hi_econometrics.yaml │ │ │ │ ├── global_mmlu_full_hi_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_hi_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_hi_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_hi_global_facts.yaml │ │ │ │ ├── global_mmlu_full_hi_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_hi_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_hi_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_hi_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_hi_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_hi_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_hi_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_hi_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_hi_high_school_microeconomics.yaml │ │ │ │ ├── 
global_mmlu_full_hi_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_hi_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_hi_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_hi_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_hi_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_hi_human_aging.yaml │ │ │ │ ├── global_mmlu_full_hi_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_hi_international_law.yaml │ │ │ │ ├── global_mmlu_full_hi_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_hi_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_hi_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_hi_management.yaml │ │ │ │ ├── global_mmlu_full_hi_marketing.yaml │ │ │ │ ├── global_mmlu_full_hi_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_hi_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_hi_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_hi_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_hi_nutrition.yaml │ │ │ │ ├── global_mmlu_full_hi_philosophy.yaml │ │ │ │ ├── global_mmlu_full_hi_prehistory.yaml │ │ │ │ ├── global_mmlu_full_hi_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_hi_professional_law.yaml │ │ │ │ ├── global_mmlu_full_hi_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_hi_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_hi_public_relations.yaml │ │ │ │ ├── global_mmlu_full_hi_security_studies.yaml │ │ │ │ ├── global_mmlu_full_hi_sociology.yaml │ │ │ │ ├── global_mmlu_full_hi_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_hi_virology.yaml │ │ │ │ ├── global_mmlu_full_hi_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── id/ │ │ │ │ ├── _global_mmlu_full_id.yaml │ │ │ │ ├── _global_mmlu_full_id_humanities.yaml │ │ │ │ ├── _global_mmlu_full_id_other.yaml │ │ │ │ ├── _global_mmlu_full_id_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_id_stem.yaml │ │ │ │ ├── _id_template_yaml │ │ │ │ ├── global_mmlu_full_id_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_id_anatomy.yaml │ │ │ │ 
├── global_mmlu_full_id_astronomy.yaml │ │ │ │ ├── global_mmlu_full_id_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_id_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_id_college_biology.yaml │ │ │ │ ├── global_mmlu_full_id_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_id_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_id_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_id_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_id_college_physics.yaml │ │ │ │ ├── global_mmlu_full_id_computer_security.yaml │ │ │ │ ├── global_mmlu_full_id_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_id_econometrics.yaml │ │ │ │ ├── global_mmlu_full_id_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_id_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_id_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_id_global_facts.yaml │ │ │ │ ├── global_mmlu_full_id_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_id_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_id_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_id_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_id_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_id_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_id_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_id_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_id_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_id_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_id_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_id_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_id_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_id_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_id_human_aging.yaml │ │ │ │ ├── global_mmlu_full_id_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_id_international_law.yaml │ │ │ │ ├── global_mmlu_full_id_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_id_logical_fallacies.yaml │ │ 
│ │ ├── global_mmlu_full_id_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_id_management.yaml │ │ │ │ ├── global_mmlu_full_id_marketing.yaml │ │ │ │ ├── global_mmlu_full_id_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_id_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_id_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_id_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_id_nutrition.yaml │ │ │ │ ├── global_mmlu_full_id_philosophy.yaml │ │ │ │ ├── global_mmlu_full_id_prehistory.yaml │ │ │ │ ├── global_mmlu_full_id_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_id_professional_law.yaml │ │ │ │ ├── global_mmlu_full_id_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_id_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_id_public_relations.yaml │ │ │ │ ├── global_mmlu_full_id_security_studies.yaml │ │ │ │ ├── global_mmlu_full_id_sociology.yaml │ │ │ │ ├── global_mmlu_full_id_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_id_virology.yaml │ │ │ │ ├── global_mmlu_full_id_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── ig/ │ │ │ │ ├── _global_mmlu_full_ig.yaml │ │ │ │ ├── _global_mmlu_full_ig_humanities.yaml │ │ │ │ ├── _global_mmlu_full_ig_other.yaml │ │ │ │ ├── _global_mmlu_full_ig_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_ig_stem.yaml │ │ │ │ ├── _ig_template_yaml │ │ │ │ ├── global_mmlu_full_ig_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_ig_anatomy.yaml │ │ │ │ ├── global_mmlu_full_ig_astronomy.yaml │ │ │ │ ├── global_mmlu_full_ig_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_ig_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_ig_college_biology.yaml │ │ │ │ ├── global_mmlu_full_ig_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ig_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ig_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ig_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_ig_college_physics.yaml │ │ │ │ ├── global_mmlu_full_ig_computer_security.yaml │ │ │ │ ├── 
global_mmlu_full_ig_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_ig_econometrics.yaml │ │ │ │ ├── global_mmlu_full_ig_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_ig_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ig_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_ig_global_facts.yaml │ │ │ │ ├── global_mmlu_full_ig_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_ig_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ig_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ig_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_ig_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_ig_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_ig_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_ig_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ig_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_ig_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_ig_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_ig_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_ig_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_ig_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_ig_human_aging.yaml │ │ │ │ ├── global_mmlu_full_ig_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_ig_international_law.yaml │ │ │ │ ├── global_mmlu_full_ig_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_ig_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_ig_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_ig_management.yaml │ │ │ │ ├── global_mmlu_full_ig_marketing.yaml │ │ │ │ ├── global_mmlu_full_ig_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_ig_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_ig_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_ig_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_ig_nutrition.yaml │ │ │ │ ├── global_mmlu_full_ig_philosophy.yaml │ │ │ │ ├── global_mmlu_full_ig_prehistory.yaml │ │ │ │ ├── 
global_mmlu_full_ig_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_ig_professional_law.yaml │ │ │ │ ├── global_mmlu_full_ig_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_ig_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_ig_public_relations.yaml │ │ │ │ ├── global_mmlu_full_ig_security_studies.yaml │ │ │ │ ├── global_mmlu_full_ig_sociology.yaml │ │ │ │ ├── global_mmlu_full_ig_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_ig_virology.yaml │ │ │ │ ├── global_mmlu_full_ig_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── it/ │ │ │ │ ├── _global_mmlu_full_it.yaml │ │ │ │ ├── _global_mmlu_full_it_humanities.yaml │ │ │ │ ├── _global_mmlu_full_it_other.yaml │ │ │ │ ├── _global_mmlu_full_it_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_it_stem.yaml │ │ │ │ ├── _it_template_yaml │ │ │ │ ├── global_mmlu_full_it_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_it_anatomy.yaml │ │ │ │ ├── global_mmlu_full_it_astronomy.yaml │ │ │ │ ├── global_mmlu_full_it_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_it_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_it_college_biology.yaml │ │ │ │ ├── global_mmlu_full_it_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_it_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_it_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_it_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_it_college_physics.yaml │ │ │ │ ├── global_mmlu_full_it_computer_security.yaml │ │ │ │ ├── global_mmlu_full_it_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_it_econometrics.yaml │ │ │ │ ├── global_mmlu_full_it_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_it_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_it_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_it_global_facts.yaml │ │ │ │ ├── global_mmlu_full_it_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_it_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_it_high_school_computer_science.yaml │ │ │ │ ├── 
global_mmlu_full_it_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_it_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_it_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_it_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_it_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_it_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_it_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_it_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_it_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_it_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_it_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_it_human_aging.yaml │ │ │ │ ├── global_mmlu_full_it_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_it_international_law.yaml │ │ │ │ ├── global_mmlu_full_it_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_it_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_it_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_it_management.yaml │ │ │ │ ├── global_mmlu_full_it_marketing.yaml │ │ │ │ ├── global_mmlu_full_it_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_it_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_it_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_it_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_it_nutrition.yaml │ │ │ │ ├── global_mmlu_full_it_philosophy.yaml │ │ │ │ ├── global_mmlu_full_it_prehistory.yaml │ │ │ │ ├── global_mmlu_full_it_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_it_professional_law.yaml │ │ │ │ ├── global_mmlu_full_it_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_it_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_it_public_relations.yaml │ │ │ │ ├── global_mmlu_full_it_security_studies.yaml │ │ │ │ ├── global_mmlu_full_it_sociology.yaml │ │ │ │ ├── global_mmlu_full_it_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_it_virology.yaml │ │ │ │ ├── global_mmlu_full_it_world_religions.yaml │ │ │ │ └── 
utils.py │ │ │ ├── ja/ │ │ │ │ ├── _global_mmlu_full_ja.yaml │ │ │ │ ├── _global_mmlu_full_ja_humanities.yaml │ │ │ │ ├── _global_mmlu_full_ja_other.yaml │ │ │ │ ├── _global_mmlu_full_ja_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_ja_stem.yaml │ │ │ │ ├── _ja_template_yaml │ │ │ │ ├── global_mmlu_full_ja_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_ja_anatomy.yaml │ │ │ │ ├── global_mmlu_full_ja_astronomy.yaml │ │ │ │ ├── global_mmlu_full_ja_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_ja_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_ja_college_biology.yaml │ │ │ │ ├── global_mmlu_full_ja_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ja_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ja_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ja_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_ja_college_physics.yaml │ │ │ │ ├── global_mmlu_full_ja_computer_security.yaml │ │ │ │ ├── global_mmlu_full_ja_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_ja_econometrics.yaml │ │ │ │ ├── global_mmlu_full_ja_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_ja_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ja_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_ja_global_facts.yaml │ │ │ │ ├── global_mmlu_full_ja_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_ja_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ja_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ja_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_ja_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_ja_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_ja_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_ja_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ja_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_ja_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_ja_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_ja_high_school_statistics.yaml │ 
│ │ │ ├── global_mmlu_full_ja_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_ja_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_ja_human_aging.yaml │ │ │ │ ├── global_mmlu_full_ja_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_ja_international_law.yaml │ │ │ │ ├── global_mmlu_full_ja_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_ja_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_ja_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_ja_management.yaml │ │ │ │ ├── global_mmlu_full_ja_marketing.yaml │ │ │ │ ├── global_mmlu_full_ja_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_ja_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_ja_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_ja_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_ja_nutrition.yaml │ │ │ │ ├── global_mmlu_full_ja_philosophy.yaml │ │ │ │ ├── global_mmlu_full_ja_prehistory.yaml │ │ │ │ ├── global_mmlu_full_ja_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_ja_professional_law.yaml │ │ │ │ ├── global_mmlu_full_ja_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_ja_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_ja_public_relations.yaml │ │ │ │ ├── global_mmlu_full_ja_security_studies.yaml │ │ │ │ ├── global_mmlu_full_ja_sociology.yaml │ │ │ │ ├── global_mmlu_full_ja_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_ja_virology.yaml │ │ │ │ ├── global_mmlu_full_ja_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── ko/ │ │ │ │ ├── _global_mmlu_full_ko.yaml │ │ │ │ ├── _global_mmlu_full_ko_humanities.yaml │ │ │ │ ├── _global_mmlu_full_ko_other.yaml │ │ │ │ ├── _global_mmlu_full_ko_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_ko_stem.yaml │ │ │ │ ├── _ko_template_yaml │ │ │ │ ├── global_mmlu_full_ko_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_ko_anatomy.yaml │ │ │ │ ├── global_mmlu_full_ko_astronomy.yaml │ │ │ │ ├── global_mmlu_full_ko_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_ko_clinical_knowledge.yaml │ │ │ │ ├── 
global_mmlu_full_ko_college_biology.yaml │ │ │ │ ├── global_mmlu_full_ko_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ko_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ko_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ko_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_ko_college_physics.yaml │ │ │ │ ├── global_mmlu_full_ko_computer_security.yaml │ │ │ │ ├── global_mmlu_full_ko_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_ko_econometrics.yaml │ │ │ │ ├── global_mmlu_full_ko_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_ko_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ko_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_ko_global_facts.yaml │ │ │ │ ├── global_mmlu_full_ko_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_ko_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ko_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ko_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_ko_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_ko_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_ko_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_ko_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ko_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_ko_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_ko_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_ko_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_ko_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_ko_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_ko_human_aging.yaml │ │ │ │ ├── global_mmlu_full_ko_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_ko_international_law.yaml │ │ │ │ ├── global_mmlu_full_ko_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_ko_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_ko_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_ko_management.yaml │ │ │ │ ├── global_mmlu_full_ko_marketing.yaml │ │ │ │ ├── 
global_mmlu_full_ko_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_ko_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_ko_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_ko_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_ko_nutrition.yaml │ │ │ │ ├── global_mmlu_full_ko_philosophy.yaml │ │ │ │ ├── global_mmlu_full_ko_prehistory.yaml │ │ │ │ ├── global_mmlu_full_ko_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_ko_professional_law.yaml │ │ │ │ ├── global_mmlu_full_ko_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_ko_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_ko_public_relations.yaml │ │ │ │ ├── global_mmlu_full_ko_security_studies.yaml │ │ │ │ ├── global_mmlu_full_ko_sociology.yaml │ │ │ │ ├── global_mmlu_full_ko_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_ko_virology.yaml │ │ │ │ ├── global_mmlu_full_ko_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── ky/ │ │ │ │ ├── _global_mmlu_full_ky.yaml │ │ │ │ ├── _global_mmlu_full_ky_humanities.yaml │ │ │ │ ├── _global_mmlu_full_ky_other.yaml │ │ │ │ ├── _global_mmlu_full_ky_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_ky_stem.yaml │ │ │ │ ├── _ky_template_yaml │ │ │ │ ├── global_mmlu_full_ky_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_ky_anatomy.yaml │ │ │ │ ├── global_mmlu_full_ky_astronomy.yaml │ │ │ │ ├── global_mmlu_full_ky_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_ky_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_ky_college_biology.yaml │ │ │ │ ├── global_mmlu_full_ky_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ky_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ky_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ky_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_ky_college_physics.yaml │ │ │ │ ├── global_mmlu_full_ky_computer_security.yaml │ │ │ │ ├── global_mmlu_full_ky_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_ky_econometrics.yaml │ │ │ │ ├── global_mmlu_full_ky_electrical_engineering.yaml │ │ │ │ ├── 
global_mmlu_full_ky_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ky_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_ky_global_facts.yaml │ │ │ │ ├── global_mmlu_full_ky_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_ky_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ky_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ky_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_ky_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_ky_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_ky_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_ky_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ky_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_ky_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_ky_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_ky_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_ky_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_ky_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_ky_human_aging.yaml │ │ │ │ ├── global_mmlu_full_ky_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_ky_international_law.yaml │ │ │ │ ├── global_mmlu_full_ky_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_ky_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_ky_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_ky_management.yaml │ │ │ │ ├── global_mmlu_full_ky_marketing.yaml │ │ │ │ ├── global_mmlu_full_ky_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_ky_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_ky_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_ky_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_ky_nutrition.yaml │ │ │ │ ├── global_mmlu_full_ky_philosophy.yaml │ │ │ │ ├── global_mmlu_full_ky_prehistory.yaml │ │ │ │ ├── global_mmlu_full_ky_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_ky_professional_law.yaml │ │ │ │ ├── global_mmlu_full_ky_professional_medicine.yaml │ │ │ │ ├── 
global_mmlu_full_ky_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_ky_public_relations.yaml │ │ │ │ ├── global_mmlu_full_ky_security_studies.yaml │ │ │ │ ├── global_mmlu_full_ky_sociology.yaml │ │ │ │ ├── global_mmlu_full_ky_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_ky_virology.yaml │ │ │ │ ├── global_mmlu_full_ky_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── lt/ │ │ │ │ ├── _global_mmlu_full_lt.yaml │ │ │ │ ├── _global_mmlu_full_lt_humanities.yaml │ │ │ │ ├── _global_mmlu_full_lt_other.yaml │ │ │ │ ├── _global_mmlu_full_lt_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_lt_stem.yaml │ │ │ │ ├── _lt_template_yaml │ │ │ │ ├── global_mmlu_full_lt_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_lt_anatomy.yaml │ │ │ │ ├── global_mmlu_full_lt_astronomy.yaml │ │ │ │ ├── global_mmlu_full_lt_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_lt_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_lt_college_biology.yaml │ │ │ │ ├── global_mmlu_full_lt_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_lt_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_lt_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_lt_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_lt_college_physics.yaml │ │ │ │ ├── global_mmlu_full_lt_computer_security.yaml │ │ │ │ ├── global_mmlu_full_lt_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_lt_econometrics.yaml │ │ │ │ ├── global_mmlu_full_lt_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_lt_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_lt_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_lt_global_facts.yaml │ │ │ │ ├── global_mmlu_full_lt_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_lt_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_lt_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_lt_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_lt_high_school_geography.yaml │ │ │ │ ├── 
global_mmlu_full_lt_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_lt_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_lt_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_lt_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_lt_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_lt_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_lt_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_lt_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_lt_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_lt_human_aging.yaml │ │ │ │ ├── global_mmlu_full_lt_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_lt_international_law.yaml │ │ │ │ ├── global_mmlu_full_lt_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_lt_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_lt_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_lt_management.yaml │ │ │ │ ├── global_mmlu_full_lt_marketing.yaml │ │ │ │ ├── global_mmlu_full_lt_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_lt_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_lt_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_lt_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_lt_nutrition.yaml │ │ │ │ ├── global_mmlu_full_lt_philosophy.yaml │ │ │ │ ├── global_mmlu_full_lt_prehistory.yaml │ │ │ │ ├── global_mmlu_full_lt_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_lt_professional_law.yaml │ │ │ │ ├── global_mmlu_full_lt_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_lt_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_lt_public_relations.yaml │ │ │ │ ├── global_mmlu_full_lt_security_studies.yaml │ │ │ │ ├── global_mmlu_full_lt_sociology.yaml │ │ │ │ ├── global_mmlu_full_lt_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_lt_virology.yaml │ │ │ │ ├── global_mmlu_full_lt_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── mg/ │ │ │ │ ├── _global_mmlu_full_mg.yaml │ │ │ │ ├── _global_mmlu_full_mg_humanities.yaml │ │ │ │ ├── 
_global_mmlu_full_mg_other.yaml │ │ │ │ ├── _global_mmlu_full_mg_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_mg_stem.yaml │ │ │ │ ├── _mg_template_yaml │ │ │ │ ├── global_mmlu_full_mg_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_mg_anatomy.yaml │ │ │ │ ├── global_mmlu_full_mg_astronomy.yaml │ │ │ │ ├── global_mmlu_full_mg_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_mg_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_mg_college_biology.yaml │ │ │ │ ├── global_mmlu_full_mg_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_mg_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_mg_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_mg_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_mg_college_physics.yaml │ │ │ │ ├── global_mmlu_full_mg_computer_security.yaml │ │ │ │ ├── global_mmlu_full_mg_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_mg_econometrics.yaml │ │ │ │ ├── global_mmlu_full_mg_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_mg_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_mg_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_mg_global_facts.yaml │ │ │ │ ├── global_mmlu_full_mg_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_mg_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_mg_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_mg_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_mg_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_mg_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_mg_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_mg_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_mg_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_mg_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_mg_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_mg_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_mg_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_mg_high_school_world_history.yaml │ 
│ │ │ ├── global_mmlu_full_mg_human_aging.yaml │ │ │ │ ├── global_mmlu_full_mg_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_mg_international_law.yaml │ │ │ │ ├── global_mmlu_full_mg_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_mg_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_mg_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_mg_management.yaml │ │ │ │ ├── global_mmlu_full_mg_marketing.yaml │ │ │ │ ├── global_mmlu_full_mg_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_mg_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_mg_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_mg_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_mg_nutrition.yaml │ │ │ │ ├── global_mmlu_full_mg_philosophy.yaml │ │ │ │ ├── global_mmlu_full_mg_prehistory.yaml │ │ │ │ ├── global_mmlu_full_mg_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_mg_professional_law.yaml │ │ │ │ ├── global_mmlu_full_mg_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_mg_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_mg_public_relations.yaml │ │ │ │ ├── global_mmlu_full_mg_security_studies.yaml │ │ │ │ ├── global_mmlu_full_mg_sociology.yaml │ │ │ │ ├── global_mmlu_full_mg_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_mg_virology.yaml │ │ │ │ ├── global_mmlu_full_mg_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── ms/ │ │ │ │ ├── _global_mmlu_full_ms.yaml │ │ │ │ ├── _global_mmlu_full_ms_humanities.yaml │ │ │ │ ├── _global_mmlu_full_ms_other.yaml │ │ │ │ ├── _global_mmlu_full_ms_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_ms_stem.yaml │ │ │ │ ├── _ms_template_yaml │ │ │ │ ├── global_mmlu_full_ms_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_ms_anatomy.yaml │ │ │ │ ├── global_mmlu_full_ms_astronomy.yaml │ │ │ │ ├── global_mmlu_full_ms_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_ms_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_ms_college_biology.yaml │ │ │ │ ├── global_mmlu_full_ms_college_chemistry.yaml │ │ │ │ ├── 
global_mmlu_full_ms_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ms_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ms_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_ms_college_physics.yaml │ │ │ │ ├── global_mmlu_full_ms_computer_security.yaml │ │ │ │ ├── global_mmlu_full_ms_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_ms_econometrics.yaml │ │ │ │ ├── global_mmlu_full_ms_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_ms_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ms_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_ms_global_facts.yaml │ │ │ │ ├── global_mmlu_full_ms_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_ms_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ms_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ms_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_ms_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_ms_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_ms_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_ms_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ms_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_ms_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_ms_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_ms_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_ms_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_ms_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_ms_human_aging.yaml │ │ │ │ ├── global_mmlu_full_ms_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_ms_international_law.yaml │ │ │ │ ├── global_mmlu_full_ms_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_ms_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_ms_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_ms_management.yaml │ │ │ │ ├── global_mmlu_full_ms_marketing.yaml │ │ │ │ ├── global_mmlu_full_ms_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_ms_miscellaneous.yaml │ │ │ │ ├── 
global_mmlu_full_ms_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_ms_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_ms_nutrition.yaml │ │ │ │ ├── global_mmlu_full_ms_philosophy.yaml │ │ │ │ ├── global_mmlu_full_ms_prehistory.yaml │ │ │ │ ├── global_mmlu_full_ms_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_ms_professional_law.yaml │ │ │ │ ├── global_mmlu_full_ms_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_ms_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_ms_public_relations.yaml │ │ │ │ ├── global_mmlu_full_ms_security_studies.yaml │ │ │ │ ├── global_mmlu_full_ms_sociology.yaml │ │ │ │ ├── global_mmlu_full_ms_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_ms_virology.yaml │ │ │ │ ├── global_mmlu_full_ms_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── ne/ │ │ │ │ ├── _global_mmlu_full_ne.yaml │ │ │ │ ├── _global_mmlu_full_ne_humanities.yaml │ │ │ │ ├── _global_mmlu_full_ne_other.yaml │ │ │ │ ├── _global_mmlu_full_ne_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_ne_stem.yaml │ │ │ │ ├── _ne_template_yaml │ │ │ │ ├── global_mmlu_full_ne_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_ne_anatomy.yaml │ │ │ │ ├── global_mmlu_full_ne_astronomy.yaml │ │ │ │ ├── global_mmlu_full_ne_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_ne_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_ne_college_biology.yaml │ │ │ │ ├── global_mmlu_full_ne_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ne_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ne_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ne_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_ne_college_physics.yaml │ │ │ │ ├── global_mmlu_full_ne_computer_security.yaml │ │ │ │ ├── global_mmlu_full_ne_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_ne_econometrics.yaml │ │ │ │ ├── global_mmlu_full_ne_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_ne_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ne_formal_logic.yaml │ │ │ 
│ ├── global_mmlu_full_ne_global_facts.yaml │ │ │ │ ├── global_mmlu_full_ne_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_ne_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ne_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ne_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_ne_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_ne_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_ne_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_ne_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ne_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_ne_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_ne_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_ne_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_ne_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_ne_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_ne_human_aging.yaml │ │ │ │ ├── global_mmlu_full_ne_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_ne_international_law.yaml │ │ │ │ ├── global_mmlu_full_ne_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_ne_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_ne_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_ne_management.yaml │ │ │ │ ├── global_mmlu_full_ne_marketing.yaml │ │ │ │ ├── global_mmlu_full_ne_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_ne_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_ne_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_ne_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_ne_nutrition.yaml │ │ │ │ ├── global_mmlu_full_ne_philosophy.yaml │ │ │ │ ├── global_mmlu_full_ne_prehistory.yaml │ │ │ │ ├── global_mmlu_full_ne_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_ne_professional_law.yaml │ │ │ │ ├── global_mmlu_full_ne_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_ne_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_ne_public_relations.yaml │ │ │ │ ├── 
global_mmlu_full_ne_security_studies.yaml │ │ │ │ ├── global_mmlu_full_ne_sociology.yaml │ │ │ │ ├── global_mmlu_full_ne_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_ne_virology.yaml │ │ │ │ ├── global_mmlu_full_ne_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── nl/ │ │ │ │ ├── _global_mmlu_full_nl.yaml │ │ │ │ ├── _global_mmlu_full_nl_humanities.yaml │ │ │ │ ├── _global_mmlu_full_nl_other.yaml │ │ │ │ ├── _global_mmlu_full_nl_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_nl_stem.yaml │ │ │ │ ├── _nl_template_yaml │ │ │ │ ├── global_mmlu_full_nl_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_nl_anatomy.yaml │ │ │ │ ├── global_mmlu_full_nl_astronomy.yaml │ │ │ │ ├── global_mmlu_full_nl_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_nl_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_nl_college_biology.yaml │ │ │ │ ├── global_mmlu_full_nl_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_nl_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_nl_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_nl_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_nl_college_physics.yaml │ │ │ │ ├── global_mmlu_full_nl_computer_security.yaml │ │ │ │ ├── global_mmlu_full_nl_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_nl_econometrics.yaml │ │ │ │ ├── global_mmlu_full_nl_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_nl_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_nl_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_nl_global_facts.yaml │ │ │ │ ├── global_mmlu_full_nl_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_nl_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_nl_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_nl_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_nl_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_nl_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_nl_high_school_macroeconomics.yaml │ │ │ │ ├── 
global_mmlu_full_nl_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_nl_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_nl_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_nl_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_nl_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_nl_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_nl_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_nl_human_aging.yaml │ │ │ │ ├── global_mmlu_full_nl_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_nl_international_law.yaml │ │ │ │ ├── global_mmlu_full_nl_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_nl_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_nl_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_nl_management.yaml │ │ │ │ ├── global_mmlu_full_nl_marketing.yaml │ │ │ │ ├── global_mmlu_full_nl_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_nl_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_nl_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_nl_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_nl_nutrition.yaml │ │ │ │ ├── global_mmlu_full_nl_philosophy.yaml │ │ │ │ ├── global_mmlu_full_nl_prehistory.yaml │ │ │ │ ├── global_mmlu_full_nl_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_nl_professional_law.yaml │ │ │ │ ├── global_mmlu_full_nl_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_nl_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_nl_public_relations.yaml │ │ │ │ ├── global_mmlu_full_nl_security_studies.yaml │ │ │ │ ├── global_mmlu_full_nl_sociology.yaml │ │ │ │ ├── global_mmlu_full_nl_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_nl_virology.yaml │ │ │ │ ├── global_mmlu_full_nl_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── ny/ │ │ │ │ ├── _global_mmlu_full_ny.yaml │ │ │ │ ├── _global_mmlu_full_ny_humanities.yaml │ │ │ │ ├── _global_mmlu_full_ny_other.yaml │ │ │ │ ├── _global_mmlu_full_ny_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_ny_stem.yaml │ │ │ │ ├── 
_ny_template_yaml │ │ │ │ ├── global_mmlu_full_ny_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_ny_anatomy.yaml │ │ │ │ ├── global_mmlu_full_ny_astronomy.yaml │ │ │ │ ├── global_mmlu_full_ny_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_ny_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_ny_college_biology.yaml │ │ │ │ ├── global_mmlu_full_ny_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ny_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ny_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ny_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_ny_college_physics.yaml │ │ │ │ ├── global_mmlu_full_ny_computer_security.yaml │ │ │ │ ├── global_mmlu_full_ny_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_ny_econometrics.yaml │ │ │ │ ├── global_mmlu_full_ny_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_ny_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ny_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_ny_global_facts.yaml │ │ │ │ ├── global_mmlu_full_ny_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_ny_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ny_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ny_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_ny_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_ny_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_ny_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_ny_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ny_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_ny_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_ny_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_ny_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_ny_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_ny_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_ny_human_aging.yaml │ │ │ │ ├── global_mmlu_full_ny_human_sexuality.yaml │ │ │ │ ├── 
global_mmlu_full_ny_international_law.yaml │ │ │ │ ├── global_mmlu_full_ny_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_ny_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_ny_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_ny_management.yaml │ │ │ │ ├── global_mmlu_full_ny_marketing.yaml │ │ │ │ ├── global_mmlu_full_ny_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_ny_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_ny_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_ny_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_ny_nutrition.yaml │ │ │ │ ├── global_mmlu_full_ny_philosophy.yaml │ │ │ │ ├── global_mmlu_full_ny_prehistory.yaml │ │ │ │ ├── global_mmlu_full_ny_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_ny_professional_law.yaml │ │ │ │ ├── global_mmlu_full_ny_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_ny_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_ny_public_relations.yaml │ │ │ │ ├── global_mmlu_full_ny_security_studies.yaml │ │ │ │ ├── global_mmlu_full_ny_sociology.yaml │ │ │ │ ├── global_mmlu_full_ny_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_ny_virology.yaml │ │ │ │ ├── global_mmlu_full_ny_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── pl/ │ │ │ │ ├── _global_mmlu_full_pl.yaml │ │ │ │ ├── _global_mmlu_full_pl_humanities.yaml │ │ │ │ ├── _global_mmlu_full_pl_other.yaml │ │ │ │ ├── _global_mmlu_full_pl_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_pl_stem.yaml │ │ │ │ ├── _pl_template_yaml │ │ │ │ ├── global_mmlu_full_pl_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_pl_anatomy.yaml │ │ │ │ ├── global_mmlu_full_pl_astronomy.yaml │ │ │ │ ├── global_mmlu_full_pl_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_pl_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_pl_college_biology.yaml │ │ │ │ ├── global_mmlu_full_pl_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_pl_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_pl_college_mathematics.yaml │ │ │ │ ├── 
global_mmlu_full_pl_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_pl_college_physics.yaml │ │ │ │ ├── global_mmlu_full_pl_computer_security.yaml │ │ │ │ ├── global_mmlu_full_pl_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_pl_econometrics.yaml │ │ │ │ ├── global_mmlu_full_pl_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_pl_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_pl_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_pl_global_facts.yaml │ │ │ │ ├── global_mmlu_full_pl_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_pl_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_pl_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_pl_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_pl_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_pl_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_pl_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_pl_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_pl_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_pl_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_pl_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_pl_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_pl_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_pl_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_pl_human_aging.yaml │ │ │ │ ├── global_mmlu_full_pl_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_pl_international_law.yaml │ │ │ │ ├── global_mmlu_full_pl_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_pl_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_pl_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_pl_management.yaml │ │ │ │ ├── global_mmlu_full_pl_marketing.yaml │ │ │ │ ├── global_mmlu_full_pl_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_pl_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_pl_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_pl_moral_scenarios.yaml │ │ │ │ ├── 
global_mmlu_full_pl_nutrition.yaml │ │ │ │ ├── global_mmlu_full_pl_philosophy.yaml │ │ │ │ ├── global_mmlu_full_pl_prehistory.yaml │ │ │ │ ├── global_mmlu_full_pl_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_pl_professional_law.yaml │ │ │ │ ├── global_mmlu_full_pl_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_pl_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_pl_public_relations.yaml │ │ │ │ ├── global_mmlu_full_pl_security_studies.yaml │ │ │ │ ├── global_mmlu_full_pl_sociology.yaml │ │ │ │ ├── global_mmlu_full_pl_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_pl_virology.yaml │ │ │ │ ├── global_mmlu_full_pl_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── pt/ │ │ │ │ ├── _global_mmlu_full_pt.yaml │ │ │ │ ├── _global_mmlu_full_pt_humanities.yaml │ │ │ │ ├── _global_mmlu_full_pt_other.yaml │ │ │ │ ├── _global_mmlu_full_pt_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_pt_stem.yaml │ │ │ │ ├── _pt_template_yaml │ │ │ │ ├── global_mmlu_full_pt_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_pt_anatomy.yaml │ │ │ │ ├── global_mmlu_full_pt_astronomy.yaml │ │ │ │ ├── global_mmlu_full_pt_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_pt_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_pt_college_biology.yaml │ │ │ │ ├── global_mmlu_full_pt_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_pt_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_pt_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_pt_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_pt_college_physics.yaml │ │ │ │ ├── global_mmlu_full_pt_computer_security.yaml │ │ │ │ ├── global_mmlu_full_pt_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_pt_econometrics.yaml │ │ │ │ ├── global_mmlu_full_pt_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_pt_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_pt_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_pt_global_facts.yaml │ │ │ │ ├── global_mmlu_full_pt_high_school_biology.yaml │ │ 
│ │ ├── global_mmlu_full_pt_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_pt_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_pt_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_pt_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_pt_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_pt_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_pt_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_pt_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_pt_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_pt_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_pt_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_pt_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_pt_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_pt_human_aging.yaml │ │ │ │ ├── global_mmlu_full_pt_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_pt_international_law.yaml │ │ │ │ ├── global_mmlu_full_pt_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_pt_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_pt_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_pt_management.yaml │ │ │ │ ├── global_mmlu_full_pt_marketing.yaml │ │ │ │ ├── global_mmlu_full_pt_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_pt_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_pt_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_pt_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_pt_nutrition.yaml │ │ │ │ ├── global_mmlu_full_pt_philosophy.yaml │ │ │ │ ├── global_mmlu_full_pt_prehistory.yaml │ │ │ │ ├── global_mmlu_full_pt_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_pt_professional_law.yaml │ │ │ │ ├── global_mmlu_full_pt_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_pt_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_pt_public_relations.yaml │ │ │ │ ├── global_mmlu_full_pt_security_studies.yaml │ │ │ │ ├── global_mmlu_full_pt_sociology.yaml │ │ │ │ ├── 
global_mmlu_full_pt_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_pt_virology.yaml │ │ │ │ ├── global_mmlu_full_pt_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── ro/ │ │ │ │ ├── _global_mmlu_full_ro.yaml │ │ │ │ ├── _global_mmlu_full_ro_humanities.yaml │ │ │ │ ├── _global_mmlu_full_ro_other.yaml │ │ │ │ ├── _global_mmlu_full_ro_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_ro_stem.yaml │ │ │ │ ├── _ro_template_yaml │ │ │ │ ├── global_mmlu_full_ro_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_ro_anatomy.yaml │ │ │ │ ├── global_mmlu_full_ro_astronomy.yaml │ │ │ │ ├── global_mmlu_full_ro_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_ro_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_ro_college_biology.yaml │ │ │ │ ├── global_mmlu_full_ro_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ro_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ro_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ro_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_ro_college_physics.yaml │ │ │ │ ├── global_mmlu_full_ro_computer_security.yaml │ │ │ │ ├── global_mmlu_full_ro_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_ro_econometrics.yaml │ │ │ │ ├── global_mmlu_full_ro_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_ro_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ro_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_ro_global_facts.yaml │ │ │ │ ├── global_mmlu_full_ro_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_ro_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ro_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ro_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_ro_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_ro_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_ro_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_ro_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ro_high_school_microeconomics.yaml │ │ │ │ ├── 
global_mmlu_full_ro_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_ro_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_ro_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_ro_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_ro_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_ro_human_aging.yaml │ │ │ │ ├── global_mmlu_full_ro_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_ro_international_law.yaml │ │ │ │ ├── global_mmlu_full_ro_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_ro_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_ro_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_ro_management.yaml │ │ │ │ ├── global_mmlu_full_ro_marketing.yaml │ │ │ │ ├── global_mmlu_full_ro_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_ro_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_ro_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_ro_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_ro_nutrition.yaml │ │ │ │ ├── global_mmlu_full_ro_philosophy.yaml │ │ │ │ ├── global_mmlu_full_ro_prehistory.yaml │ │ │ │ ├── global_mmlu_full_ro_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_ro_professional_law.yaml │ │ │ │ ├── global_mmlu_full_ro_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_ro_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_ro_public_relations.yaml │ │ │ │ ├── global_mmlu_full_ro_security_studies.yaml │ │ │ │ ├── global_mmlu_full_ro_sociology.yaml │ │ │ │ ├── global_mmlu_full_ro_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_ro_virology.yaml │ │ │ │ ├── global_mmlu_full_ro_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── ru/ │ │ │ │ ├── _global_mmlu_full_ru.yaml │ │ │ │ ├── _global_mmlu_full_ru_humanities.yaml │ │ │ │ ├── _global_mmlu_full_ru_other.yaml │ │ │ │ ├── _global_mmlu_full_ru_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_ru_stem.yaml │ │ │ │ ├── _ru_template_yaml │ │ │ │ ├── global_mmlu_full_ru_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_ru_anatomy.yaml │ │ │ │ 
├── global_mmlu_full_ru_astronomy.yaml │ │ │ │ ├── global_mmlu_full_ru_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_ru_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_ru_college_biology.yaml │ │ │ │ ├── global_mmlu_full_ru_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ru_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ru_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ru_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_ru_college_physics.yaml │ │ │ │ ├── global_mmlu_full_ru_computer_security.yaml │ │ │ │ ├── global_mmlu_full_ru_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_ru_econometrics.yaml │ │ │ │ ├── global_mmlu_full_ru_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_ru_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ru_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_ru_global_facts.yaml │ │ │ │ ├── global_mmlu_full_ru_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_ru_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_ru_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_ru_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_ru_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_ru_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_ru_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_ru_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_ru_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_ru_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_ru_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_ru_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_ru_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_ru_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_ru_human_aging.yaml │ │ │ │ ├── global_mmlu_full_ru_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_ru_international_law.yaml │ │ │ │ ├── global_mmlu_full_ru_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_ru_logical_fallacies.yaml │ │ 
│ │ ├── global_mmlu_full_ru_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_ru_management.yaml │ │ │ │ ├── global_mmlu_full_ru_marketing.yaml │ │ │ │ ├── global_mmlu_full_ru_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_ru_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_ru_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_ru_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_ru_nutrition.yaml │ │ │ │ ├── global_mmlu_full_ru_philosophy.yaml │ │ │ │ ├── global_mmlu_full_ru_prehistory.yaml │ │ │ │ ├── global_mmlu_full_ru_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_ru_professional_law.yaml │ │ │ │ ├── global_mmlu_full_ru_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_ru_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_ru_public_relations.yaml │ │ │ │ ├── global_mmlu_full_ru_security_studies.yaml │ │ │ │ ├── global_mmlu_full_ru_sociology.yaml │ │ │ │ ├── global_mmlu_full_ru_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_ru_virology.yaml │ │ │ │ ├── global_mmlu_full_ru_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── si/ │ │ │ │ ├── _global_mmlu_full_si.yaml │ │ │ │ ├── _global_mmlu_full_si_humanities.yaml │ │ │ │ ├── _global_mmlu_full_si_other.yaml │ │ │ │ ├── _global_mmlu_full_si_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_si_stem.yaml │ │ │ │ ├── _si_template_yaml │ │ │ │ ├── global_mmlu_full_si_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_si_anatomy.yaml │ │ │ │ ├── global_mmlu_full_si_astronomy.yaml │ │ │ │ ├── global_mmlu_full_si_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_si_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_si_college_biology.yaml │ │ │ │ ├── global_mmlu_full_si_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_si_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_si_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_si_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_si_college_physics.yaml │ │ │ │ ├── global_mmlu_full_si_computer_security.yaml │ │ │ │ ├── 
global_mmlu_full_si_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_si_econometrics.yaml │ │ │ │ ├── global_mmlu_full_si_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_si_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_si_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_si_global_facts.yaml │ │ │ │ ├── global_mmlu_full_si_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_si_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_si_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_si_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_si_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_si_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_si_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_si_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_si_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_si_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_si_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_si_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_si_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_si_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_si_human_aging.yaml │ │ │ │ ├── global_mmlu_full_si_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_si_international_law.yaml │ │ │ │ ├── global_mmlu_full_si_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_si_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_si_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_si_management.yaml │ │ │ │ ├── global_mmlu_full_si_marketing.yaml │ │ │ │ ├── global_mmlu_full_si_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_si_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_si_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_si_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_si_nutrition.yaml │ │ │ │ ├── global_mmlu_full_si_philosophy.yaml │ │ │ │ ├── global_mmlu_full_si_prehistory.yaml │ │ │ │ ├── 
global_mmlu_full_si_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_si_professional_law.yaml │ │ │ │ ├── global_mmlu_full_si_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_si_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_si_public_relations.yaml │ │ │ │ ├── global_mmlu_full_si_security_studies.yaml │ │ │ │ ├── global_mmlu_full_si_sociology.yaml │ │ │ │ ├── global_mmlu_full_si_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_si_virology.yaml │ │ │ │ ├── global_mmlu_full_si_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── sn/ │ │ │ │ ├── _global_mmlu_full_sn.yaml │ │ │ │ ├── _global_mmlu_full_sn_humanities.yaml │ │ │ │ ├── _global_mmlu_full_sn_other.yaml │ │ │ │ ├── _global_mmlu_full_sn_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_sn_stem.yaml │ │ │ │ ├── _sn_template_yaml │ │ │ │ ├── global_mmlu_full_sn_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_sn_anatomy.yaml │ │ │ │ ├── global_mmlu_full_sn_astronomy.yaml │ │ │ │ ├── global_mmlu_full_sn_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_sn_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_sn_college_biology.yaml │ │ │ │ ├── global_mmlu_full_sn_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_sn_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_sn_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_sn_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_sn_college_physics.yaml │ │ │ │ ├── global_mmlu_full_sn_computer_security.yaml │ │ │ │ ├── global_mmlu_full_sn_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_sn_econometrics.yaml │ │ │ │ ├── global_mmlu_full_sn_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_sn_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_sn_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_sn_global_facts.yaml │ │ │ │ ├── global_mmlu_full_sn_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_sn_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_sn_high_school_computer_science.yaml │ │ │ │ ├── 
global_mmlu_full_sn_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_sn_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_sn_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_sn_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_sn_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_sn_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_sn_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_sn_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_sn_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_sn_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_sn_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_sn_human_aging.yaml │ │ │ │ ├── global_mmlu_full_sn_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_sn_international_law.yaml │ │ │ │ ├── global_mmlu_full_sn_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_sn_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_sn_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_sn_management.yaml │ │ │ │ ├── global_mmlu_full_sn_marketing.yaml │ │ │ │ ├── global_mmlu_full_sn_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_sn_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_sn_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_sn_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_sn_nutrition.yaml │ │ │ │ ├── global_mmlu_full_sn_philosophy.yaml │ │ │ │ ├── global_mmlu_full_sn_prehistory.yaml │ │ │ │ ├── global_mmlu_full_sn_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_sn_professional_law.yaml │ │ │ │ ├── global_mmlu_full_sn_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_sn_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_sn_public_relations.yaml │ │ │ │ ├── global_mmlu_full_sn_security_studies.yaml │ │ │ │ ├── global_mmlu_full_sn_sociology.yaml │ │ │ │ ├── global_mmlu_full_sn_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_sn_virology.yaml │ │ │ │ ├── global_mmlu_full_sn_world_religions.yaml │ │ │ │ └── 
utils.py │ │ │ ├── so/ │ │ │ │ ├── _global_mmlu_full_so.yaml │ │ │ │ ├── _global_mmlu_full_so_humanities.yaml │ │ │ │ ├── _global_mmlu_full_so_other.yaml │ │ │ │ ├── _global_mmlu_full_so_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_so_stem.yaml │ │ │ │ ├── _so_template_yaml │ │ │ │ ├── global_mmlu_full_so_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_so_anatomy.yaml │ │ │ │ ├── global_mmlu_full_so_astronomy.yaml │ │ │ │ ├── global_mmlu_full_so_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_so_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_so_college_biology.yaml │ │ │ │ ├── global_mmlu_full_so_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_so_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_so_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_so_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_so_college_physics.yaml │ │ │ │ ├── global_mmlu_full_so_computer_security.yaml │ │ │ │ ├── global_mmlu_full_so_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_so_econometrics.yaml │ │ │ │ ├── global_mmlu_full_so_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_so_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_so_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_so_global_facts.yaml │ │ │ │ ├── global_mmlu_full_so_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_so_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_so_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_so_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_so_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_so_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_so_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_so_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_so_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_so_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_so_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_so_high_school_statistics.yaml │ 
│ │ │ ├── global_mmlu_full_so_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_so_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_so_human_aging.yaml │ │ │ │ ├── global_mmlu_full_so_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_so_international_law.yaml │ │ │ │ ├── global_mmlu_full_so_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_so_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_so_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_so_management.yaml │ │ │ │ ├── global_mmlu_full_so_marketing.yaml │ │ │ │ ├── global_mmlu_full_so_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_so_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_so_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_so_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_so_nutrition.yaml │ │ │ │ ├── global_mmlu_full_so_philosophy.yaml │ │ │ │ ├── global_mmlu_full_so_prehistory.yaml │ │ │ │ ├── global_mmlu_full_so_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_so_professional_law.yaml │ │ │ │ ├── global_mmlu_full_so_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_so_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_so_public_relations.yaml │ │ │ │ ├── global_mmlu_full_so_security_studies.yaml │ │ │ │ ├── global_mmlu_full_so_sociology.yaml │ │ │ │ ├── global_mmlu_full_so_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_so_virology.yaml │ │ │ │ ├── global_mmlu_full_so_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── sr/ │ │ │ │ ├── _global_mmlu_full_sr.yaml │ │ │ │ ├── _global_mmlu_full_sr_humanities.yaml │ │ │ │ ├── _global_mmlu_full_sr_other.yaml │ │ │ │ ├── _global_mmlu_full_sr_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_sr_stem.yaml │ │ │ │ ├── _sr_template_yaml │ │ │ │ ├── global_mmlu_full_sr_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_sr_anatomy.yaml │ │ │ │ ├── global_mmlu_full_sr_astronomy.yaml │ │ │ │ ├── global_mmlu_full_sr_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_sr_clinical_knowledge.yaml │ │ │ │ ├── 
global_mmlu_full_sr_college_biology.yaml │ │ │ │ ├── global_mmlu_full_sr_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_sr_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_sr_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_sr_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_sr_college_physics.yaml │ │ │ │ ├── global_mmlu_full_sr_computer_security.yaml │ │ │ │ ├── global_mmlu_full_sr_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_sr_econometrics.yaml │ │ │ │ ├── global_mmlu_full_sr_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_sr_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_sr_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_sr_global_facts.yaml │ │ │ │ ├── global_mmlu_full_sr_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_sr_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_sr_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_sr_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_sr_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_sr_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_sr_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_sr_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_sr_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_sr_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_sr_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_sr_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_sr_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_sr_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_sr_human_aging.yaml │ │ │ │ ├── global_mmlu_full_sr_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_sr_international_law.yaml │ │ │ │ ├── global_mmlu_full_sr_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_sr_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_sr_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_sr_management.yaml │ │ │ │ ├── global_mmlu_full_sr_marketing.yaml │ │ │ │ ├── 
global_mmlu_full_sr_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_sr_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_sr_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_sr_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_sr_nutrition.yaml │ │ │ │ ├── global_mmlu_full_sr_philosophy.yaml │ │ │ │ ├── global_mmlu_full_sr_prehistory.yaml │ │ │ │ ├── global_mmlu_full_sr_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_sr_professional_law.yaml │ │ │ │ ├── global_mmlu_full_sr_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_sr_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_sr_public_relations.yaml │ │ │ │ ├── global_mmlu_full_sr_security_studies.yaml │ │ │ │ ├── global_mmlu_full_sr_sociology.yaml │ │ │ │ ├── global_mmlu_full_sr_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_sr_virology.yaml │ │ │ │ ├── global_mmlu_full_sr_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── sv/ │ │ │ │ ├── _global_mmlu_full_sv.yaml │ │ │ │ ├── _global_mmlu_full_sv_humanities.yaml │ │ │ │ ├── _global_mmlu_full_sv_other.yaml │ │ │ │ ├── _global_mmlu_full_sv_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_sv_stem.yaml │ │ │ │ ├── _sv_template_yaml │ │ │ │ ├── global_mmlu_full_sv_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_sv_anatomy.yaml │ │ │ │ ├── global_mmlu_full_sv_astronomy.yaml │ │ │ │ ├── global_mmlu_full_sv_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_sv_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_sv_college_biology.yaml │ │ │ │ ├── global_mmlu_full_sv_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_sv_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_sv_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_sv_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_sv_college_physics.yaml │ │ │ │ ├── global_mmlu_full_sv_computer_security.yaml │ │ │ │ ├── global_mmlu_full_sv_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_sv_econometrics.yaml │ │ │ │ ├── global_mmlu_full_sv_electrical_engineering.yaml │ │ │ │ ├── 
global_mmlu_full_sv_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_sv_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_sv_global_facts.yaml │ │ │ │ ├── global_mmlu_full_sv_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_sv_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_sv_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_sv_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_sv_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_sv_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_sv_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_sv_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_sv_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_sv_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_sv_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_sv_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_sv_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_sv_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_sv_human_aging.yaml │ │ │ │ ├── global_mmlu_full_sv_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_sv_international_law.yaml │ │ │ │ ├── global_mmlu_full_sv_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_sv_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_sv_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_sv_management.yaml │ │ │ │ ├── global_mmlu_full_sv_marketing.yaml │ │ │ │ ├── global_mmlu_full_sv_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_sv_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_sv_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_sv_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_sv_nutrition.yaml │ │ │ │ ├── global_mmlu_full_sv_philosophy.yaml │ │ │ │ ├── global_mmlu_full_sv_prehistory.yaml │ │ │ │ ├── global_mmlu_full_sv_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_sv_professional_law.yaml │ │ │ │ ├── global_mmlu_full_sv_professional_medicine.yaml │ │ │ │ ├── 
global_mmlu_full_sv_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_sv_public_relations.yaml │ │ │ │ ├── global_mmlu_full_sv_security_studies.yaml │ │ │ │ ├── global_mmlu_full_sv_sociology.yaml │ │ │ │ ├── global_mmlu_full_sv_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_sv_virology.yaml │ │ │ │ ├── global_mmlu_full_sv_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── sw/ │ │ │ │ ├── _global_mmlu_full_sw.yaml │ │ │ │ ├── _global_mmlu_full_sw_humanities.yaml │ │ │ │ ├── _global_mmlu_full_sw_other.yaml │ │ │ │ ├── _global_mmlu_full_sw_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_sw_stem.yaml │ │ │ │ ├── _sw_template_yaml │ │ │ │ ├── global_mmlu_full_sw_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_sw_anatomy.yaml │ │ │ │ ├── global_mmlu_full_sw_astronomy.yaml │ │ │ │ ├── global_mmlu_full_sw_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_sw_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_sw_college_biology.yaml │ │ │ │ ├── global_mmlu_full_sw_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_sw_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_sw_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_sw_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_sw_college_physics.yaml │ │ │ │ ├── global_mmlu_full_sw_computer_security.yaml │ │ │ │ ├── global_mmlu_full_sw_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_sw_econometrics.yaml │ │ │ │ ├── global_mmlu_full_sw_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_sw_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_sw_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_sw_global_facts.yaml │ │ │ │ ├── global_mmlu_full_sw_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_sw_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_sw_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_sw_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_sw_high_school_geography.yaml │ │ │ │ ├── 
global_mmlu_full_sw_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_sw_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_sw_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_sw_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_sw_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_sw_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_sw_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_sw_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_sw_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_sw_human_aging.yaml │ │ │ │ ├── global_mmlu_full_sw_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_sw_international_law.yaml │ │ │ │ ├── global_mmlu_full_sw_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_sw_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_sw_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_sw_management.yaml │ │ │ │ ├── global_mmlu_full_sw_marketing.yaml │ │ │ │ ├── global_mmlu_full_sw_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_sw_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_sw_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_sw_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_sw_nutrition.yaml │ │ │ │ ├── global_mmlu_full_sw_philosophy.yaml │ │ │ │ ├── global_mmlu_full_sw_prehistory.yaml │ │ │ │ ├── global_mmlu_full_sw_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_sw_professional_law.yaml │ │ │ │ ├── global_mmlu_full_sw_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_sw_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_sw_public_relations.yaml │ │ │ │ ├── global_mmlu_full_sw_security_studies.yaml │ │ │ │ ├── global_mmlu_full_sw_sociology.yaml │ │ │ │ ├── global_mmlu_full_sw_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_sw_virology.yaml │ │ │ │ ├── global_mmlu_full_sw_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── te/ │ │ │ │ ├── _global_mmlu_full_te.yaml │ │ │ │ ├── _global_mmlu_full_te_humanities.yaml │ │ │ │ ├── 
_global_mmlu_full_te_other.yaml │ │ │ │ ├── _global_mmlu_full_te_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_te_stem.yaml │ │ │ │ ├── _te_template_yaml │ │ │ │ ├── global_mmlu_full_te_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_te_anatomy.yaml │ │ │ │ ├── global_mmlu_full_te_astronomy.yaml │ │ │ │ ├── global_mmlu_full_te_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_te_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_te_college_biology.yaml │ │ │ │ ├── global_mmlu_full_te_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_te_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_te_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_te_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_te_college_physics.yaml │ │ │ │ ├── global_mmlu_full_te_computer_security.yaml │ │ │ │ ├── global_mmlu_full_te_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_te_econometrics.yaml │ │ │ │ ├── global_mmlu_full_te_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_te_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_te_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_te_global_facts.yaml │ │ │ │ ├── global_mmlu_full_te_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_te_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_te_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_te_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_te_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_te_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_te_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_te_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_te_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_te_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_te_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_te_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_te_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_te_high_school_world_history.yaml │ 
│ │ │ ├── global_mmlu_full_te_human_aging.yaml │ │ │ │ ├── global_mmlu_full_te_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_te_international_law.yaml │ │ │ │ ├── global_mmlu_full_te_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_te_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_te_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_te_management.yaml │ │ │ │ ├── global_mmlu_full_te_marketing.yaml │ │ │ │ ├── global_mmlu_full_te_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_te_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_te_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_te_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_te_nutrition.yaml │ │ │ │ ├── global_mmlu_full_te_philosophy.yaml │ │ │ │ ├── global_mmlu_full_te_prehistory.yaml │ │ │ │ ├── global_mmlu_full_te_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_te_professional_law.yaml │ │ │ │ ├── global_mmlu_full_te_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_te_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_te_public_relations.yaml │ │ │ │ ├── global_mmlu_full_te_security_studies.yaml │ │ │ │ ├── global_mmlu_full_te_sociology.yaml │ │ │ │ ├── global_mmlu_full_te_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_te_virology.yaml │ │ │ │ ├── global_mmlu_full_te_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── tr/ │ │ │ │ ├── _global_mmlu_full_tr.yaml │ │ │ │ ├── _global_mmlu_full_tr_humanities.yaml │ │ │ │ ├── _global_mmlu_full_tr_other.yaml │ │ │ │ ├── _global_mmlu_full_tr_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_tr_stem.yaml │ │ │ │ ├── _tr_template_yaml │ │ │ │ ├── global_mmlu_full_tr_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_tr_anatomy.yaml │ │ │ │ ├── global_mmlu_full_tr_astronomy.yaml │ │ │ │ ├── global_mmlu_full_tr_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_tr_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_tr_college_biology.yaml │ │ │ │ ├── global_mmlu_full_tr_college_chemistry.yaml │ │ │ │ ├── 
global_mmlu_full_tr_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_tr_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_tr_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_tr_college_physics.yaml │ │ │ │ ├── global_mmlu_full_tr_computer_security.yaml │ │ │ │ ├── global_mmlu_full_tr_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_tr_econometrics.yaml │ │ │ │ ├── global_mmlu_full_tr_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_tr_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_tr_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_tr_global_facts.yaml │ │ │ │ ├── global_mmlu_full_tr_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_tr_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_tr_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_tr_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_tr_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_tr_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_tr_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_tr_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_tr_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_tr_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_tr_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_tr_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_tr_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_tr_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_tr_human_aging.yaml │ │ │ │ ├── global_mmlu_full_tr_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_tr_international_law.yaml │ │ │ │ ├── global_mmlu_full_tr_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_tr_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_tr_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_tr_management.yaml │ │ │ │ ├── global_mmlu_full_tr_marketing.yaml │ │ │ │ ├── global_mmlu_full_tr_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_tr_miscellaneous.yaml │ │ │ │ ├── 
global_mmlu_full_tr_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_tr_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_tr_nutrition.yaml │ │ │ │ ├── global_mmlu_full_tr_philosophy.yaml │ │ │ │ ├── global_mmlu_full_tr_prehistory.yaml │ │ │ │ ├── global_mmlu_full_tr_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_tr_professional_law.yaml │ │ │ │ ├── global_mmlu_full_tr_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_tr_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_tr_public_relations.yaml │ │ │ │ ├── global_mmlu_full_tr_security_studies.yaml │ │ │ │ ├── global_mmlu_full_tr_sociology.yaml │ │ │ │ ├── global_mmlu_full_tr_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_tr_virology.yaml │ │ │ │ ├── global_mmlu_full_tr_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── uk/ │ │ │ │ ├── _global_mmlu_full_uk.yaml │ │ │ │ ├── _global_mmlu_full_uk_humanities.yaml │ │ │ │ ├── _global_mmlu_full_uk_other.yaml │ │ │ │ ├── _global_mmlu_full_uk_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_uk_stem.yaml │ │ │ │ ├── _uk_template_yaml │ │ │ │ ├── global_mmlu_full_uk_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_uk_anatomy.yaml │ │ │ │ ├── global_mmlu_full_uk_astronomy.yaml │ │ │ │ ├── global_mmlu_full_uk_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_uk_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_uk_college_biology.yaml │ │ │ │ ├── global_mmlu_full_uk_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_uk_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_uk_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_uk_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_uk_college_physics.yaml │ │ │ │ ├── global_mmlu_full_uk_computer_security.yaml │ │ │ │ ├── global_mmlu_full_uk_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_uk_econometrics.yaml │ │ │ │ ├── global_mmlu_full_uk_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_uk_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_uk_formal_logic.yaml │ │ │ 
│ ├── global_mmlu_full_uk_global_facts.yaml │ │ │ │ ├── global_mmlu_full_uk_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_uk_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_uk_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_uk_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_uk_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_uk_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_uk_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_uk_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_uk_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_uk_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_uk_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_uk_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_uk_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_uk_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_uk_human_aging.yaml │ │ │ │ ├── global_mmlu_full_uk_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_uk_international_law.yaml │ │ │ │ ├── global_mmlu_full_uk_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_uk_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_uk_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_uk_management.yaml │ │ │ │ ├── global_mmlu_full_uk_marketing.yaml │ │ │ │ ├── global_mmlu_full_uk_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_uk_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_uk_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_uk_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_uk_nutrition.yaml │ │ │ │ ├── global_mmlu_full_uk_philosophy.yaml │ │ │ │ ├── global_mmlu_full_uk_prehistory.yaml │ │ │ │ ├── global_mmlu_full_uk_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_uk_professional_law.yaml │ │ │ │ ├── global_mmlu_full_uk_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_uk_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_uk_public_relations.yaml │ │ │ │ ├── 
global_mmlu_full_uk_security_studies.yaml │ │ │ │ ├── global_mmlu_full_uk_sociology.yaml │ │ │ │ ├── global_mmlu_full_uk_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_uk_virology.yaml │ │ │ │ ├── global_mmlu_full_uk_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── vi/ │ │ │ │ ├── _global_mmlu_full_vi.yaml │ │ │ │ ├── _global_mmlu_full_vi_humanities.yaml │ │ │ │ ├── _global_mmlu_full_vi_other.yaml │ │ │ │ ├── _global_mmlu_full_vi_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_vi_stem.yaml │ │ │ │ ├── _vi_template_yaml │ │ │ │ ├── global_mmlu_full_vi_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_vi_anatomy.yaml │ │ │ │ ├── global_mmlu_full_vi_astronomy.yaml │ │ │ │ ├── global_mmlu_full_vi_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_vi_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_vi_college_biology.yaml │ │ │ │ ├── global_mmlu_full_vi_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_vi_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_vi_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_vi_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_vi_college_physics.yaml │ │ │ │ ├── global_mmlu_full_vi_computer_security.yaml │ │ │ │ ├── global_mmlu_full_vi_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_vi_econometrics.yaml │ │ │ │ ├── global_mmlu_full_vi_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_vi_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_vi_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_vi_global_facts.yaml │ │ │ │ ├── global_mmlu_full_vi_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_vi_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_vi_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_vi_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_vi_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_vi_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_vi_high_school_macroeconomics.yaml │ │ │ │ ├── 
global_mmlu_full_vi_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_vi_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_vi_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_vi_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_vi_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_vi_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_vi_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_vi_human_aging.yaml │ │ │ │ ├── global_mmlu_full_vi_human_sexuality.yaml │ │ │ │ ├── global_mmlu_full_vi_international_law.yaml │ │ │ │ ├── global_mmlu_full_vi_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_vi_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_vi_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_vi_management.yaml │ │ │ │ ├── global_mmlu_full_vi_marketing.yaml │ │ │ │ ├── global_mmlu_full_vi_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_vi_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_vi_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_vi_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_vi_nutrition.yaml │ │ │ │ ├── global_mmlu_full_vi_philosophy.yaml │ │ │ │ ├── global_mmlu_full_vi_prehistory.yaml │ │ │ │ ├── global_mmlu_full_vi_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_vi_professional_law.yaml │ │ │ │ ├── global_mmlu_full_vi_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_vi_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_vi_public_relations.yaml │ │ │ │ ├── global_mmlu_full_vi_security_studies.yaml │ │ │ │ ├── global_mmlu_full_vi_sociology.yaml │ │ │ │ ├── global_mmlu_full_vi_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_vi_virology.yaml │ │ │ │ ├── global_mmlu_full_vi_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── yo/ │ │ │ │ ├── _global_mmlu_full_yo.yaml │ │ │ │ ├── _global_mmlu_full_yo_humanities.yaml │ │ │ │ ├── _global_mmlu_full_yo_other.yaml │ │ │ │ ├── _global_mmlu_full_yo_social_sciences.yaml │ │ │ │ ├── _global_mmlu_full_yo_stem.yaml │ │ │ │ ├── 
_yo_template_yaml │ │ │ │ ├── global_mmlu_full_yo_abstract_algebra.yaml │ │ │ │ ├── global_mmlu_full_yo_anatomy.yaml │ │ │ │ ├── global_mmlu_full_yo_astronomy.yaml │ │ │ │ ├── global_mmlu_full_yo_business_ethics.yaml │ │ │ │ ├── global_mmlu_full_yo_clinical_knowledge.yaml │ │ │ │ ├── global_mmlu_full_yo_college_biology.yaml │ │ │ │ ├── global_mmlu_full_yo_college_chemistry.yaml │ │ │ │ ├── global_mmlu_full_yo_college_computer_science.yaml │ │ │ │ ├── global_mmlu_full_yo_college_mathematics.yaml │ │ │ │ ├── global_mmlu_full_yo_college_medicine.yaml │ │ │ │ ├── global_mmlu_full_yo_college_physics.yaml │ │ │ │ ├── global_mmlu_full_yo_computer_security.yaml │ │ │ │ ├── global_mmlu_full_yo_conceptual_physics.yaml │ │ │ │ ├── global_mmlu_full_yo_econometrics.yaml │ │ │ │ ├── global_mmlu_full_yo_electrical_engineering.yaml │ │ │ │ ├── global_mmlu_full_yo_elementary_mathematics.yaml │ │ │ │ ├── global_mmlu_full_yo_formal_logic.yaml │ │ │ │ ├── global_mmlu_full_yo_global_facts.yaml │ │ │ │ ├── global_mmlu_full_yo_high_school_biology.yaml │ │ │ │ ├── global_mmlu_full_yo_high_school_chemistry.yaml │ │ │ │ ├── global_mmlu_full_yo_high_school_computer_science.yaml │ │ │ │ ├── global_mmlu_full_yo_high_school_european_history.yaml │ │ │ │ ├── global_mmlu_full_yo_high_school_geography.yaml │ │ │ │ ├── global_mmlu_full_yo_high_school_government_and_politics.yaml │ │ │ │ ├── global_mmlu_full_yo_high_school_macroeconomics.yaml │ │ │ │ ├── global_mmlu_full_yo_high_school_mathematics.yaml │ │ │ │ ├── global_mmlu_full_yo_high_school_microeconomics.yaml │ │ │ │ ├── global_mmlu_full_yo_high_school_physics.yaml │ │ │ │ ├── global_mmlu_full_yo_high_school_psychology.yaml │ │ │ │ ├── global_mmlu_full_yo_high_school_statistics.yaml │ │ │ │ ├── global_mmlu_full_yo_high_school_us_history.yaml │ │ │ │ ├── global_mmlu_full_yo_high_school_world_history.yaml │ │ │ │ ├── global_mmlu_full_yo_human_aging.yaml │ │ │ │ ├── global_mmlu_full_yo_human_sexuality.yaml │ │ │ │ ├── 
global_mmlu_full_yo_international_law.yaml │ │ │ │ ├── global_mmlu_full_yo_jurisprudence.yaml │ │ │ │ ├── global_mmlu_full_yo_logical_fallacies.yaml │ │ │ │ ├── global_mmlu_full_yo_machine_learning.yaml │ │ │ │ ├── global_mmlu_full_yo_management.yaml │ │ │ │ ├── global_mmlu_full_yo_marketing.yaml │ │ │ │ ├── global_mmlu_full_yo_medical_genetics.yaml │ │ │ │ ├── global_mmlu_full_yo_miscellaneous.yaml │ │ │ │ ├── global_mmlu_full_yo_moral_disputes.yaml │ │ │ │ ├── global_mmlu_full_yo_moral_scenarios.yaml │ │ │ │ ├── global_mmlu_full_yo_nutrition.yaml │ │ │ │ ├── global_mmlu_full_yo_philosophy.yaml │ │ │ │ ├── global_mmlu_full_yo_prehistory.yaml │ │ │ │ ├── global_mmlu_full_yo_professional_accounting.yaml │ │ │ │ ├── global_mmlu_full_yo_professional_law.yaml │ │ │ │ ├── global_mmlu_full_yo_professional_medicine.yaml │ │ │ │ ├── global_mmlu_full_yo_professional_psychology.yaml │ │ │ │ ├── global_mmlu_full_yo_public_relations.yaml │ │ │ │ ├── global_mmlu_full_yo_security_studies.yaml │ │ │ │ ├── global_mmlu_full_yo_sociology.yaml │ │ │ │ ├── global_mmlu_full_yo_us_foreign_policy.yaml │ │ │ │ ├── global_mmlu_full_yo_virology.yaml │ │ │ │ ├── global_mmlu_full_yo_world_religions.yaml │ │ │ │ └── utils.py │ │ │ └── zh/ │ │ │ ├── _global_mmlu_full_zh.yaml │ │ │ ├── _global_mmlu_full_zh_humanities.yaml │ │ │ ├── _global_mmlu_full_zh_other.yaml │ │ │ ├── _global_mmlu_full_zh_social_sciences.yaml │ │ │ ├── _global_mmlu_full_zh_stem.yaml │ │ │ ├── _zh_template_yaml │ │ │ ├── global_mmlu_full_zh_abstract_algebra.yaml │ │ │ ├── global_mmlu_full_zh_anatomy.yaml │ │ │ ├── global_mmlu_full_zh_astronomy.yaml │ │ │ ├── global_mmlu_full_zh_business_ethics.yaml │ │ │ ├── global_mmlu_full_zh_clinical_knowledge.yaml │ │ │ ├── global_mmlu_full_zh_college_biology.yaml │ │ │ ├── global_mmlu_full_zh_college_chemistry.yaml │ │ │ ├── global_mmlu_full_zh_college_computer_science.yaml │ │ │ ├── global_mmlu_full_zh_college_mathematics.yaml │ │ │ ├── global_mmlu_full_zh_college_medicine.yaml │ │ │ 
├── global_mmlu_full_zh_college_physics.yaml │ │ │ ├── global_mmlu_full_zh_computer_security.yaml │ │ │ ├── global_mmlu_full_zh_conceptual_physics.yaml │ │ │ ├── global_mmlu_full_zh_econometrics.yaml │ │ │ ├── global_mmlu_full_zh_electrical_engineering.yaml │ │ │ ├── global_mmlu_full_zh_elementary_mathematics.yaml │ │ │ ├── global_mmlu_full_zh_formal_logic.yaml │ │ │ ├── global_mmlu_full_zh_global_facts.yaml │ │ │ ├── global_mmlu_full_zh_high_school_biology.yaml │ │ │ ├── global_mmlu_full_zh_high_school_chemistry.yaml │ │ │ ├── global_mmlu_full_zh_high_school_computer_science.yaml │ │ │ ├── global_mmlu_full_zh_high_school_european_history.yaml │ │ │ ├── global_mmlu_full_zh_high_school_geography.yaml │ │ │ ├── global_mmlu_full_zh_high_school_government_and_politics.yaml │ │ │ ├── global_mmlu_full_zh_high_school_macroeconomics.yaml │ │ │ ├── global_mmlu_full_zh_high_school_mathematics.yaml │ │ │ ├── global_mmlu_full_zh_high_school_microeconomics.yaml │ │ │ ├── global_mmlu_full_zh_high_school_physics.yaml │ │ │ ├── global_mmlu_full_zh_high_school_psychology.yaml │ │ │ ├── global_mmlu_full_zh_high_school_statistics.yaml │ │ │ ├── global_mmlu_full_zh_high_school_us_history.yaml │ │ │ ├── global_mmlu_full_zh_high_school_world_history.yaml │ │ │ ├── global_mmlu_full_zh_human_aging.yaml │ │ │ ├── global_mmlu_full_zh_human_sexuality.yaml │ │ │ ├── global_mmlu_full_zh_international_law.yaml │ │ │ ├── global_mmlu_full_zh_jurisprudence.yaml │ │ │ ├── global_mmlu_full_zh_logical_fallacies.yaml │ │ │ ├── global_mmlu_full_zh_machine_learning.yaml │ │ │ ├── global_mmlu_full_zh_management.yaml │ │ │ ├── global_mmlu_full_zh_marketing.yaml │ │ │ ├── global_mmlu_full_zh_medical_genetics.yaml │ │ │ ├── global_mmlu_full_zh_miscellaneous.yaml │ │ │ ├── global_mmlu_full_zh_moral_disputes.yaml │ │ │ ├── global_mmlu_full_zh_moral_scenarios.yaml │ │ │ ├── global_mmlu_full_zh_nutrition.yaml │ │ │ ├── global_mmlu_full_zh_philosophy.yaml │ │ │ ├── global_mmlu_full_zh_prehistory.yaml │ │ │ ├── 
global_mmlu_full_zh_professional_accounting.yaml │ │ │ ├── global_mmlu_full_zh_professional_law.yaml │ │ │ ├── global_mmlu_full_zh_professional_medicine.yaml │ │ │ ├── global_mmlu_full_zh_professional_psychology.yaml │ │ │ ├── global_mmlu_full_zh_public_relations.yaml │ │ │ ├── global_mmlu_full_zh_security_studies.yaml │ │ │ ├── global_mmlu_full_zh_sociology.yaml │ │ │ ├── global_mmlu_full_zh_us_foreign_policy.yaml │ │ │ ├── global_mmlu_full_zh_virology.yaml │ │ │ ├── global_mmlu_full_zh_world_religions.yaml │ │ │ └── utils.py │ │ ├── global_piqa/ │ │ │ ├── README.md │ │ │ ├── completions/ │ │ │ │ ├── _generate_config.py │ │ │ │ ├── _global_piqa.yaml │ │ │ │ ├── _template │ │ │ │ ├── acm_arab.yaml │ │ │ │ ├── acq_arab.yaml │ │ │ │ ├── aeb_arab.yaml │ │ │ │ ├── afb_arab.yaml │ │ │ │ ├── als_latn.yaml │ │ │ │ ├── amh_ethi.yaml │ │ │ │ ├── apc_arab_jord.yaml │ │ │ │ ├── apc_arab_leba.yaml │ │ │ │ ├── apc_arab_pale.yaml │ │ │ │ ├── apc_arab_syri.yaml │ │ │ │ ├── arb_arab.yaml │ │ │ │ ├── arq_arab.yaml │ │ │ │ ├── ars_arab.yaml │ │ │ │ ├── ary_arab.yaml │ │ │ │ ├── arz_arab.yaml │ │ │ │ ├── asm_beng.yaml │ │ │ │ ├── azj_latn.yaml │ │ │ │ ├── bam_latn.yaml │ │ │ │ ├── bel_cyrl.yaml │ │ │ │ ├── ben_beng.yaml │ │ │ │ ├── ben_latn.yaml │ │ │ │ ├── bho_deva.yaml │ │ │ │ ├── bos_latn.yaml │ │ │ │ ├── bsk_arab.yaml │ │ │ │ ├── bul_cyrl.yaml │ │ │ │ ├── cat_latn.yaml │ │ │ │ ├── ces_latn.yaml │ │ │ │ ├── ckb_arab.yaml │ │ │ │ ├── ckm_latn.yaml │ │ │ │ ├── cmn_hans.yaml │ │ │ │ ├── cmn_hant.yaml │ │ │ │ ├── deu_latn.yaml │ │ │ │ ├── dhd_deva.yaml │ │ │ │ ├── ekk_latn.yaml │ │ │ │ ├── ekp_latn.yaml │ │ │ │ ├── ell_grek.yaml │ │ │ │ ├── eng_latn.yaml │ │ │ │ ├── fao_latn.yaml │ │ │ │ ├── fin_latn.yaml │ │ │ │ ├── fra_latn_cana.yaml │ │ │ │ ├── fra_latn_fran.yaml │ │ │ │ ├── glg_latn.yaml │ │ │ │ ├── guj_gujr.yaml │ │ │ │ ├── hau_latn.yaml │ │ │ │ ├── haw_latn.yaml │ │ │ │ ├── heb_hebr.yaml │ │ │ │ ├── hin_deva.yaml │ │ │ │ ├── hrv_latn.yaml │ │ │ │ ├── hun_latn.yaml │ │ │ │ ├── 
hye_armn.yaml │ │ │ │ ├── ibo_latn.yaml │ │ │ │ ├── idu_latn.yaml │ │ │ │ ├── ind_latn.yaml │ │ │ │ ├── isl_latn.yaml │ │ │ │ ├── iso_latn.yaml │ │ │ │ ├── ita_latn.yaml │ │ │ │ ├── jav_latn.yaml │ │ │ │ ├── jpn_jpan.yaml │ │ │ │ ├── kan_knda.yaml │ │ │ │ ├── kat_geor.yaml │ │ │ │ ├── kaz_cyrl.yaml │ │ │ │ ├── kin_latn.yaml │ │ │ │ ├── kir_cyrl.yaml │ │ │ │ ├── kor_hang.yaml │ │ │ │ ├── lin_latn.yaml │ │ │ │ ├── lit_latn.yaml │ │ │ │ ├── luo_latn.yaml │ │ │ │ ├── mal_mlym.yaml │ │ │ │ ├── mar_deva.yaml │ │ │ │ ├── mkd_cyrl.yaml │ │ │ │ ├── mni_beng.yaml │ │ │ │ ├── mni_mtei.yaml │ │ │ │ ├── nag_latn.yaml │ │ │ │ ├── nld_latn.yaml │ │ │ │ ├── nno_latn.yaml │ │ │ │ ├── nob_latn.yaml │ │ │ │ ├── npi_deva.yaml │ │ │ │ ├── pan_guru.yaml │ │ │ │ ├── pcm_latn.yaml │ │ │ │ ├── pes_arab.yaml │ │ │ │ ├── pol_latn.yaml │ │ │ │ ├── por_latn_braz.yaml │ │ │ │ ├── por_latn_port.yaml │ │ │ │ ├── ron_latn.yaml │ │ │ │ ├── rus_cyrl.yaml │ │ │ │ ├── rwr_deva.yaml │ │ │ │ ├── sin_sinh.yaml │ │ │ │ ├── slk_latn.yaml │ │ │ │ ├── slk_latn_sari.yaml │ │ │ │ ├── slv_latn.yaml │ │ │ │ ├── slv_latn_cerk.yaml │ │ │ │ ├── snd_arab.yaml │ │ │ │ ├── snd_deva.yaml │ │ │ │ ├── spa_latn_mexi.yaml │ │ │ │ ├── spa_latn_peru.yaml │ │ │ │ ├── spa_latn_spai.yaml │ │ │ │ ├── srp_cyrl.yaml │ │ │ │ ├── srp_latn.yaml │ │ │ │ ├── swe_latn.yaml │ │ │ │ ├── swh_latn.yaml │ │ │ │ ├── tam_taml.yaml │ │ │ │ ├── tel_telu.yaml │ │ │ │ ├── tgl_latn.yaml │ │ │ │ ├── tha_thai.yaml │ │ │ │ ├── tur_latn.yaml │ │ │ │ ├── uig_arab.yaml │ │ │ │ ├── ukr_cyrl.yaml │ │ │ │ ├── urd_arab.yaml │ │ │ │ ├── urd_latn.yaml │ │ │ │ ├── urh_latn.yaml │ │ │ │ ├── uzn_latn.yaml │ │ │ │ ├── vie_latn.yaml │ │ │ │ ├── yor_latn.yaml │ │ │ │ ├── yue_hant.yaml │ │ │ │ ├── zsm_latn.yaml │ │ │ │ └── zul_latn.yaml │ │ │ └── prompted/ │ │ │ ├── _generate_config.py │ │ │ ├── _global_piqa.yaml │ │ │ ├── _template │ │ │ ├── acm_arab.yaml │ │ │ ├── acq_arab.yaml │ │ │ ├── aeb_arab.yaml │ │ │ ├── afb_arab.yaml │ │ │ ├── als_latn.yaml │ │ │ ├── 
amh_ethi.yaml │ │ │ ├── apc_arab_jord.yaml │ │ │ ├── apc_arab_leba.yaml │ │ │ ├── apc_arab_pale.yaml │ │ │ ├── apc_arab_syri.yaml │ │ │ ├── arb_arab.yaml │ │ │ ├── arq_arab.yaml │ │ │ ├── ars_arab.yaml │ │ │ ├── ary_arab.yaml │ │ │ ├── arz_arab.yaml │ │ │ ├── asm_beng.yaml │ │ │ ├── azj_latn.yaml │ │ │ ├── bam_latn.yaml │ │ │ ├── bel_cyrl.yaml │ │ │ ├── ben_beng.yaml │ │ │ ├── ben_latn.yaml │ │ │ ├── bho_deva.yaml │ │ │ ├── bos_latn.yaml │ │ │ ├── bsk_arab.yaml │ │ │ ├── bul_cyrl.yaml │ │ │ ├── cat_latn.yaml │ │ │ ├── ces_latn.yaml │ │ │ ├── ckb_arab.yaml │ │ │ ├── ckm_latn.yaml │ │ │ ├── cmn_hans.yaml │ │ │ ├── cmn_hant.yaml │ │ │ ├── deu_latn.yaml │ │ │ ├── dhd_deva.yaml │ │ │ ├── ekk_latn.yaml │ │ │ ├── ekp_latn.yaml │ │ │ ├── ell_grek.yaml │ │ │ ├── eng_latn.yaml │ │ │ ├── fao_latn.yaml │ │ │ ├── fin_latn.yaml │ │ │ ├── fra_latn_cana.yaml │ │ │ ├── fra_latn_fran.yaml │ │ │ ├── glg_latn.yaml │ │ │ ├── guj_gujr.yaml │ │ │ ├── hau_latn.yaml │ │ │ ├── haw_latn.yaml │ │ │ ├── heb_hebr.yaml │ │ │ ├── hin_deva.yaml │ │ │ ├── hrv_latn.yaml │ │ │ ├── hun_latn.yaml │ │ │ ├── hye_armn.yaml │ │ │ ├── ibo_latn.yaml │ │ │ ├── idu_latn.yaml │ │ │ ├── ind_latn.yaml │ │ │ ├── isl_latn.yaml │ │ │ ├── iso_latn.yaml │ │ │ ├── ita_latn.yaml │ │ │ ├── jav_latn.yaml │ │ │ ├── jpn_jpan.yaml │ │ │ ├── kan_knda.yaml │ │ │ ├── kat_geor.yaml │ │ │ ├── kaz_cyrl.yaml │ │ │ ├── kin_latn.yaml │ │ │ ├── kir_cyrl.yaml │ │ │ ├── kor_hang.yaml │ │ │ ├── lin_latn.yaml │ │ │ ├── lit_latn.yaml │ │ │ ├── luo_latn.yaml │ │ │ ├── mal_mlym.yaml │ │ │ ├── mar_deva.yaml │ │ │ ├── mkd_cyrl.yaml │ │ │ ├── mni_beng.yaml │ │ │ ├── mni_mtei.yaml │ │ │ ├── nag_latn.yaml │ │ │ ├── nld_latn.yaml │ │ │ ├── nno_latn.yaml │ │ │ ├── nob_latn.yaml │ │ │ ├── npi_deva.yaml │ │ │ ├── pan_guru.yaml │ │ │ ├── pcm_latn.yaml │ │ │ ├── pes_arab.yaml │ │ │ ├── pol_latn.yaml │ │ │ ├── por_latn_braz.yaml │ │ │ ├── por_latn_port.yaml │ │ │ ├── ron_latn.yaml │ │ │ ├── rus_cyrl.yaml │ │ │ ├── rwr_deva.yaml │ │ │ ├── sin_sinh.yaml │ 
│ │ ├── slk_latn.yaml │ │ │ ├── slk_latn_sari.yaml │ │ │ ├── slv_latn.yaml │ │ │ ├── slv_latn_cerk.yaml │ │ │ ├── snd_arab.yaml │ │ │ ├── snd_deva.yaml │ │ │ ├── spa_latn_mexi.yaml │ │ │ ├── spa_latn_peru.yaml │ │ │ ├── spa_latn_spai.yaml │ │ │ ├── srp_cyrl.yaml │ │ │ ├── srp_latn.yaml │ │ │ ├── swe_latn.yaml │ │ │ ├── swh_latn.yaml │ │ │ ├── tam_taml.yaml │ │ │ ├── tel_telu.yaml │ │ │ ├── tgl_latn.yaml │ │ │ ├── tha_thai.yaml │ │ │ ├── tur_latn.yaml │ │ │ ├── uig_arab.yaml │ │ │ ├── ukr_cyrl.yaml │ │ │ ├── urd_arab.yaml │ │ │ ├── urd_latn.yaml │ │ │ ├── urh_latn.yaml │ │ │ ├── uzn_latn.yaml │ │ │ ├── vie_latn.yaml │ │ │ ├── yor_latn.yaml │ │ │ ├── yue_hant.yaml │ │ │ ├── zsm_latn.yaml │ │ │ └── zul_latn.yaml │ │ ├── glue/ │ │ │ ├── README.md │ │ │ ├── cola/ │ │ │ │ └── default.yaml │ │ │ ├── mnli/ │ │ │ │ ├── default.yaml │ │ │ │ ├── mismatch.yaml │ │ │ │ └── utils.py │ │ │ ├── mrpc/ │ │ │ │ └── default.yaml │ │ │ ├── qnli/ │ │ │ │ └── default.yaml │ │ │ ├── qqp/ │ │ │ │ └── default.yaml │ │ │ ├── rte/ │ │ │ │ └── default.yaml │ │ │ ├── sst2/ │ │ │ │ └── default.yaml │ │ │ └── wnli/ │ │ │ └── default.yaml │ │ ├── gpqa/ │ │ │ ├── README.md │ │ │ ├── cot_n_shot/ │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── _gpqa_cot_n_shot_yaml │ │ │ │ ├── gpqa_diamond_cot_n_shot.yaml │ │ │ │ ├── gpqa_extended_cot_n_shot.yaml │ │ │ │ ├── gpqa_main_cot_n_shot.yaml │ │ │ │ └── utils.py │ │ │ ├── cot_zeroshot/ │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── _gpqa_cot_zeroshot_yaml │ │ │ │ ├── gpqa_diamond_cot_zeroshot.yaml │ │ │ │ ├── gpqa_extended_cot_zeroshot.yaml │ │ │ │ ├── gpqa_main_cot_zeroshot.yaml │ │ │ │ └── utils.py │ │ │ ├── generative/ │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── _gpqa_generative_n_shot_yaml │ │ │ │ ├── gpqa_diamond_generative_n_shot.yaml │ │ │ │ ├── gpqa_extended_generative_n_shot.yaml │ │ │ │ ├── gpqa_main_generative_n_shot.yaml │ │ │ │ └── utils.py │ │ │ ├── n_shot/ │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── _gpqa_n_shot_yaml │ │ │ │ ├── 
gpqa_diamond_n_shot.yaml │ │ │ │ ├── gpqa_extended_n_shot.yaml │ │ │ │ ├── gpqa_main_n_shot.yaml │ │ │ │ └── utils.py │ │ │ └── zeroshot/ │ │ │ ├── _generate_configs.py │ │ │ ├── _gpqa_zeroshot_yaml │ │ │ ├── gpqa_diamond_zeroshot.yaml │ │ │ ├── gpqa_extended_zeroshot.yaml │ │ │ ├── gpqa_main_zeroshot.yaml │ │ │ └── utils.py │ │ ├── graphwalks/ │ │ │ ├── README.md │ │ │ ├── graphwalks.yaml │ │ │ ├── graphwalks_128k.yaml │ │ │ ├── graphwalks_1M.yaml │ │ │ └── utils.py │ │ ├── groundcocoa/ │ │ │ ├── README.md │ │ │ ├── groundcocoa.yaml │ │ │ └── utils.py │ │ ├── gsm8k/ │ │ │ ├── README.md │ │ │ ├── gsm8k-cot-llama.yaml │ │ │ ├── gsm8k-cot-self-consistency.yaml │ │ │ ├── gsm8k-cot-zeroshot.yaml │ │ │ ├── gsm8k-cot.yaml │ │ │ └── gsm8k.yaml │ │ ├── gsm8k_platinum/ │ │ │ ├── README.md │ │ │ ├── gsm8k-platinum-cot-llama.yaml │ │ │ ├── gsm8k-platinum-cot-self-consistency.yaml │ │ │ ├── gsm8k-platinum-cot-zeroshot.yaml │ │ │ ├── gsm8k-platinum-cot.yaml │ │ │ └── gsm8k-platinum.yaml │ │ ├── gsm_plus/ │ │ │ ├── README.md │ │ │ ├── gsm_plus.yaml │ │ │ └── gsm_plus_mini.yaml │ │ ├── haerae/ │ │ │ ├── README.md │ │ │ ├── _default_haerae_yaml │ │ │ ├── _haerae.yaml │ │ │ ├── haerae_gk.yaml │ │ │ ├── haerae_hi.yaml │ │ │ ├── haerae_lw.yaml │ │ │ ├── haerae_rw.yaml │ │ │ └── haerae_sn.yaml │ │ ├── headqa/ │ │ │ ├── README.md │ │ │ ├── headqa_en.yaml │ │ │ └── headqa_es.yaml │ │ ├── hellaswag/ │ │ │ ├── README.md │ │ │ ├── hellaswag.yaml │ │ │ └── utils.py │ │ ├── hendrycks_ethics/ │ │ │ ├── README.md │ │ │ ├── commonsense.yaml │ │ │ ├── deontology.yaml │ │ │ ├── justice.yaml │ │ │ ├── utilitarianism.yaml │ │ │ ├── utilitarianism_original_yaml │ │ │ ├── utils.py │ │ │ └── virtue.yaml │ │ ├── hendrycks_math/ │ │ │ ├── README.md │ │ │ ├── hendrycks_math.yaml │ │ │ ├── hendrycks_math500.yaml │ │ │ ├── hendrycks_math_algebra.yaml │ │ │ ├── hendrycks_math_counting_and_prob.yaml │ │ │ ├── hendrycks_math_geometry.yaml │ │ │ ├── hendrycks_math_intermediate_algebra.yaml │ │ │ ├── 
hendrycks_math_num_theory.yaml │ │ │ ├── hendrycks_math_prealgebra.yaml │ │ │ ├── hendrycks_math_precalc.yaml │ │ │ └── utils.py │ │ ├── histoires_morales/ │ │ │ ├── README.md │ │ │ ├── histoires_morales.yaml │ │ │ └── utils.py │ │ ├── hrm8k/ │ │ │ ├── README.md │ │ │ ├── default/ │ │ │ │ ├── _hrm8k_yaml │ │ │ │ ├── hrm8k.yaml │ │ │ │ ├── hrm8k_gsm8k.yaml │ │ │ │ ├── hrm8k_ksm.yaml │ │ │ │ ├── hrm8k_math.yaml │ │ │ │ ├── hrm8k_mmmlu.yaml │ │ │ │ ├── hrm8k_omni_math.yaml │ │ │ │ └── utils.py │ │ │ └── en/ │ │ │ ├── _hrm8k_en_yaml │ │ │ ├── hrm8k_en.yaml │ │ │ ├── hrm8k_gsm8k_en.yaml │ │ │ ├── hrm8k_ksm_en.yaml │ │ │ ├── hrm8k_math_en.yaml │ │ │ ├── hrm8k_mmmlu_en.yaml │ │ │ ├── hrm8k_omni_math_en.yaml │ │ │ └── utils.py │ │ ├── humaneval/ │ │ │ ├── README.md │ │ │ ├── humaneval.yaml │ │ │ ├── humaneval_64.yaml │ │ │ ├── humaneval_64_instruct.yaml │ │ │ ├── humaneval_instruct.yaml │ │ │ ├── humaneval_plus.yaml │ │ │ └── utils.py │ │ ├── humaneval_infilling/ │ │ │ ├── README.md │ │ │ ├── humaneval_infilling.yaml │ │ │ ├── multi_line_infilling.yaml │ │ │ ├── random_span_infilling.yaml │ │ │ ├── random_span_infilling_light.yaml │ │ │ ├── single_line_infilling.yaml │ │ │ └── utils.py │ │ ├── icelandic_winogrande/ │ │ │ ├── README.md │ │ │ ├── default.yaml │ │ │ └── preprocess_winogrande.py │ │ ├── ifeval/ │ │ │ ├── README.md │ │ │ ├── ifeval.yaml │ │ │ ├── instructions.py │ │ │ ├── instructions_registry.py │ │ │ ├── instructions_util.py │ │ │ ├── multilingual/ │ │ │ │ ├── ifeval_ca.yaml │ │ │ │ ├── ifeval_es.yaml │ │ │ │ ├── instruction_utils/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── ca_instructions_util.py │ │ │ │ │ └── es_instructions_util.py │ │ │ │ ├── instructions/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── ca_instructions.py │ │ │ │ │ └── es_instructions.py │ │ │ │ ├── instructions_registry.py │ │ │ │ └── utils.py │ │ │ └── utils.py │ │ ├── include/ │ │ │ ├── README.md │ │ │ ├── default/ │ │ │ │ ├── Albanian/ │ │ │ │ │ ├── _albanian_template_yaml │ │ │ │ │ ├── 
_include_base_44_albanian.yaml │ │ │ │ │ ├── include_base_44_albanian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_albanian_business_commerce.yaml │ │ │ │ │ ├── include_base_44_albanian_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_albanian_social_science.yaml │ │ │ │ │ ├── include_base_44_albanian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Arabic/ │ │ │ │ │ ├── _arabic_template_yaml │ │ │ │ │ ├── _include_base_44_arabic.yaml │ │ │ │ │ ├── include_base_44_arabic_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_arabic_business_commerce.yaml │ │ │ │ │ ├── include_base_44_arabic_driving_license.yaml │ │ │ │ │ ├── include_base_44_arabic_general_knowledge.yaml │ │ │ │ │ ├── include_base_44_arabic_social_science.yaml │ │ │ │ │ ├── include_base_44_arabic_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Armenian/ │ │ │ │ │ ├── _armenian_template_yaml │ │ │ │ │ ├── _include_base_44_armenian.yaml │ │ │ │ │ ├── include_base_44_armenian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_armenian_driving_license.yaml │ │ │ │ │ ├── include_base_44_armenian_social_science.yaml │ │ │ │ │ ├── include_base_44_armenian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Azerbaijani/ │ │ │ │ │ ├── _azerbaijani_template_yaml │ │ │ │ │ ├── _include_base_44_azerbaijani.yaml │ │ │ │ │ ├── include_base_44_azerbaijani_applied_science.yaml │ │ │ │ │ ├── include_base_44_azerbaijani_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_azerbaijani_business_commerce.yaml │ │ │ │ │ ├── include_base_44_azerbaijani_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_azerbaijani_social_science.yaml │ │ │ │ │ ├── include_base_44_azerbaijani_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Basque/ │ │ │ │ │ ├── _basque_template_yaml │ │ │ │ │ ├── _include_base_44_basque.yaml │ │ │ │ │ ├── include_base_44_basque_professional_certification.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Belarusian/ │ │ │ │ │ ├── _belarusian_template_yaml │ │ │ │ │ ├── _include_base_44_belarusian.yaml │ │ │ │ │ 
├── include_base_44_belarusian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_belarusian_social_science.yaml │ │ │ │ │ ├── include_base_44_belarusian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Bengali/ │ │ │ │ │ ├── _bengali_template_yaml │ │ │ │ │ ├── _include_base_44_bengali.yaml │ │ │ │ │ ├── include_base_44_bengali_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_bengali_general_knowledge.yaml │ │ │ │ │ ├── include_base_44_bengali_professional_certification.yaml │ │ │ │ │ ├── include_base_44_bengali_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Bulgarian/ │ │ │ │ │ ├── _bulgarian_template_yaml │ │ │ │ │ ├── _include_base_44_bulgarian.yaml │ │ │ │ │ ├── include_base_44_bulgarian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_bulgarian_social_science.yaml │ │ │ │ │ ├── include_base_44_bulgarian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Chinese/ │ │ │ │ │ ├── _chinese_template_yaml │ │ │ │ │ ├── _include_base_44_chinese.yaml │ │ │ │ │ ├── include_base_44_chinese_applied_science.yaml │ │ │ │ │ ├── include_base_44_chinese_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_chinese_business_commerce.yaml │ │ │ │ │ ├── include_base_44_chinese_driving_license.yaml │ │ │ │ │ ├── include_base_44_chinese_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_chinese_professional_certification.yaml │ │ │ │ │ ├── include_base_44_chinese_social_science.yaml │ │ │ │ │ ├── include_base_44_chinese_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Croatian/ │ │ │ │ │ ├── _croatian_template_yaml │ │ │ │ │ ├── _include_base_44_croatian.yaml │ │ │ │ │ ├── include_base_44_croatian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_croatian_social_science.yaml │ │ │ │ │ ├── include_base_44_croatian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Dutch/ │ │ │ │ │ ├── _dutch_template_yaml │ │ │ │ │ ├── _include_base_44_dutch.yaml │ │ │ │ │ ├── include_base_44_dutch_applied_science.yaml │ │ │ │ │ ├── include_base_44_dutch_arts_humanities.yaml │ │ │ │ │ ├── 
include_base_44_dutch_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_dutch_social_science.yaml │ │ │ │ │ ├── include_base_44_dutch_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Estonian/ │ │ │ │ │ ├── _estonian_template_yaml │ │ │ │ │ ├── _include_base_44_estonian.yaml │ │ │ │ │ ├── include_base_44_estonian_applied_science.yaml │ │ │ │ │ ├── include_base_44_estonian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_estonian_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_estonian_social_science.yaml │ │ │ │ │ ├── include_base_44_estonian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Finnish/ │ │ │ │ │ ├── _finnish_template_yaml │ │ │ │ │ ├── _include_base_44_finnish.yaml │ │ │ │ │ ├── include_base_44_finnish_applied_science.yaml │ │ │ │ │ ├── include_base_44_finnish_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_finnish_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_finnish_social_science.yaml │ │ │ │ │ ├── include_base_44_finnish_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── French/ │ │ │ │ │ ├── _french_template_yaml │ │ │ │ │ ├── _include_base_44_french.yaml │ │ │ │ │ ├── include_base_44_french_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_french_driving_license.yaml │ │ │ │ │ ├── include_base_44_french_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_french_social_science.yaml │ │ │ │ │ ├── include_base_44_french_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Georgian/ │ │ │ │ │ ├── _georgian_template_yaml │ │ │ │ │ ├── _include_base_44_georgian.yaml │ │ │ │ │ ├── include_base_44_georgian_arts_humanities.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── German/ │ │ │ │ │ ├── _german_template_yaml │ │ │ │ │ ├── _include_base_44_german.yaml │ │ │ │ │ ├── include_base_44_german_driving_license.yaml │ │ │ │ │ ├── include_base_44_german_social_science.yaml │ │ │ │ │ ├── include_base_44_german_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Greek/ │ │ │ │ │ ├── _greek_template_yaml │ │ │ │ │ ├── _include_base_44_greek.yaml 
│ │ │ │ │ ├── include_base_44_greek_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_greek_business_commerce.yaml │ │ │ │ │ ├── include_base_44_greek_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_greek_medical_license.yaml │ │ │ │ │ ├── include_base_44_greek_professional_certification.yaml │ │ │ │ │ ├── include_base_44_greek_social_science.yaml │ │ │ │ │ ├── include_base_44_greek_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Hebrew/ │ │ │ │ │ ├── _hebrew_template_yaml │ │ │ │ │ ├── _include_base_44_hebrew.yaml │ │ │ │ │ ├── include_base_44_hebrew_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_hebrew_driving_license.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Hindi/ │ │ │ │ │ ├── _hindi_template_yaml │ │ │ │ │ ├── _include_base_44_hindi.yaml │ │ │ │ │ ├── include_base_44_hindi_applied_science.yaml │ │ │ │ │ ├── include_base_44_hindi_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_hindi_driving_license.yaml │ │ │ │ │ ├── include_base_44_hindi_general_knowledge.yaml │ │ │ │ │ ├── include_base_44_hindi_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_hindi_professional_certification.yaml │ │ │ │ │ ├── include_base_44_hindi_social_science.yaml │ │ │ │ │ ├── include_base_44_hindi_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Hungarian/ │ │ │ │ │ ├── _hungarian_template_yaml │ │ │ │ │ ├── _include_base_44_hungarian.yaml │ │ │ │ │ ├── include_base_44_hungarian_applied_science.yaml │ │ │ │ │ ├── include_base_44_hungarian_social_science.yaml │ │ │ │ │ ├── include_base_44_hungarian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Indonesian/ │ │ │ │ │ ├── _include_base_44_indonesian.yaml │ │ │ │ │ ├── _indonesian_template_yaml │ │ │ │ │ ├── include_base_44_indonesian_applied_science.yaml │ │ │ │ │ ├── include_base_44_indonesian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_indonesian_professional_certification.yaml │ │ │ │ │ ├── include_base_44_indonesian_social_science.yaml │ │ │ │ │ ├── include_base_44_indonesian_stem.yaml │ │ │ │ │ └── 
utils.py │ │ │ │ ├── Italian/ │ │ │ │ │ ├── _include_base_44_italian.yaml │ │ │ │ │ ├── _italian_template_yaml │ │ │ │ │ ├── include_base_44_italian_applied_science.yaml │ │ │ │ │ ├── include_base_44_italian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_italian_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_italian_professional_certification.yaml │ │ │ │ │ ├── include_base_44_italian_social_science.yaml │ │ │ │ │ ├── include_base_44_italian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Japanese/ │ │ │ │ │ ├── _include_base_44_japanese.yaml │ │ │ │ │ ├── _japanese_template_yaml │ │ │ │ │ ├── include_base_44_japanese_driving_license.yaml │ │ │ │ │ ├── include_base_44_japanese_medical_license.yaml │ │ │ │ │ ├── include_base_44_japanese_professional_certification.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Kazakh/ │ │ │ │ │ ├── _include_base_44_kazakh.yaml │ │ │ │ │ ├── _kazakh_template_yaml │ │ │ │ │ ├── include_base_44_kazakh_arts_humanities.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Korean/ │ │ │ │ │ ├── _include_base_44_korean.yaml │ │ │ │ │ ├── _korean_template_yaml │ │ │ │ │ ├── include_base_44_korean_professional_certification.yaml │ │ │ │ │ ├── include_base_44_korean_social_science.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Lithuanian/ │ │ │ │ │ ├── _include_base_44_lithuanian.yaml │ │ │ │ │ ├── _lithuanian_template_yaml │ │ │ │ │ ├── include_base_44_lithuanian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_lithuanian_business_commerce.yaml │ │ │ │ │ ├── include_base_44_lithuanian_professional_certification.yaml │ │ │ │ │ ├── include_base_44_lithuanian_social_science.yaml │ │ │ │ │ ├── include_base_44_lithuanian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Malay/ │ │ │ │ │ ├── _include_base_44_malay.yaml │ │ │ │ │ ├── _malay_template_yaml │ │ │ │ │ ├── include_base_44_malay_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_malay_business_commerce.yaml │ │ │ │ │ ├── include_base_44_malay_social_science.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── 
Malayalam/ │ │ │ │ │ ├── _include_base_44_malayalam.yaml │ │ │ │ │ ├── _malayalam_template_yaml │ │ │ │ │ ├── include_base_44_malayalam_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_malayalam_general_knowledge.yaml │ │ │ │ │ ├── include_base_44_malayalam_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_malayalam_marine_license.yaml │ │ │ │ │ ├── include_base_44_malayalam_social_science.yaml │ │ │ │ │ ├── include_base_44_malayalam_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Nepali/ │ │ │ │ │ ├── _include_base_44_nepali.yaml │ │ │ │ │ ├── _nepali_template_yaml │ │ │ │ │ ├── include_base_44_nepali_driving_license.yaml │ │ │ │ │ ├── include_base_44_nepali_professional_certification.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── North Macedonian/ │ │ │ │ │ ├── _include_base_44_north macedonian.yaml │ │ │ │ │ ├── _north macedonian_template_yaml │ │ │ │ │ ├── include_base_44_north macedonian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_north macedonian_business_commerce.yaml │ │ │ │ │ ├── include_base_44_north macedonian_social_science.yaml │ │ │ │ │ ├── include_base_44_north macedonian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Persian/ │ │ │ │ │ ├── _include_base_44_persian.yaml │ │ │ │ │ ├── _persian_template_yaml │ │ │ │ │ ├── include_base_44_persian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_persian_driving_license.yaml │ │ │ │ │ ├── include_base_44_persian_professional_certification.yaml │ │ │ │ │ ├── include_base_44_persian_social_science.yaml │ │ │ │ │ ├── include_base_44_persian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Polish/ │ │ │ │ │ ├── _include_base_44_polish.yaml │ │ │ │ │ ├── _polish_template_yaml │ │ │ │ │ ├── include_base_44_polish_professional_certification.yaml │ │ │ │ │ ├── include_base_44_polish_social_science.yaml │ │ │ │ │ ├── include_base_44_polish_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Portuguese/ │ │ │ │ │ ├── _include_base_44_portuguese.yaml │ │ │ │ │ ├── _portuguese_template_yaml │ │ │ │ │ ├── 
include_base_44_portuguese_applied_science.yaml │ │ │ │ │ ├── include_base_44_portuguese_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_portuguese_business_commerce.yaml │ │ │ │ │ ├── include_base_44_portuguese_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_portuguese_social_science.yaml │ │ │ │ │ ├── include_base_44_portuguese_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Russian/ │ │ │ │ │ ├── _include_base_44_russian.yaml │ │ │ │ │ ├── _russian_template_yaml │ │ │ │ │ ├── include_base_44_russian_applied_science.yaml │ │ │ │ │ ├── include_base_44_russian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_russian_business_commerce.yaml │ │ │ │ │ ├── include_base_44_russian_driving_license.yaml │ │ │ │ │ ├── include_base_44_russian_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_russian_marine_license.yaml │ │ │ │ │ ├── include_base_44_russian_social_science.yaml │ │ │ │ │ ├── include_base_44_russian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Serbian/ │ │ │ │ │ ├── _include_base_44_serbian.yaml │ │ │ │ │ ├── _serbian_template_yaml │ │ │ │ │ ├── include_base_44_serbian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_serbian_social_science.yaml │ │ │ │ │ ├── include_base_44_serbian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Spanish/ │ │ │ │ │ ├── _include_base_44_spanish.yaml │ │ │ │ │ ├── _spanish_template_yaml │ │ │ │ │ ├── include_base_44_spanish_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_spanish_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_spanish_social_science.yaml │ │ │ │ │ ├── include_base_44_spanish_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Tagalog/ │ │ │ │ │ ├── _include_base_44_tagalog.yaml │ │ │ │ │ ├── _tagalog_template_yaml │ │ │ │ │ ├── include_base_44_tagalog_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_tagalog_driving_license.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Tamil/ │ │ │ │ │ ├── _include_base_44_tamil.yaml │ │ │ │ │ ├── _tamil_template_yaml │ │ │ │ │ ├── 
include_base_44_tamil_general_knowledge.yaml │ │ │ │ │ ├── include_base_44_tamil_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Telugu/ │ │ │ │ │ ├── _include_base_44_telugu.yaml │ │ │ │ │ ├── _telugu_template_yaml │ │ │ │ │ ├── include_base_44_telugu_applied_science.yaml │ │ │ │ │ ├── include_base_44_telugu_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_telugu_social_science.yaml │ │ │ │ │ ├── include_base_44_telugu_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Turkish/ │ │ │ │ │ ├── _include_base_44_turkish.yaml │ │ │ │ │ ├── _turkish_template_yaml │ │ │ │ │ ├── include_base_44_turkish_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_turkish_business_commerce.yaml │ │ │ │ │ ├── include_base_44_turkish_social_science.yaml │ │ │ │ │ ├── include_base_44_turkish_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Ukrainian/ │ │ │ │ │ ├── _include_base_44_ukrainian.yaml │ │ │ │ │ ├── _ukrainian_template_yaml │ │ │ │ │ ├── include_base_44_ukrainian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_ukrainian_social_science.yaml │ │ │ │ │ ├── include_base_44_ukrainian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Urdu/ │ │ │ │ │ ├── _include_base_44_urdu.yaml │ │ │ │ │ ├── _urdu_template_yaml │ │ │ │ │ ├── include_base_44_urdu_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_urdu_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_urdu_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Uzbek/ │ │ │ │ │ ├── _include_base_44_uzbek.yaml │ │ │ │ │ ├── _uzbek_template_yaml │ │ │ │ │ ├── include_base_44_uzbek_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_uzbek_medical_license.yaml │ │ │ │ │ ├── include_base_44_uzbek_social_science.yaml │ │ │ │ │ ├── include_base_44_uzbek_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── Vietnamese/ │ │ │ │ ├── _include_base_44_vietnamese.yaml │ │ │ │ ├── _vietnamese_template_yaml │ │ │ │ ├── include_base_44_vietnamese_arts_humanities.yaml │ │ │ │ ├── include_base_44_vietnamese_social_science.yaml │ │ │ │ ├── 
include_base_44_vietnamese_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── few_shot_en/ │ │ │ │ ├── Albanian/ │ │ │ │ │ ├── _albanian_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_albanian.yaml │ │ │ │ │ ├── include_base_44_albanian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_albanian_business_commerce.yaml │ │ │ │ │ ├── include_base_44_albanian_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_albanian_social_science.yaml │ │ │ │ │ ├── include_base_44_albanian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Arabic/ │ │ │ │ │ ├── _arabic_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_arabic.yaml │ │ │ │ │ ├── include_base_44_arabic_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_arabic_business_commerce.yaml │ │ │ │ │ ├── include_base_44_arabic_driving_license.yaml │ │ │ │ │ ├── include_base_44_arabic_general_knowledge.yaml │ │ │ │ │ ├── include_base_44_arabic_social_science.yaml │ │ │ │ │ ├── include_base_44_arabic_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Armenian/ │ │ │ │ │ ├── _armenian_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_armenian.yaml │ │ │ │ │ ├── include_base_44_armenian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_armenian_driving_license.yaml │ │ │ │ │ ├── include_base_44_armenian_social_science.yaml │ │ │ │ │ ├── include_base_44_armenian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Azerbaijani/ │ │ │ │ │ ├── _azerbaijani_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_azerbaijani.yaml │ │ │ │ │ ├── include_base_44_azerbaijani_applied_science.yaml │ │ │ │ │ ├── include_base_44_azerbaijani_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_azerbaijani_business_commerce.yaml │ │ │ │ │ ├── include_base_44_azerbaijani_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_azerbaijani_social_science.yaml │ │ │ │ │ ├── include_base_44_azerbaijani_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Basque/ │ │ │ │ │ ├── _basque_few_shot_en_template_yaml │ │ │ │ │ ├── 
_include_base_44_basque.yaml │ │ │ │ │ ├── include_base_44_basque_professional_certification.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Belarusian/ │ │ │ │ │ ├── _belarusian_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_belarusian.yaml │ │ │ │ │ ├── include_base_44_belarusian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_belarusian_social_science.yaml │ │ │ │ │ ├── include_base_44_belarusian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Bengali/ │ │ │ │ │ ├── _bengali_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_bengali.yaml │ │ │ │ │ ├── include_base_44_bengali_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_bengali_general_knowledge.yaml │ │ │ │ │ ├── include_base_44_bengali_professional_certification.yaml │ │ │ │ │ ├── include_base_44_bengali_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Bulgarian/ │ │ │ │ │ ├── _bulgarian_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_bulgarian.yaml │ │ │ │ │ ├── include_base_44_bulgarian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_bulgarian_social_science.yaml │ │ │ │ │ ├── include_base_44_bulgarian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Chinese/ │ │ │ │ │ ├── _chinese_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_chinese.yaml │ │ │ │ │ ├── include_base_44_chinese_applied_science.yaml │ │ │ │ │ ├── include_base_44_chinese_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_chinese_business_commerce.yaml │ │ │ │ │ ├── include_base_44_chinese_driving_license.yaml │ │ │ │ │ ├── include_base_44_chinese_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_chinese_professional_certification.yaml │ │ │ │ │ ├── include_base_44_chinese_social_science.yaml │ │ │ │ │ ├── include_base_44_chinese_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Croatian/ │ │ │ │ │ ├── _croatian_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_croatian.yaml │ │ │ │ │ ├── include_base_44_croatian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_croatian_social_science.yaml │ │ │ │ │ ├── 
include_base_44_croatian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Dutch/ │ │ │ │ │ ├── _dutch_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_dutch.yaml │ │ │ │ │ ├── include_base_44_dutch_applied_science.yaml │ │ │ │ │ ├── include_base_44_dutch_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_dutch_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_dutch_social_science.yaml │ │ │ │ │ ├── include_base_44_dutch_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Estonian/ │ │ │ │ │ ├── _estonian_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_estonian.yaml │ │ │ │ │ ├── include_base_44_estonian_applied_science.yaml │ │ │ │ │ ├── include_base_44_estonian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_estonian_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_estonian_social_science.yaml │ │ │ │ │ ├── include_base_44_estonian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Finnish/ │ │ │ │ │ ├── _finnish_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_finnish.yaml │ │ │ │ │ ├── include_base_44_finnish_applied_science.yaml │ │ │ │ │ ├── include_base_44_finnish_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_finnish_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_finnish_social_science.yaml │ │ │ │ │ ├── include_base_44_finnish_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── French/ │ │ │ │ │ ├── _french_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_french.yaml │ │ │ │ │ ├── include_base_44_french_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_french_driving_license.yaml │ │ │ │ │ ├── include_base_44_french_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_french_social_science.yaml │ │ │ │ │ ├── include_base_44_french_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Georgian/ │ │ │ │ │ ├── _georgian_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_georgian.yaml │ │ │ │ │ ├── include_base_44_georgian_arts_humanities.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── German/ │ │ │ │ │ ├── 
_german_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_german.yaml │ │ │ │ │ ├── include_base_44_german_driving_license.yaml │ │ │ │ │ ├── include_base_44_german_social_science.yaml │ │ │ │ │ ├── include_base_44_german_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Greek/ │ │ │ │ │ ├── _greek_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_greek.yaml │ │ │ │ │ ├── include_base_44_greek_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_greek_business_commerce.yaml │ │ │ │ │ ├── include_base_44_greek_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_greek_medical_license.yaml │ │ │ │ │ ├── include_base_44_greek_professional_certification.yaml │ │ │ │ │ ├── include_base_44_greek_social_science.yaml │ │ │ │ │ ├── include_base_44_greek_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Hebrew/ │ │ │ │ │ ├── _hebrew_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_hebrew.yaml │ │ │ │ │ ├── include_base_44_hebrew_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_hebrew_driving_license.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Hindi/ │ │ │ │ │ ├── _hindi_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_hindi.yaml │ │ │ │ │ ├── include_base_44_hindi_applied_science.yaml │ │ │ │ │ ├── include_base_44_hindi_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_hindi_driving_license.yaml │ │ │ │ │ ├── include_base_44_hindi_general_knowledge.yaml │ │ │ │ │ ├── include_base_44_hindi_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_hindi_professional_certification.yaml │ │ │ │ │ ├── include_base_44_hindi_social_science.yaml │ │ │ │ │ ├── include_base_44_hindi_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Hungarian/ │ │ │ │ │ ├── _hungarian_few_shot_en_template_yaml │ │ │ │ │ ├── _include_base_44_hungarian.yaml │ │ │ │ │ ├── include_base_44_hungarian_applied_science.yaml │ │ │ │ │ ├── include_base_44_hungarian_social_science.yaml │ │ │ │ │ ├── include_base_44_hungarian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Indonesian/ │ │ │ │ │ 
├── _include_base_44_indonesian.yaml │ │ │ │ │ ├── _indonesian_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_indonesian_applied_science.yaml │ │ │ │ │ ├── include_base_44_indonesian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_indonesian_professional_certification.yaml │ │ │ │ │ ├── include_base_44_indonesian_social_science.yaml │ │ │ │ │ ├── include_base_44_indonesian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Italian/ │ │ │ │ │ ├── _include_base_44_italian.yaml │ │ │ │ │ ├── _italian_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_italian_applied_science.yaml │ │ │ │ │ ├── include_base_44_italian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_italian_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_italian_professional_certification.yaml │ │ │ │ │ ├── include_base_44_italian_social_science.yaml │ │ │ │ │ ├── include_base_44_italian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Japanese/ │ │ │ │ │ ├── _include_base_44_japanese.yaml │ │ │ │ │ ├── _japanese_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_japanese_driving_license.yaml │ │ │ │ │ ├── include_base_44_japanese_medical_license.yaml │ │ │ │ │ ├── include_base_44_japanese_professional_certification.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Kazakh/ │ │ │ │ │ ├── _include_base_44_kazakh.yaml │ │ │ │ │ ├── _kazakh_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_kazakh_arts_humanities.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Korean/ │ │ │ │ │ ├── _include_base_44_korean.yaml │ │ │ │ │ ├── _korean_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_korean_professional_certification.yaml │ │ │ │ │ ├── include_base_44_korean_social_science.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Lithuanian/ │ │ │ │ │ ├── _include_base_44_lithuanian.yaml │ │ │ │ │ ├── _lithuanian_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_lithuanian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_lithuanian_business_commerce.yaml │ │ │ │ │ ├── 
include_base_44_lithuanian_professional_certification.yaml │ │ │ │ │ ├── include_base_44_lithuanian_social_science.yaml │ │ │ │ │ ├── include_base_44_lithuanian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Malay/ │ │ │ │ │ ├── _include_base_44_malay.yaml │ │ │ │ │ ├── _malay_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_malay_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_malay_business_commerce.yaml │ │ │ │ │ ├── include_base_44_malay_social_science.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Malayalam/ │ │ │ │ │ ├── _include_base_44_malayalam.yaml │ │ │ │ │ ├── _malayalam_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_malayalam_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_malayalam_general_knowledge.yaml │ │ │ │ │ ├── include_base_44_malayalam_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_malayalam_marine_license.yaml │ │ │ │ │ ├── include_base_44_malayalam_social_science.yaml │ │ │ │ │ ├── include_base_44_malayalam_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Nepali/ │ │ │ │ │ ├── _include_base_44_nepali.yaml │ │ │ │ │ ├── _nepali_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_nepali_driving_license.yaml │ │ │ │ │ ├── include_base_44_nepali_professional_certification.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── North Macedonian/ │ │ │ │ │ ├── _include_base_44_north macedonian.yaml │ │ │ │ │ ├── _north macedonian_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_north macedonian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_north macedonian_business_commerce.yaml │ │ │ │ │ ├── include_base_44_north macedonian_social_science.yaml │ │ │ │ │ ├── include_base_44_north macedonian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Persian/ │ │ │ │ │ ├── _include_base_44_persian.yaml │ │ │ │ │ ├── _persian_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_persian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_persian_driving_license.yaml │ │ │ │ │ ├── include_base_44_persian_professional_certification.yaml │ │ │ 
│ │ ├── include_base_44_persian_social_science.yaml │ │ │ │ │ ├── include_base_44_persian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Polish/ │ │ │ │ │ ├── _include_base_44_polish.yaml │ │ │ │ │ ├── _polish_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_polish_professional_certification.yaml │ │ │ │ │ ├── include_base_44_polish_social_science.yaml │ │ │ │ │ ├── include_base_44_polish_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Portuguese/ │ │ │ │ │ ├── _include_base_44_portuguese.yaml │ │ │ │ │ ├── _portuguese_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_portuguese_applied_science.yaml │ │ │ │ │ ├── include_base_44_portuguese_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_portuguese_business_commerce.yaml │ │ │ │ │ ├── include_base_44_portuguese_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_portuguese_social_science.yaml │ │ │ │ │ ├── include_base_44_portuguese_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Russian/ │ │ │ │ │ ├── _include_base_44_russian.yaml │ │ │ │ │ ├── _russian_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_russian_applied_science.yaml │ │ │ │ │ ├── include_base_44_russian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_russian_business_commerce.yaml │ │ │ │ │ ├── include_base_44_russian_driving_license.yaml │ │ │ │ │ ├── include_base_44_russian_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_russian_marine_license.yaml │ │ │ │ │ ├── include_base_44_russian_social_science.yaml │ │ │ │ │ ├── include_base_44_russian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Serbian/ │ │ │ │ │ ├── _include_base_44_serbian.yaml │ │ │ │ │ ├── _serbian_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_serbian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_serbian_social_science.yaml │ │ │ │ │ ├── include_base_44_serbian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Spanish/ │ │ │ │ │ ├── _include_base_44_spanish.yaml │ │ │ │ │ ├── _spanish_few_shot_en_template_yaml │ │ │ │ │ ├── 
include_base_44_spanish_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_spanish_health_oriented_education.yaml │ │ │ │ │ ├── include_base_44_spanish_social_science.yaml │ │ │ │ │ ├── include_base_44_spanish_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Tagalog/ │ │ │ │ │ ├── _include_base_44_tagalog.yaml │ │ │ │ │ ├── _tagalog_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_tagalog_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_tagalog_driving_license.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Tamil/ │ │ │ │ │ ├── _include_base_44_tamil.yaml │ │ │ │ │ ├── _tamil_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_tamil_general_knowledge.yaml │ │ │ │ │ ├── include_base_44_tamil_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Telugu/ │ │ │ │ │ ├── _include_base_44_telugu.yaml │ │ │ │ │ ├── _telugu_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_telugu_applied_science.yaml │ │ │ │ │ ├── include_base_44_telugu_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_telugu_social_science.yaml │ │ │ │ │ ├── include_base_44_telugu_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Turkish/ │ │ │ │ │ ├── _include_base_44_turkish.yaml │ │ │ │ │ ├── _turkish_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_turkish_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_turkish_business_commerce.yaml │ │ │ │ │ ├── include_base_44_turkish_social_science.yaml │ │ │ │ │ ├── include_base_44_turkish_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Ukrainian/ │ │ │ │ │ ├── _include_base_44_ukrainian.yaml │ │ │ │ │ ├── _ukrainian_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_ukrainian_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_ukrainian_social_science.yaml │ │ │ │ │ ├── include_base_44_ukrainian_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Urdu/ │ │ │ │ │ ├── _include_base_44_urdu.yaml │ │ │ │ │ ├── _urdu_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_urdu_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_urdu_health_oriented_education.yaml │ │ │ │ 
│ ├── include_base_44_urdu_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ ├── Uzbek/ │ │ │ │ │ ├── _include_base_44_uzbek.yaml │ │ │ │ │ ├── _uzbek_few_shot_en_template_yaml │ │ │ │ │ ├── include_base_44_uzbek_arts_humanities.yaml │ │ │ │ │ ├── include_base_44_uzbek_medical_license.yaml │ │ │ │ │ ├── include_base_44_uzbek_social_science.yaml │ │ │ │ │ ├── include_base_44_uzbek_stem.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── Vietnamese/ │ │ │ │ ├── _include_base_44_vietnamese.yaml │ │ │ │ ├── _vietnamese_few_shot_en_template_yaml │ │ │ │ ├── include_base_44_vietnamese_arts_humanities.yaml │ │ │ │ ├── include_base_44_vietnamese_social_science.yaml │ │ │ │ ├── include_base_44_vietnamese_stem.yaml │ │ │ │ └── utils.py │ │ │ └── few_shot_og/ │ │ │ ├── Albanian/ │ │ │ │ ├── _albanian_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_albanian.yaml │ │ │ │ ├── include_base_44_albanian_arts_humanities.yaml │ │ │ │ ├── include_base_44_albanian_business_commerce.yaml │ │ │ │ ├── include_base_44_albanian_health_oriented_education.yaml │ │ │ │ ├── include_base_44_albanian_social_science.yaml │ │ │ │ ├── include_base_44_albanian_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Arabic/ │ │ │ │ ├── _arabic_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_arabic.yaml │ │ │ │ ├── include_base_44_arabic_arts_humanities.yaml │ │ │ │ ├── include_base_44_arabic_business_commerce.yaml │ │ │ │ ├── include_base_44_arabic_driving_license.yaml │ │ │ │ ├── include_base_44_arabic_general_knowledge.yaml │ │ │ │ ├── include_base_44_arabic_social_science.yaml │ │ │ │ ├── include_base_44_arabic_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Armenian/ │ │ │ │ ├── _armenian_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_armenian.yaml │ │ │ │ ├── include_base_44_armenian_arts_humanities.yaml │ │ │ │ ├── include_base_44_armenian_driving_license.yaml │ │ │ │ ├── include_base_44_armenian_social_science.yaml │ │ │ │ ├── include_base_44_armenian_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Azerbaijani/ │ │ │ │ 
├── _azerbaijani_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_azerbaijani.yaml │ │ │ │ ├── include_base_44_azerbaijani_applied_science.yaml │ │ │ │ ├── include_base_44_azerbaijani_arts_humanities.yaml │ │ │ │ ├── include_base_44_azerbaijani_business_commerce.yaml │ │ │ │ ├── include_base_44_azerbaijani_health_oriented_education.yaml │ │ │ │ ├── include_base_44_azerbaijani_social_science.yaml │ │ │ │ ├── include_base_44_azerbaijani_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Basque/ │ │ │ │ ├── _basque_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_basque.yaml │ │ │ │ ├── include_base_44_basque_professional_certification.yaml │ │ │ │ └── utils.py │ │ │ ├── Belarusian/ │ │ │ │ ├── _belarusian_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_belarusian.yaml │ │ │ │ ├── include_base_44_belarusian_arts_humanities.yaml │ │ │ │ ├── include_base_44_belarusian_social_science.yaml │ │ │ │ ├── include_base_44_belarusian_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Bengali/ │ │ │ │ ├── _bengali_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_bengali.yaml │ │ │ │ ├── include_base_44_bengali_arts_humanities.yaml │ │ │ │ ├── include_base_44_bengali_general_knowledge.yaml │ │ │ │ ├── include_base_44_bengali_professional_certification.yaml │ │ │ │ ├── include_base_44_bengali_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Bulgarian/ │ │ │ │ ├── _bulgarian_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_bulgarian.yaml │ │ │ │ ├── include_base_44_bulgarian_arts_humanities.yaml │ │ │ │ ├── include_base_44_bulgarian_social_science.yaml │ │ │ │ ├── include_base_44_bulgarian_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Chinese/ │ │ │ │ ├── _chinese_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_chinese.yaml │ │ │ │ ├── include_base_44_chinese_applied_science.yaml │ │ │ │ ├── include_base_44_chinese_arts_humanities.yaml │ │ │ │ ├── include_base_44_chinese_business_commerce.yaml │ │ │ │ ├── include_base_44_chinese_driving_license.yaml │ │ │ │ ├── 
include_base_44_chinese_health_oriented_education.yaml │ │ │ │ ├── include_base_44_chinese_professional_certification.yaml │ │ │ │ ├── include_base_44_chinese_social_science.yaml │ │ │ │ ├── include_base_44_chinese_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Croatian/ │ │ │ │ ├── _croatian_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_croatian.yaml │ │ │ │ ├── include_base_44_croatian_arts_humanities.yaml │ │ │ │ ├── include_base_44_croatian_social_science.yaml │ │ │ │ ├── include_base_44_croatian_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Dutch/ │ │ │ │ ├── _dutch_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_dutch.yaml │ │ │ │ ├── include_base_44_dutch_applied_science.yaml │ │ │ │ ├── include_base_44_dutch_arts_humanities.yaml │ │ │ │ ├── include_base_44_dutch_health_oriented_education.yaml │ │ │ │ ├── include_base_44_dutch_social_science.yaml │ │ │ │ ├── include_base_44_dutch_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Estonian/ │ │ │ │ ├── _estonian_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_estonian.yaml │ │ │ │ ├── include_base_44_estonian_applied_science.yaml │ │ │ │ ├── include_base_44_estonian_arts_humanities.yaml │ │ │ │ ├── include_base_44_estonian_health_oriented_education.yaml │ │ │ │ ├── include_base_44_estonian_social_science.yaml │ │ │ │ ├── include_base_44_estonian_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Finnish/ │ │ │ │ ├── _finnish_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_finnish.yaml │ │ │ │ ├── include_base_44_finnish_applied_science.yaml │ │ │ │ ├── include_base_44_finnish_arts_humanities.yaml │ │ │ │ ├── include_base_44_finnish_health_oriented_education.yaml │ │ │ │ ├── include_base_44_finnish_social_science.yaml │ │ │ │ ├── include_base_44_finnish_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── French/ │ │ │ │ ├── _french_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_french.yaml │ │ │ │ ├── include_base_44_french_arts_humanities.yaml │ │ │ │ ├── include_base_44_french_driving_license.yaml │ │ │ │ ├── 
include_base_44_french_health_oriented_education.yaml │ │ │ │ ├── include_base_44_french_social_science.yaml │ │ │ │ ├── include_base_44_french_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Georgian/ │ │ │ │ ├── _georgian_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_georgian.yaml │ │ │ │ ├── include_base_44_georgian_arts_humanities.yaml │ │ │ │ └── utils.py │ │ │ ├── German/ │ │ │ │ ├── _german_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_german.yaml │ │ │ │ ├── include_base_44_german_driving_license.yaml │ │ │ │ ├── include_base_44_german_social_science.yaml │ │ │ │ ├── include_base_44_german_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Greek/ │ │ │ │ ├── _greek_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_greek.yaml │ │ │ │ ├── include_base_44_greek_arts_humanities.yaml │ │ │ │ ├── include_base_44_greek_business_commerce.yaml │ │ │ │ ├── include_base_44_greek_health_oriented_education.yaml │ │ │ │ ├── include_base_44_greek_medical_license.yaml │ │ │ │ ├── include_base_44_greek_professional_certification.yaml │ │ │ │ ├── include_base_44_greek_social_science.yaml │ │ │ │ ├── include_base_44_greek_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Hebrew/ │ │ │ │ ├── _hebrew_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_hebrew.yaml │ │ │ │ ├── include_base_44_hebrew_arts_humanities.yaml │ │ │ │ ├── include_base_44_hebrew_driving_license.yaml │ │ │ │ └── utils.py │ │ │ ├── Hindi/ │ │ │ │ ├── _hindi_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_hindi.yaml │ │ │ │ ├── include_base_44_hindi_applied_science.yaml │ │ │ │ ├── include_base_44_hindi_arts_humanities.yaml │ │ │ │ ├── include_base_44_hindi_driving_license.yaml │ │ │ │ ├── include_base_44_hindi_general_knowledge.yaml │ │ │ │ ├── include_base_44_hindi_health_oriented_education.yaml │ │ │ │ ├── include_base_44_hindi_professional_certification.yaml │ │ │ │ ├── include_base_44_hindi_social_science.yaml │ │ │ │ ├── include_base_44_hindi_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Hungarian/ 
│ │ │ │ ├── _hungarian_few_shot_og_template_yaml │ │ │ │ ├── _include_base_44_hungarian.yaml │ │ │ │ ├── include_base_44_hungarian_applied_science.yaml │ │ │ │ ├── include_base_44_hungarian_social_science.yaml │ │ │ │ ├── include_base_44_hungarian_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Indonesian/ │ │ │ │ ├── _include_base_44_indonesian.yaml │ │ │ │ ├── _indonesian_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_indonesian_applied_science.yaml │ │ │ │ ├── include_base_44_indonesian_arts_humanities.yaml │ │ │ │ ├── include_base_44_indonesian_professional_certification.yaml │ │ │ │ ├── include_base_44_indonesian_social_science.yaml │ │ │ │ ├── include_base_44_indonesian_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Italian/ │ │ │ │ ├── _include_base_44_italian.yaml │ │ │ │ ├── _italian_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_italian_applied_science.yaml │ │ │ │ ├── include_base_44_italian_arts_humanities.yaml │ │ │ │ ├── include_base_44_italian_health_oriented_education.yaml │ │ │ │ ├── include_base_44_italian_professional_certification.yaml │ │ │ │ ├── include_base_44_italian_social_science.yaml │ │ │ │ ├── include_base_44_italian_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Japanese/ │ │ │ │ ├── _include_base_44_japanese.yaml │ │ │ │ ├── _japanese_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_japanese_driving_license.yaml │ │ │ │ ├── include_base_44_japanese_medical_license.yaml │ │ │ │ ├── include_base_44_japanese_professional_certification.yaml │ │ │ │ └── utils.py │ │ │ ├── Kazakh/ │ │ │ │ ├── _include_base_44_kazakh.yaml │ │ │ │ ├── _kazakh_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_kazakh_arts_humanities.yaml │ │ │ │ └── utils.py │ │ │ ├── Korean/ │ │ │ │ ├── _include_base_44_korean.yaml │ │ │ │ ├── _korean_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_korean_professional_certification.yaml │ │ │ │ ├── include_base_44_korean_social_science.yaml │ │ │ │ └── utils.py │ │ │ ├── Lithuanian/ │ │ │ │ ├── 
_include_base_44_lithuanian.yaml │ │ │ │ ├── _lithuanian_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_lithuanian_arts_humanities.yaml │ │ │ │ ├── include_base_44_lithuanian_business_commerce.yaml │ │ │ │ ├── include_base_44_lithuanian_professional_certification.yaml │ │ │ │ ├── include_base_44_lithuanian_social_science.yaml │ │ │ │ ├── include_base_44_lithuanian_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Malay/ │ │ │ │ ├── _include_base_44_malay.yaml │ │ │ │ ├── _malay_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_malay_arts_humanities.yaml │ │ │ │ ├── include_base_44_malay_business_commerce.yaml │ │ │ │ ├── include_base_44_malay_social_science.yaml │ │ │ │ └── utils.py │ │ │ ├── Malayalam/ │ │ │ │ ├── _include_base_44_malayalam.yaml │ │ │ │ ├── _malayalam_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_malayalam_arts_humanities.yaml │ │ │ │ ├── include_base_44_malayalam_general_knowledge.yaml │ │ │ │ ├── include_base_44_malayalam_health_oriented_education.yaml │ │ │ │ ├── include_base_44_malayalam_marine_license.yaml │ │ │ │ ├── include_base_44_malayalam_social_science.yaml │ │ │ │ ├── include_base_44_malayalam_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Nepali/ │ │ │ │ ├── _include_base_44_nepali.yaml │ │ │ │ ├── _nepali_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_nepali_driving_license.yaml │ │ │ │ ├── include_base_44_nepali_professional_certification.yaml │ │ │ │ └── utils.py │ │ │ ├── North Macedonian/ │ │ │ │ ├── _include_base_44_north macedonian.yaml │ │ │ │ ├── _north macedonian_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_north macedonian_arts_humanities.yaml │ │ │ │ ├── include_base_44_north macedonian_business_commerce.yaml │ │ │ │ ├── include_base_44_north macedonian_social_science.yaml │ │ │ │ ├── include_base_44_north macedonian_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Persian/ │ │ │ │ ├── _include_base_44_persian.yaml │ │ │ │ ├── _persian_few_shot_og_template_yaml │ │ │ │ ├── 
include_base_44_persian_arts_humanities.yaml │ │ │ │ ├── include_base_44_persian_driving_license.yaml │ │ │ │ ├── include_base_44_persian_professional_certification.yaml │ │ │ │ ├── include_base_44_persian_social_science.yaml │ │ │ │ ├── include_base_44_persian_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Polish/ │ │ │ │ ├── _include_base_44_polish.yaml │ │ │ │ ├── _polish_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_polish_professional_certification.yaml │ │ │ │ ├── include_base_44_polish_social_science.yaml │ │ │ │ ├── include_base_44_polish_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Portuguese/ │ │ │ │ ├── _include_base_44_portuguese.yaml │ │ │ │ ├── _portuguese_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_portuguese_applied_science.yaml │ │ │ │ ├── include_base_44_portuguese_arts_humanities.yaml │ │ │ │ ├── include_base_44_portuguese_business_commerce.yaml │ │ │ │ ├── include_base_44_portuguese_health_oriented_education.yaml │ │ │ │ ├── include_base_44_portuguese_social_science.yaml │ │ │ │ ├── include_base_44_portuguese_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Russian/ │ │ │ │ ├── _include_base_44_russian.yaml │ │ │ │ ├── _russian_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_russian_applied_science.yaml │ │ │ │ ├── include_base_44_russian_arts_humanities.yaml │ │ │ │ ├── include_base_44_russian_business_commerce.yaml │ │ │ │ ├── include_base_44_russian_driving_license.yaml │ │ │ │ ├── include_base_44_russian_health_oriented_education.yaml │ │ │ │ ├── include_base_44_russian_marine_license.yaml │ │ │ │ ├── include_base_44_russian_social_science.yaml │ │ │ │ ├── include_base_44_russian_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Serbian/ │ │ │ │ ├── _include_base_44_serbian.yaml │ │ │ │ ├── _serbian_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_serbian_arts_humanities.yaml │ │ │ │ ├── include_base_44_serbian_social_science.yaml │ │ │ │ ├── include_base_44_serbian_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Spanish/ │ │ │ │ ├── 
_include_base_44_spanish.yaml │ │ │ │ ├── _spanish_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_spanish_arts_humanities.yaml │ │ │ │ ├── include_base_44_spanish_health_oriented_education.yaml │ │ │ │ ├── include_base_44_spanish_social_science.yaml │ │ │ │ ├── include_base_44_spanish_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Tagalog/ │ │ │ │ ├── _include_base_44_tagalog.yaml │ │ │ │ ├── _tagalog_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_tagalog_arts_humanities.yaml │ │ │ │ ├── include_base_44_tagalog_driving_license.yaml │ │ │ │ └── utils.py │ │ │ ├── Tamil/ │ │ │ │ ├── _include_base_44_tamil.yaml │ │ │ │ ├── _tamil_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_tamil_general_knowledge.yaml │ │ │ │ ├── include_base_44_tamil_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Telugu/ │ │ │ │ ├── _include_base_44_telugu.yaml │ │ │ │ ├── _telugu_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_telugu_applied_science.yaml │ │ │ │ ├── include_base_44_telugu_arts_humanities.yaml │ │ │ │ ├── include_base_44_telugu_social_science.yaml │ │ │ │ ├── include_base_44_telugu_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Turkish/ │ │ │ │ ├── _include_base_44_turkish.yaml │ │ │ │ ├── _turkish_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_turkish_arts_humanities.yaml │ │ │ │ ├── include_base_44_turkish_business_commerce.yaml │ │ │ │ ├── include_base_44_turkish_social_science.yaml │ │ │ │ ├── include_base_44_turkish_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Ukrainian/ │ │ │ │ ├── _include_base_44_ukrainian.yaml │ │ │ │ ├── _ukrainian_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_ukrainian_arts_humanities.yaml │ │ │ │ ├── include_base_44_ukrainian_social_science.yaml │ │ │ │ ├── include_base_44_ukrainian_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Urdu/ │ │ │ │ ├── _include_base_44_urdu.yaml │ │ │ │ ├── _urdu_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_urdu_arts_humanities.yaml │ │ │ │ ├── include_base_44_urdu_health_oriented_education.yaml │ │ │ 
│ ├── include_base_44_urdu_stem.yaml │ │ │ │ └── utils.py │ │ │ ├── Uzbek/ │ │ │ │ ├── _include_base_44_uzbek.yaml │ │ │ │ ├── _uzbek_few_shot_og_template_yaml │ │ │ │ ├── include_base_44_uzbek_arts_humanities.yaml │ │ │ │ ├── include_base_44_uzbek_medical_license.yaml │ │ │ │ ├── include_base_44_uzbek_social_science.yaml │ │ │ │ ├── include_base_44_uzbek_stem.yaml │ │ │ │ └── utils.py │ │ │ └── Vietnamese/ │ │ │ ├── _include_base_44_vietnamese.yaml │ │ │ ├── _vietnamese_few_shot_og_template_yaml │ │ │ ├── include_base_44_vietnamese_arts_humanities.yaml │ │ │ ├── include_base_44_vietnamese_social_science.yaml │ │ │ ├── include_base_44_vietnamese_stem.yaml │ │ │ └── utils.py │ │ ├── inverse_scaling/ │ │ │ ├── README.md │ │ │ ├── _inverse_scaling_mc_yaml │ │ │ ├── _some_results │ │ │ ├── inverse_scaling_hindsight_neglect.yaml │ │ │ ├── inverse_scaling_into_the_unknown.yaml │ │ │ ├── inverse_scaling_memo_trap.yaml │ │ │ ├── inverse_scaling_modus_tollens.yaml │ │ │ ├── inverse_scaling_neqa.yaml │ │ │ ├── inverse_scaling_pattern_matching_suppression.yaml │ │ │ ├── inverse_scaling_quote_repetition.yaml │ │ │ ├── inverse_scaling_redefine_math.yaml │ │ │ ├── inverse_scaling_repetitive_algebra.yaml │ │ │ ├── inverse_scaling_sig_figs.yaml │ │ │ └── inverse_scaling_winobias_antistereotype.yaml │ │ ├── japanese_leaderboard/ │ │ │ ├── README.md │ │ │ ├── _ja_leaderboard.yaml │ │ │ ├── ja_leaderboard_jaqket_v2.yaml │ │ │ ├── ja_leaderboard_jcommonsenseqa.py │ │ │ ├── ja_leaderboard_jcommonsenseqa.yaml │ │ │ ├── ja_leaderboard_jnli.yaml │ │ │ ├── ja_leaderboard_jsquad.yaml │ │ │ ├── ja_leaderboard_marc_ja.yaml │ │ │ ├── ja_leaderboard_mgsm.py │ │ │ ├── ja_leaderboard_mgsm.yaml │ │ │ ├── ja_leaderboard_xlsum.py │ │ │ ├── ja_leaderboard_xlsum.yaml │ │ │ ├── ja_leaderboard_xwinograd.py │ │ │ ├── ja_leaderboard_xwinograd.yaml │ │ │ └── requirements.txt │ │ ├── jfinqa/ │ │ │ ├── README.md │ │ │ ├── _jfinqa.yaml │ │ │ ├── jfinqa_consistency.yaml │ │ │ ├── jfinqa_numerical.yaml │ │ │ 
├── jfinqa_temporal.yaml │ │ │ ├── test_jfinqa_utils.py │ │ │ └── utils.py │ │ ├── jsonschema_bench/ │ │ │ ├── README.md │ │ │ ├── jsonschema_bench_easy.yaml │ │ │ ├── jsonschema_bench_hard.yaml │ │ │ ├── jsonschema_bench_medium.yaml │ │ │ └── metrics.py │ │ ├── kbl/ │ │ │ ├── README.md │ │ │ ├── bar_exam/ │ │ │ │ ├── civil/ │ │ │ │ │ ├── _base_em_yaml │ │ │ │ │ ├── kbl_bar_exam_em_civil_2012.yaml │ │ │ │ │ ├── kbl_bar_exam_em_civil_2013.yaml │ │ │ │ │ ├── kbl_bar_exam_em_civil_2014.yaml │ │ │ │ │ ├── kbl_bar_exam_em_civil_2015.yaml │ │ │ │ │ ├── kbl_bar_exam_em_civil_2016.yaml │ │ │ │ │ ├── kbl_bar_exam_em_civil_2017.yaml │ │ │ │ │ ├── kbl_bar_exam_em_civil_2018.yaml │ │ │ │ │ ├── kbl_bar_exam_em_civil_2019.yaml │ │ │ │ │ ├── kbl_bar_exam_em_civil_2020.yaml │ │ │ │ │ ├── kbl_bar_exam_em_civil_2021.yaml │ │ │ │ │ ├── kbl_bar_exam_em_civil_2022.yaml │ │ │ │ │ ├── kbl_bar_exam_em_civil_2023.yaml │ │ │ │ │ ├── kbl_bar_exam_em_civil_2024.yaml │ │ │ │ │ └── kbl_bar_exam_em_civil_2025.yaml │ │ │ │ ├── criminal/ │ │ │ │ │ ├── _base_em_yaml │ │ │ │ │ ├── kbl_bar_exam_em_criminal_2012.yaml │ │ │ │ │ ├── kbl_bar_exam_em_criminal_2013.yaml │ │ │ │ │ ├── kbl_bar_exam_em_criminal_2014.yaml │ │ │ │ │ ├── kbl_bar_exam_em_criminal_2015.yaml │ │ │ │ │ ├── kbl_bar_exam_em_criminal_2016.yaml │ │ │ │ │ ├── kbl_bar_exam_em_criminal_2017.yaml │ │ │ │ │ ├── kbl_bar_exam_em_criminal_2018.yaml │ │ │ │ │ ├── kbl_bar_exam_em_criminal_2019.yaml │ │ │ │ │ ├── kbl_bar_exam_em_criminal_2020.yaml │ │ │ │ │ ├── kbl_bar_exam_em_criminal_2021.yaml │ │ │ │ │ ├── kbl_bar_exam_em_criminal_2022.yaml │ │ │ │ │ ├── kbl_bar_exam_em_criminal_2023.yaml │ │ │ │ │ └── kbl_bar_exam_em_criminal_2024.yaml │ │ │ │ ├── public/ │ │ │ │ │ ├── _base_em_yaml │ │ │ │ │ ├── kbl_bar_exam_em_public_2012.yaml │ │ │ │ │ ├── kbl_bar_exam_em_public_2013.yaml │ │ │ │ │ ├── kbl_bar_exam_em_public_2014.yaml │ │ │ │ │ ├── kbl_bar_exam_em_public_2015.yaml │ │ │ │ │ ├── kbl_bar_exam_em_public_2016.yaml │ │ │ │ │ ├── 
kbl_bar_exam_em_public_2017.yaml │ │ │ │ │ ├── kbl_bar_exam_em_public_2018.yaml │ │ │ │ │ ├── kbl_bar_exam_em_public_2019.yaml │ │ │ │ │ ├── kbl_bar_exam_em_public_2020.yaml │ │ │ │ │ ├── kbl_bar_exam_em_public_2021.yaml │ │ │ │ │ ├── kbl_bar_exam_em_public_2022.yaml │ │ │ │ │ ├── kbl_bar_exam_em_public_2023.yaml │ │ │ │ │ └── kbl_bar_exam_em_public_2024.yaml │ │ │ │ └── responsibility/ │ │ │ │ ├── _base_em_yaml │ │ │ │ ├── kbl_bar_exam_em_responsibility_2010.yaml │ │ │ │ ├── kbl_bar_exam_em_responsibility_2011.yaml │ │ │ │ ├── kbl_bar_exam_em_responsibility_2012.yaml │ │ │ │ ├── kbl_bar_exam_em_responsibility_2013.yaml │ │ │ │ ├── kbl_bar_exam_em_responsibility_2014.yaml │ │ │ │ ├── kbl_bar_exam_em_responsibility_2015.yaml │ │ │ │ ├── kbl_bar_exam_em_responsibility_2016.yaml │ │ │ │ ├── kbl_bar_exam_em_responsibility_2017.yaml │ │ │ │ ├── kbl_bar_exam_em_responsibility_2018.yaml │ │ │ │ ├── kbl_bar_exam_em_responsibility_2019.yaml │ │ │ │ ├── kbl_bar_exam_em_responsibility_2020.yaml │ │ │ │ ├── kbl_bar_exam_em_responsibility_2021.yaml │ │ │ │ ├── kbl_bar_exam_em_responsibility_2022.yaml │ │ │ │ └── kbl_bar_exam_em_responsibility_2023.yaml │ │ │ ├── knowledge/ │ │ │ │ ├── _kbl_knowledge_yaml │ │ │ │ ├── kbl_common_legal_mistake_qa_em.yaml │ │ │ │ ├── kbl_common_legal_mistake_qa_reasoning_em.yaml │ │ │ │ ├── kbl_legal_concept_qa_em.yaml │ │ │ │ ├── kbl_offense_component_qa_em.yaml │ │ │ │ ├── kbl_query_statute_matching_qa_em.yaml │ │ │ │ ├── kbl_statute_hallucination_qa_em.yaml │ │ │ │ └── kbl_statute_number_and_content_matching_qa_em.yaml │ │ │ └── reasoning/ │ │ │ ├── _kbl_reasoning_yaml │ │ │ ├── kbl_case_relevance_qa_p_em.yaml │ │ │ ├── kbl_case_relevance_qa_q_em.yaml │ │ │ ├── kbl_causal_reasoning_em.yaml │ │ │ └── kbl_statement_consistency_qa_em.yaml │ │ ├── kmmlu/ │ │ │ ├── README.md │ │ │ ├── cot_hard/ │ │ │ │ ├── _cot_kmmlu_yaml │ │ │ │ ├── _kmmlu_cot_hard.yaml │ │ │ │ ├── _kmmlu_cot_hard_applied_science.yaml │ │ │ │ ├── _kmmlu_cot_hard_humss.yaml │ │ │ │ 
├── _kmmlu_cot_hard_other.yaml │ │ │ │ ├── _kmmlu_cot_hard_stem.yaml │ │ │ │ ├── kmmlu_cot_hard_accounting.yaml │ │ │ │ ├── kmmlu_cot_hard_agricultural_sciences.yaml │ │ │ │ ├── kmmlu_cot_hard_aviation_engineering_and_maintenance.yaml │ │ │ │ ├── kmmlu_cot_hard_biology.yaml │ │ │ │ ├── kmmlu_cot_hard_chemical_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_chemistry.yaml │ │ │ │ ├── kmmlu_cot_hard_civil_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_computer_science.yaml │ │ │ │ ├── kmmlu_cot_hard_construction.yaml │ │ │ │ ├── kmmlu_cot_hard_criminal_law.yaml │ │ │ │ ├── kmmlu_cot_hard_ecology.yaml │ │ │ │ ├── kmmlu_cot_hard_economics.yaml │ │ │ │ ├── kmmlu_cot_hard_education.yaml │ │ │ │ ├── kmmlu_cot_hard_electrical_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_electronics_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_energy_management.yaml │ │ │ │ ├── kmmlu_cot_hard_environmental_science.yaml │ │ │ │ ├── kmmlu_cot_hard_fashion.yaml │ │ │ │ ├── kmmlu_cot_hard_food_processing.yaml │ │ │ │ ├── kmmlu_cot_hard_gas_technology_and_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_geomatics.yaml │ │ │ │ ├── kmmlu_cot_hard_health.yaml │ │ │ │ ├── kmmlu_cot_hard_industrial_engineer.yaml │ │ │ │ ├── kmmlu_cot_hard_information_technology.yaml │ │ │ │ ├── kmmlu_cot_hard_interior_architecture_and_design.yaml │ │ │ │ ├── kmmlu_cot_hard_korean_history.yaml │ │ │ │ ├── kmmlu_cot_hard_law.yaml │ │ │ │ ├── kmmlu_cot_hard_machine_design_and_manufacturing.yaml │ │ │ │ ├── kmmlu_cot_hard_management.yaml │ │ │ │ ├── kmmlu_cot_hard_maritime_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_marketing.yaml │ │ │ │ ├── kmmlu_cot_hard_materials_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_math.yaml │ │ │ │ ├── kmmlu_cot_hard_mechanical_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_nondestructive_testing.yaml │ │ │ │ ├── kmmlu_cot_hard_patent.yaml │ │ │ │ ├── kmmlu_cot_hard_political_science_and_sociology.yaml │ │ │ │ ├── kmmlu_cot_hard_psychology.yaml │ │ │ │ ├── kmmlu_cot_hard_public_safety.yaml │ │ │ │ ├── 
kmmlu_cot_hard_railway_and_automotive_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_real_estate.yaml │ │ │ │ ├── kmmlu_cot_hard_refrigerating_machinery.yaml │ │ │ │ ├── kmmlu_cot_hard_social_welfare.yaml │ │ │ │ ├── kmmlu_cot_hard_taxation.yaml │ │ │ │ └── kmmlu_cot_hard_telecommunications_and_wireless_technology.yaml │ │ │ ├── default/ │ │ │ │ ├── _default_kmmlu_yaml │ │ │ │ ├── _kmmlu_applied_science.yaml │ │ │ │ ├── _kmmlu_default.yaml │ │ │ │ ├── _kmmlu_humss.yaml │ │ │ │ ├── _kmmlu_other.yaml │ │ │ │ ├── _kmmlu_stem.yaml │ │ │ │ ├── kmmlu_accounting.yaml │ │ │ │ ├── kmmlu_agricultural_sciences.yaml │ │ │ │ ├── kmmlu_aviation_engineering_and_maintenance.yaml │ │ │ │ ├── kmmlu_biology.yaml │ │ │ │ ├── kmmlu_chemical_engineering.yaml │ │ │ │ ├── kmmlu_chemistry.yaml │ │ │ │ ├── kmmlu_civil_engineering.yaml │ │ │ │ ├── kmmlu_computer_science.yaml │ │ │ │ ├── kmmlu_construction.yaml │ │ │ │ ├── kmmlu_criminal_law.yaml │ │ │ │ ├── kmmlu_ecology.yaml │ │ │ │ ├── kmmlu_economics.yaml │ │ │ │ ├── kmmlu_education.yaml │ │ │ │ ├── kmmlu_electrical_engineering.yaml │ │ │ │ ├── kmmlu_electronics_engineering.yaml │ │ │ │ ├── kmmlu_energy_management.yaml │ │ │ │ ├── kmmlu_environmental_science.yaml │ │ │ │ ├── kmmlu_fashion.yaml │ │ │ │ ├── kmmlu_food_processing.yaml │ │ │ │ ├── kmmlu_gas_technology_and_engineering.yaml │ │ │ │ ├── kmmlu_geomatics.yaml │ │ │ │ ├── kmmlu_health.yaml │ │ │ │ ├── kmmlu_industrial_engineer.yaml │ │ │ │ ├── kmmlu_information_technology.yaml │ │ │ │ ├── kmmlu_interior_architecture_and_design.yaml │ │ │ │ ├── kmmlu_korean_history.yaml │ │ │ │ ├── kmmlu_law.yaml │ │ │ │ ├── kmmlu_machine_design_and_manufacturing.yaml │ │ │ │ ├── kmmlu_management.yaml │ │ │ │ ├── kmmlu_maritime_engineering.yaml │ │ │ │ ├── kmmlu_marketing.yaml │ │ │ │ ├── kmmlu_materials_engineering.yaml │ │ │ │ ├── kmmlu_math.yaml │ │ │ │ ├── kmmlu_mechanical_engineering.yaml │ │ │ │ ├── kmmlu_nondestructive_testing.yaml │ │ │ │ ├── kmmlu_patent.yaml │ │ │ │ ├── 
kmmlu_political_science_and_sociology.yaml │ │ │ │ ├── kmmlu_psychology.yaml │ │ │ │ ├── kmmlu_public_safety.yaml │ │ │ │ ├── kmmlu_railway_and_automotive_engineering.yaml │ │ │ │ ├── kmmlu_real_estate.yaml │ │ │ │ ├── kmmlu_refrigerating_machinery.yaml │ │ │ │ ├── kmmlu_social_welfare.yaml │ │ │ │ ├── kmmlu_taxation.yaml │ │ │ │ └── kmmlu_telecommunications_and_wireless_technology.yaml │ │ │ ├── direct/ │ │ │ │ ├── _direct_kmmlu_yaml │ │ │ │ ├── _kmmlu_direct.yaml │ │ │ │ ├── _kmmlu_direct_applied_science.yaml │ │ │ │ ├── _kmmlu_direct_humss.yaml │ │ │ │ ├── _kmmlu_direct_other.yaml │ │ │ │ ├── _kmmlu_direct_stem.yaml │ │ │ │ ├── kmmlu_direct_accounting.yaml │ │ │ │ ├── kmmlu_direct_agricultural_sciences.yaml │ │ │ │ ├── kmmlu_direct_aviation_engineering_and_maintenance.yaml │ │ │ │ ├── kmmlu_direct_biology.yaml │ │ │ │ ├── kmmlu_direct_chemical_engineering.yaml │ │ │ │ ├── kmmlu_direct_chemistry.yaml │ │ │ │ ├── kmmlu_direct_civil_engineering.yaml │ │ │ │ ├── kmmlu_direct_computer_science.yaml │ │ │ │ ├── kmmlu_direct_construction.yaml │ │ │ │ ├── kmmlu_direct_criminal_law.yaml │ │ │ │ ├── kmmlu_direct_ecology.yaml │ │ │ │ ├── kmmlu_direct_economics.yaml │ │ │ │ ├── kmmlu_direct_education.yaml │ │ │ │ ├── kmmlu_direct_electrical_engineering.yaml │ │ │ │ ├── kmmlu_direct_electronics_engineering.yaml │ │ │ │ ├── kmmlu_direct_energy_management.yaml │ │ │ │ ├── kmmlu_direct_environmental_science.yaml │ │ │ │ ├── kmmlu_direct_fashion.yaml │ │ │ │ ├── kmmlu_direct_food_processing.yaml │ │ │ │ ├── kmmlu_direct_gas_technology_and_engineering.yaml │ │ │ │ ├── kmmlu_direct_geomatics.yaml │ │ │ │ ├── kmmlu_direct_health.yaml │ │ │ │ ├── kmmlu_direct_industrial_engineer.yaml │ │ │ │ ├── kmmlu_direct_information_technology.yaml │ │ │ │ ├── kmmlu_direct_interior_architecture_and_design.yaml │ │ │ │ ├── kmmlu_direct_korean_history.yaml │ │ │ │ ├── kmmlu_direct_law.yaml │ │ │ │ ├── kmmlu_direct_machine_design_and_manufacturing.yaml │ │ │ │ ├── kmmlu_direct_management.yaml │ │ │ 
│ ├── kmmlu_direct_maritime_engineering.yaml │ │ │ │ ├── kmmlu_direct_marketing.yaml │ │ │ │ ├── kmmlu_direct_materials_engineering.yaml │ │ │ │ ├── kmmlu_direct_math.yaml │ │ │ │ ├── kmmlu_direct_mechanical_engineering.yaml │ │ │ │ ├── kmmlu_direct_nondestructive_testing.yaml │ │ │ │ ├── kmmlu_direct_patent.yaml │ │ │ │ ├── kmmlu_direct_political_science_and_sociology.yaml │ │ │ │ ├── kmmlu_direct_psychology.yaml │ │ │ │ ├── kmmlu_direct_public_safety.yaml │ │ │ │ ├── kmmlu_direct_railway_and_automotive_engineering.yaml │ │ │ │ ├── kmmlu_direct_real_estate.yaml │ │ │ │ ├── kmmlu_direct_refrigerating_machinery.yaml │ │ │ │ ├── kmmlu_direct_social_welfare.yaml │ │ │ │ ├── kmmlu_direct_taxation.yaml │ │ │ │ └── kmmlu_direct_telecommunications_and_wireless_technology.yaml │ │ │ ├── direct_hard/ │ │ │ │ ├── _direct_hard_kmmlu_yaml │ │ │ │ ├── _kmmlu_direct_hard.yaml │ │ │ │ ├── _kmmlu_direct_hard_applied_science.yaml │ │ │ │ ├── _kmmlu_direct_hard_humss.yaml │ │ │ │ ├── _kmmlu_direct_hard_other.yaml │ │ │ │ ├── _kmmlu_direct_hard_stem.yaml │ │ │ │ ├── kmmlu_direct_hard_accounting.yaml │ │ │ │ ├── kmmlu_direct_hard_agricultural_sciences.yaml │ │ │ │ ├── kmmlu_direct_hard_aviation_engineering_and_maintenance.yaml │ │ │ │ ├── kmmlu_direct_hard_biology.yaml │ │ │ │ ├── kmmlu_direct_hard_chemical_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_chemistry.yaml │ │ │ │ ├── kmmlu_direct_hard_civil_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_computer_science.yaml │ │ │ │ ├── kmmlu_direct_hard_construction.yaml │ │ │ │ ├── kmmlu_direct_hard_criminal_law.yaml │ │ │ │ ├── kmmlu_direct_hard_ecology.yaml │ │ │ │ ├── kmmlu_direct_hard_economics.yaml │ │ │ │ ├── kmmlu_direct_hard_education.yaml │ │ │ │ ├── kmmlu_direct_hard_electrical_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_electronics_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_energy_management.yaml │ │ │ │ ├── kmmlu_direct_hard_environmental_science.yaml │ │ │ │ ├── kmmlu_direct_hard_fashion.yaml │ │ │ │ ├── 
kmmlu_direct_hard_food_processing.yaml │ │ │ │ ├── kmmlu_direct_hard_gas_technology_and_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_geomatics.yaml │ │ │ │ ├── kmmlu_direct_hard_health.yaml │ │ │ │ ├── kmmlu_direct_hard_industrial_engineer.yaml │ │ │ │ ├── kmmlu_direct_hard_information_technology.yaml │ │ │ │ ├── kmmlu_direct_hard_interior_architecture_and_design.yaml │ │ │ │ ├── kmmlu_direct_hard_korean_history.yaml │ │ │ │ ├── kmmlu_direct_hard_law.yaml │ │ │ │ ├── kmmlu_direct_hard_machine_design_and_manufacturing.yaml │ │ │ │ ├── kmmlu_direct_hard_management.yaml │ │ │ │ ├── kmmlu_direct_hard_maritime_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_marketing.yaml │ │ │ │ ├── kmmlu_direct_hard_materials_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_math.yaml │ │ │ │ ├── kmmlu_direct_hard_mechanical_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_nondestructive_testing.yaml │ │ │ │ ├── kmmlu_direct_hard_patent.yaml │ │ │ │ ├── kmmlu_direct_hard_political_science_and_sociology.yaml │ │ │ │ ├── kmmlu_direct_hard_psychology.yaml │ │ │ │ ├── kmmlu_direct_hard_public_safety.yaml │ │ │ │ ├── kmmlu_direct_hard_railway_and_automotive_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_real_estate.yaml │ │ │ │ ├── kmmlu_direct_hard_refrigerating_machinery.yaml │ │ │ │ ├── kmmlu_direct_hard_social_welfare.yaml │ │ │ │ ├── kmmlu_direct_hard_taxation.yaml │ │ │ │ └── kmmlu_direct_hard_telecommunications_and_wireless_technology.yaml │ │ │ └── hard/ │ │ │ ├── _hard_kmmlu_yaml │ │ │ ├── _kmmlu_hard.yaml │ │ │ ├── _kmmlu_hard_applied_science.yaml │ │ │ ├── _kmmlu_hard_humss.yaml │ │ │ ├── _kmmlu_hard_other.yaml │ │ │ ├── _kmmlu_hard_stem.yaml │ │ │ ├── kmmlu_hard_accounting.yaml │ │ │ ├── kmmlu_hard_agricultural_sciences.yaml │ │ │ ├── kmmlu_hard_aviation_engineering_and_maintenance.yaml │ │ │ ├── kmmlu_hard_biology.yaml │ │ │ ├── kmmlu_hard_chemical_engineering.yaml │ │ │ ├── kmmlu_hard_chemistry.yaml │ │ │ ├── kmmlu_hard_civil_engineering.yaml │ │ │ ├── 
kmmlu_hard_computer_science.yaml │ │ │ ├── kmmlu_hard_construction.yaml │ │ │ ├── kmmlu_hard_criminal_law.yaml │ │ │ ├── kmmlu_hard_ecology.yaml │ │ │ ├── kmmlu_hard_economics.yaml │ │ │ ├── kmmlu_hard_education.yaml │ │ │ ├── kmmlu_hard_electrical_engineering.yaml │ │ │ ├── kmmlu_hard_electronics_engineering.yaml │ │ │ ├── kmmlu_hard_energy_management.yaml │ │ │ ├── kmmlu_hard_environmental_science.yaml │ │ │ ├── kmmlu_hard_fashion.yaml │ │ │ ├── kmmlu_hard_food_processing.yaml │ │ │ ├── kmmlu_hard_gas_technology_and_engineering.yaml │ │ │ ├── kmmlu_hard_geomatics.yaml │ │ │ ├── kmmlu_hard_health.yaml │ │ │ ├── kmmlu_hard_industrial_engineer.yaml │ │ │ ├── kmmlu_hard_information_technology.yaml │ │ │ ├── kmmlu_hard_interior_architecture_and_design.yaml │ │ │ ├── kmmlu_hard_korean_history.yaml │ │ │ ├── kmmlu_hard_law.yaml │ │ │ ├── kmmlu_hard_machine_design_and_manufacturing.yaml │ │ │ ├── kmmlu_hard_management.yaml │ │ │ ├── kmmlu_hard_maritime_engineering.yaml │ │ │ ├── kmmlu_hard_marketing.yaml │ │ │ ├── kmmlu_hard_materials_engineering.yaml │ │ │ ├── kmmlu_hard_math.yaml │ │ │ ├── kmmlu_hard_mechanical_engineering.yaml │ │ │ ├── kmmlu_hard_nondestructive_testing.yaml │ │ │ ├── kmmlu_hard_patent.yaml │ │ │ ├── kmmlu_hard_political_science_and_sociology.yaml │ │ │ ├── kmmlu_hard_psychology.yaml │ │ │ ├── kmmlu_hard_public_safety.yaml │ │ │ ├── kmmlu_hard_railway_and_automotive_engineering.yaml │ │ │ ├── kmmlu_hard_real_estate.yaml │ │ │ ├── kmmlu_hard_refrigerating_machinery.yaml │ │ │ ├── kmmlu_hard_social_welfare.yaml │ │ │ ├── kmmlu_hard_taxation.yaml │ │ │ └── kmmlu_hard_telecommunications_and_wireless_technology.yaml │ │ ├── kobest/ │ │ │ ├── README.md │ │ │ ├── _kobest.yaml │ │ │ ├── kobest_boolq.yaml │ │ │ ├── kobest_copa.yaml │ │ │ ├── kobest_hellaswag.yaml │ │ │ ├── kobest_sentineg.yaml │ │ │ ├── kobest_wic.yaml │ │ │ └── utils.py │ │ ├── kormedmcqa/ │ │ │ ├── README.md │ │ │ ├── _kormedmcqa.yaml │ │ │ ├── _template_yaml │ │ │ ├── dentist.yaml │ │ │ ├── 
doctor.yaml │ │ │ ├── nurse.yaml │ │ │ └── pharm.yaml │ │ ├── lambada/ │ │ │ ├── README.md │ │ │ ├── lambada_openai.yaml │ │ │ └── lambada_standard.yaml │ │ ├── lambada_cloze/ │ │ │ ├── README.md │ │ │ ├── lambada_openai_cloze.yaml │ │ │ └── lambada_standard_cloze.yaml │ │ ├── lambada_multilingual/ │ │ │ ├── README.md │ │ │ ├── lambada_mt_de.yaml │ │ │ ├── lambada_mt_en.yaml │ │ │ ├── lambada_mt_es.yaml │ │ │ ├── lambada_mt_fr.yaml │ │ │ └── lambada_mt_it.yaml │ │ ├── lambada_multilingual_stablelm/ │ │ │ ├── README.md │ │ │ ├── lambada_mt_stablelm_de.yaml │ │ │ ├── lambada_mt_stablelm_en.yaml │ │ │ ├── lambada_mt_stablelm_es.yaml │ │ │ ├── lambada_mt_stablelm_fr.yaml │ │ │ ├── lambada_mt_stablelm_it.yaml │ │ │ ├── lambada_mt_stablelm_nl.yaml │ │ │ └── lambada_mt_stablelm_pt.yaml │ │ ├── leaderboard/ │ │ │ ├── README.md │ │ │ ├── bbh_mc/ │ │ │ │ ├── _fewshot_template_yaml │ │ │ │ ├── _leaderboard_bbh.yaml │ │ │ │ ├── boolean_expressions.yaml │ │ │ │ ├── causal_judgement.yaml │ │ │ │ ├── date_understanding.yaml │ │ │ │ ├── disambiguation_qa.yaml │ │ │ │ ├── formal_fallacies.yaml │ │ │ │ ├── geometric_shapes.yaml │ │ │ │ ├── hyperbaton.yaml │ │ │ │ ├── logical_deduction_five_objects.yaml │ │ │ │ ├── logical_deduction_seven_objects.yaml │ │ │ │ ├── logical_deduction_three_objects.yaml │ │ │ │ ├── movie_recommendation.yaml │ │ │ │ ├── navigate.yaml │ │ │ │ ├── object_counting.yaml │ │ │ │ ├── penguins_in_a_table.yaml │ │ │ │ ├── reasoning_about_colored_objects.yaml │ │ │ │ ├── ruin_names.yaml │ │ │ │ ├── salient_translation_error_detection.yaml │ │ │ │ ├── snarks.yaml │ │ │ │ ├── sports_understanding.yaml │ │ │ │ ├── temporal_sequences.yaml │ │ │ │ ├── tracking_shuffled_objects_five_objects.yaml │ │ │ │ ├── tracking_shuffled_objects_seven_objects.yaml │ │ │ │ ├── tracking_shuffled_objects_three_objects.yaml │ │ │ │ └── web_of_lies.yaml │ │ │ ├── gpqa/ │ │ │ │ ├── _leaderboard_gpqa.yaml │ │ │ │ ├── _template_yaml │ │ │ │ ├── gpqa_diamond_zeroshot.yaml │ │ │ │ ├── 
gpqa_extended_zeroshot.yaml │ │ │ │ ├── gpqa_main_zeroshot.yaml │ │ │ │ └── utils.py │ │ │ ├── ifeval/ │ │ │ │ ├── _leaderboard_instruction_following.yaml │ │ │ │ ├── ifeval.yaml │ │ │ │ ├── instructions.py │ │ │ │ ├── instructions_registry.py │ │ │ │ ├── instructions_util.py │ │ │ │ └── utils.py │ │ │ ├── leaderboard.yaml │ │ │ ├── math/ │ │ │ │ ├── _leaderboard_math.yaml │ │ │ │ ├── _template_yaml │ │ │ │ ├── math_algebra.yaml │ │ │ │ ├── math_counting_and_prob.yaml │ │ │ │ ├── math_geometry.yaml │ │ │ │ ├── math_intermediate_algebra.yaml │ │ │ │ ├── math_num_theory.yaml │ │ │ │ ├── math_prealgebra.yaml │ │ │ │ ├── math_precalculus.yaml │ │ │ │ └── utils.py │ │ │ ├── mmlu_pro/ │ │ │ │ ├── mmlu_pro.yaml │ │ │ │ └── utils.py │ │ │ └── musr/ │ │ │ ├── _musr.yaml │ │ │ ├── _template_yaml │ │ │ ├── musr_murder_mysteries.yaml │ │ │ ├── musr_object_placements.yaml │ │ │ ├── musr_team_allocation.yaml │ │ │ └── utils.py │ │ ├── libra/ │ │ │ ├── README.md │ │ │ ├── _complex_reasoning_and_mathematical_problems.yaml │ │ │ ├── _multi_hop_question_answering.yaml │ │ │ ├── _question_answering_and_multiple_choice.yaml │ │ │ ├── _simple_information_retrieval.yaml │ │ │ ├── _template_yaml │ │ │ ├── librusec_history.yaml │ │ │ ├── librusec_mhqa.yaml │ │ │ ├── long_context_multiq.yaml │ │ │ ├── matreshka_names.yaml │ │ │ ├── matreshka_yes_no.yaml │ │ │ ├── passkey.yaml │ │ │ ├── passkey_with_librusec.yaml │ │ │ ├── ru_2wikimultihopqa.yaml │ │ │ ├── ru_babilong_qa1.yaml │ │ │ ├── ru_babilong_qa2.yaml │ │ │ ├── ru_babilong_qa3.yaml │ │ │ ├── ru_babilong_qa4.yaml │ │ │ ├── ru_babilong_qa5.yaml │ │ │ ├── ru_gsm100.yaml │ │ │ ├── ru_qasper.yaml │ │ │ ├── ru_quality.yaml │ │ │ ├── ru_sci_abstract_retrieval.yaml │ │ │ ├── ru_sci_passage_count.yaml │ │ │ └── utils.py │ │ ├── lingoly/ │ │ │ ├── README.md │ │ │ ├── lingoly_context.yaml │ │ │ ├── lingoly_group.yaml │ │ │ ├── lingoly_nocontext.yaml │ │ │ ├── script.py │ │ │ └── utils.py │ │ ├── llama3/ │ │ │ ├── README.md │ │ │ └── instruct/ │ 
│ │ ├── arc_challenge/ │ │ │ │ ├── arc_challenge_llama.yaml │ │ │ │ └── utils.py │ │ │ ├── gsm8k/ │ │ │ │ └── gsm8k.yaml │ │ │ ├── mmlu/ │ │ │ │ ├── _continuation_template_yaml │ │ │ │ ├── _mmlu_humanities.yaml │ │ │ │ ├── _mmlu_other.yaml │ │ │ │ ├── _mmlu_social_sciences.yaml │ │ │ │ ├── _mmlu_stem.yaml │ │ │ │ ├── llama.yaml │ │ │ │ ├── mmlu_abstract_algebra.yaml │ │ │ │ ├── mmlu_anatomy.yaml │ │ │ │ ├── mmlu_astronomy.yaml │ │ │ │ ├── mmlu_business_ethics.yaml │ │ │ │ ├── mmlu_clinical_knowledge.yaml │ │ │ │ ├── mmlu_college_biology.yaml │ │ │ │ ├── mmlu_college_chemistry.yaml │ │ │ │ ├── mmlu_college_computer_science.yaml │ │ │ │ ├── mmlu_college_mathematics.yaml │ │ │ │ ├── mmlu_college_medicine.yaml │ │ │ │ ├── mmlu_college_physics.yaml │ │ │ │ ├── mmlu_computer_security.yaml │ │ │ │ ├── mmlu_conceptual_physics.yaml │ │ │ │ ├── mmlu_econometrics.yaml │ │ │ │ ├── mmlu_electrical_engineering.yaml │ │ │ │ ├── mmlu_elementary_mathematics.yaml │ │ │ │ ├── mmlu_formal_logic.yaml │ │ │ │ ├── mmlu_global_facts.yaml │ │ │ │ ├── mmlu_high_school_biology.yaml │ │ │ │ ├── mmlu_high_school_chemistry.yaml │ │ │ │ ├── mmlu_high_school_computer_science.yaml │ │ │ │ ├── mmlu_high_school_european_history.yaml │ │ │ │ ├── mmlu_high_school_geography.yaml │ │ │ │ ├── mmlu_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_high_school_mathematics.yaml │ │ │ │ ├── mmlu_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_high_school_physics.yaml │ │ │ │ ├── mmlu_high_school_psychology.yaml │ │ │ │ ├── mmlu_high_school_statistics.yaml │ │ │ │ ├── mmlu_high_school_us_history.yaml │ │ │ │ ├── mmlu_high_school_world_history.yaml │ │ │ │ ├── mmlu_human_aging.yaml │ │ │ │ ├── mmlu_human_sexuality.yaml │ │ │ │ ├── mmlu_international_law.yaml │ │ │ │ ├── mmlu_jurisprudence.yaml │ │ │ │ ├── mmlu_logical_fallacies.yaml │ │ │ │ ├── mmlu_machine_learning.yaml │ │ │ │ ├── mmlu_management.yaml │ │ │ │ ├── mmlu_marketing.yaml │ │ │ │ ├── 
mmlu_medical_genetics.yaml │ │ │ │ ├── mmlu_miscellaneous.yaml │ │ │ │ ├── mmlu_moral_disputes.yaml │ │ │ │ ├── mmlu_moral_scenarios.yaml │ │ │ │ ├── mmlu_nutrition.yaml │ │ │ │ ├── mmlu_philosophy.yaml │ │ │ │ ├── mmlu_prehistory.yaml │ │ │ │ ├── mmlu_professional_accounting.yaml │ │ │ │ ├── mmlu_professional_law.yaml │ │ │ │ ├── mmlu_professional_medicine.yaml │ │ │ │ ├── mmlu_professional_psychology.yaml │ │ │ │ ├── mmlu_public_relations.yaml │ │ │ │ ├── mmlu_security_studies.yaml │ │ │ │ ├── mmlu_sociology.yaml │ │ │ │ ├── mmlu_us_foreign_policy.yaml │ │ │ │ ├── mmlu_virology.yaml │ │ │ │ └── mmlu_world_religions.yaml │ │ │ ├── mmlu_cot/ │ │ │ │ ├── _mmlu_cot_llama.yaml │ │ │ │ ├── _mmlu_cot_llama_template_yaml │ │ │ │ ├── _mmlu_humanities.yaml │ │ │ │ ├── _mmlu_other.yaml │ │ │ │ ├── _mmlu_social_sciences.yaml │ │ │ │ ├── _mmlu_stem.yaml │ │ │ │ ├── mmlu_abstract_algebra.yaml │ │ │ │ ├── mmlu_anatomy.yaml │ │ │ │ ├── mmlu_astronomy.yaml │ │ │ │ ├── mmlu_business_ethics.yaml │ │ │ │ ├── mmlu_clinical_knowledge.yaml │ │ │ │ ├── mmlu_college_biology.yaml │ │ │ │ ├── mmlu_college_chemistry.yaml │ │ │ │ ├── mmlu_college_computer_science.yaml │ │ │ │ ├── mmlu_college_mathematics.yaml │ │ │ │ ├── mmlu_college_medicine.yaml │ │ │ │ ├── mmlu_college_physics.yaml │ │ │ │ ├── mmlu_computer_security.yaml │ │ │ │ ├── mmlu_conceptual_physics.yaml │ │ │ │ ├── mmlu_econometrics.yaml │ │ │ │ ├── mmlu_electrical_engineering.yaml │ │ │ │ ├── mmlu_elementary_mathematics.yaml │ │ │ │ ├── mmlu_formal_logic.yaml │ │ │ │ ├── mmlu_global_facts.yaml │ │ │ │ ├── mmlu_high_school_biology.yaml │ │ │ │ ├── mmlu_high_school_chemistry.yaml │ │ │ │ ├── mmlu_high_school_computer_science.yaml │ │ │ │ ├── mmlu_high_school_european_history.yaml │ │ │ │ ├── mmlu_high_school_geography.yaml │ │ │ │ ├── mmlu_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_high_school_mathematics.yaml │ │ │ │ ├── mmlu_high_school_microeconomics.yaml │ │ │ │ 
├── mmlu_high_school_physics.yaml │ │ │ │ ├── mmlu_high_school_psychology.yaml │ │ │ │ ├── mmlu_high_school_statistics.yaml │ │ │ │ ├── mmlu_high_school_us_history.yaml │ │ │ │ ├── mmlu_high_school_world_history.yaml │ │ │ │ ├── mmlu_human_aging.yaml │ │ │ │ ├── mmlu_human_sexuality.yaml │ │ │ │ ├── mmlu_international_law.yaml │ │ │ │ ├── mmlu_jurisprudence.yaml │ │ │ │ ├── mmlu_logical_fallacies.yaml │ │ │ │ ├── mmlu_machine_learning.yaml │ │ │ │ ├── mmlu_management.yaml │ │ │ │ ├── mmlu_marketing.yaml │ │ │ │ ├── mmlu_medical_genetics.yaml │ │ │ │ ├── mmlu_miscellaneous.yaml │ │ │ │ ├── mmlu_moral_disputes.yaml │ │ │ │ ├── mmlu_moral_scenarios.yaml │ │ │ │ ├── mmlu_nutrition.yaml │ │ │ │ ├── mmlu_philosophy.yaml │ │ │ │ ├── mmlu_prehistory.yaml │ │ │ │ ├── mmlu_professional_accounting.yaml │ │ │ │ ├── mmlu_professional_law.yaml │ │ │ │ ├── mmlu_professional_medicine.yaml │ │ │ │ ├── mmlu_professional_psychology.yaml │ │ │ │ ├── mmlu_public_relations.yaml │ │ │ │ ├── mmlu_security_studies.yaml │ │ │ │ ├── mmlu_sociology.yaml │ │ │ │ ├── mmlu_us_foreign_policy.yaml │ │ │ │ ├── mmlu_virology.yaml │ │ │ │ └── mmlu_world_religions.yaml │ │ │ ├── mmlu_de/ │ │ │ │ ├── _continuation_template_yaml │ │ │ │ ├── _mmlu_de_humanities.yaml │ │ │ │ ├── _mmlu_de_llama.yaml │ │ │ │ ├── _mmlu_de_other.yaml │ │ │ │ ├── _mmlu_de_social_sciences.yaml │ │ │ │ ├── _mmlu_de_stem.yaml │ │ │ │ ├── mmlu_de_abstract_algebra.yaml │ │ │ │ ├── mmlu_de_anatomy.yaml │ │ │ │ ├── mmlu_de_astronomy.yaml │ │ │ │ ├── mmlu_de_business_ethics.yaml │ │ │ │ ├── mmlu_de_clinical_knowledge.yaml │ │ │ │ ├── mmlu_de_college_biology.yaml │ │ │ │ ├── mmlu_de_college_chemistry.yaml │ │ │ │ ├── mmlu_de_college_computer_science.yaml │ │ │ │ ├── mmlu_de_college_mathematics.yaml │ │ │ │ ├── mmlu_de_college_medicine.yaml │ │ │ │ ├── mmlu_de_college_physics.yaml │ │ │ │ ├── mmlu_de_computer_security.yaml │ │ │ │ ├── mmlu_de_conceptual_physics.yaml │ │ │ │ ├── mmlu_de_econometrics.yaml │ │ │ │ ├── 
mmlu_de_electrical_engineering.yaml │ │ │ │ ├── mmlu_de_elementary_mathematics.yaml │ │ │ │ ├── mmlu_de_formal_logic.yaml │ │ │ │ ├── mmlu_de_global_facts.yaml │ │ │ │ ├── mmlu_de_high_school_biology.yaml │ │ │ │ ├── mmlu_de_high_school_chemistry.yaml │ │ │ │ ├── mmlu_de_high_school_computer_science.yaml │ │ │ │ ├── mmlu_de_high_school_european_history.yaml │ │ │ │ ├── mmlu_de_high_school_geography.yaml │ │ │ │ ├── mmlu_de_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_de_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_de_high_school_mathematics.yaml │ │ │ │ ├── mmlu_de_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_de_high_school_physics.yaml │ │ │ │ ├── mmlu_de_high_school_psychology.yaml │ │ │ │ ├── mmlu_de_high_school_statistics.yaml │ │ │ │ ├── mmlu_de_high_school_us_history.yaml │ │ │ │ ├── mmlu_de_high_school_world_history.yaml │ │ │ │ ├── mmlu_de_human_aging.yaml │ │ │ │ ├── mmlu_de_human_sexuality.yaml │ │ │ │ ├── mmlu_de_international_law.yaml │ │ │ │ ├── mmlu_de_jurisprudence.yaml │ │ │ │ ├── mmlu_de_logical_fallacies.yaml │ │ │ │ ├── mmlu_de_machine_learning.yaml │ │ │ │ ├── mmlu_de_management.yaml │ │ │ │ ├── mmlu_de_marketing.yaml │ │ │ │ ├── mmlu_de_medical_genetics.yaml │ │ │ │ ├── mmlu_de_miscellaneous.yaml │ │ │ │ ├── mmlu_de_moral_disputes.yaml │ │ │ │ ├── mmlu_de_moral_scenarios.yaml │ │ │ │ ├── mmlu_de_nutrition.yaml │ │ │ │ ├── mmlu_de_philosophy.yaml │ │ │ │ ├── mmlu_de_prehistory.yaml │ │ │ │ ├── mmlu_de_professional_accounting.yaml │ │ │ │ ├── mmlu_de_professional_law.yaml │ │ │ │ ├── mmlu_de_professional_medicine.yaml │ │ │ │ ├── mmlu_de_professional_psychology.yaml │ │ │ │ ├── mmlu_de_public_relations.yaml │ │ │ │ ├── mmlu_de_security_studies.yaml │ │ │ │ ├── mmlu_de_sociology.yaml │ │ │ │ ├── mmlu_de_us_foreign_policy.yaml │ │ │ │ ├── mmlu_de_virology.yaml │ │ │ │ ├── mmlu_de_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── mmlu_es/ │ │ │ │ ├── _continuation_template_yaml │ │ │ │ ├── _mmlu_es_humanities.yaml │ │ │ │ ├── 
_mmlu_es_llama.yaml │ │ │ │ ├── _mmlu_es_other.yaml │ │ │ │ ├── _mmlu_es_social_sciences.yaml │ │ │ │ ├── _mmlu_es_stem.yaml │ │ │ │ ├── mmlu_es_abstract_algebra.yaml │ │ │ │ ├── mmlu_es_anatomy.yaml │ │ │ │ ├── mmlu_es_astronomy.yaml │ │ │ │ ├── mmlu_es_business_ethics.yaml │ │ │ │ ├── mmlu_es_clinical_knowledge.yaml │ │ │ │ ├── mmlu_es_college_biology.yaml │ │ │ │ ├── mmlu_es_college_chemistry.yaml │ │ │ │ ├── mmlu_es_college_computer_science.yaml │ │ │ │ ├── mmlu_es_college_mathematics.yaml │ │ │ │ ├── mmlu_es_college_medicine.yaml │ │ │ │ ├── mmlu_es_college_physics.yaml │ │ │ │ ├── mmlu_es_computer_security.yaml │ │ │ │ ├── mmlu_es_conceptual_physics.yaml │ │ │ │ ├── mmlu_es_econometrics.yaml │ │ │ │ ├── mmlu_es_electrical_engineering.yaml │ │ │ │ ├── mmlu_es_elementary_mathematics.yaml │ │ │ │ ├── mmlu_es_formal_logic.yaml │ │ │ │ ├── mmlu_es_global_facts.yaml │ │ │ │ ├── mmlu_es_high_school_biology.yaml │ │ │ │ ├── mmlu_es_high_school_chemistry.yaml │ │ │ │ ├── mmlu_es_high_school_computer_science.yaml │ │ │ │ ├── mmlu_es_high_school_european_history.yaml │ │ │ │ ├── mmlu_es_high_school_geography.yaml │ │ │ │ ├── mmlu_es_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_es_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_es_high_school_mathematics.yaml │ │ │ │ ├── mmlu_es_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_es_high_school_physics.yaml │ │ │ │ ├── mmlu_es_high_school_psychology.yaml │ │ │ │ ├── mmlu_es_high_school_statistics.yaml │ │ │ │ ├── mmlu_es_high_school_us_history.yaml │ │ │ │ ├── mmlu_es_high_school_world_history.yaml │ │ │ │ ├── mmlu_es_human_aging.yaml │ │ │ │ ├── mmlu_es_human_sexuality.yaml │ │ │ │ ├── mmlu_es_international_law.yaml │ │ │ │ ├── mmlu_es_jurisprudence.yaml │ │ │ │ ├── mmlu_es_logical_fallacies.yaml │ │ │ │ ├── mmlu_es_machine_learning.yaml │ │ │ │ ├── mmlu_es_management.yaml │ │ │ │ ├── mmlu_es_marketing.yaml │ │ │ │ ├── mmlu_es_medical_genetics.yaml │ │ │ │ ├── mmlu_es_miscellaneous.yaml │ │ │ │ ├── 
mmlu_es_moral_disputes.yaml │ │ │ │ ├── mmlu_es_moral_scenarios.yaml │ │ │ │ ├── mmlu_es_nutrition.yaml │ │ │ │ ├── mmlu_es_philosophy.yaml │ │ │ │ ├── mmlu_es_prehistory.yaml │ │ │ │ ├── mmlu_es_professional_accounting.yaml │ │ │ │ ├── mmlu_es_professional_law.yaml │ │ │ │ ├── mmlu_es_professional_medicine.yaml │ │ │ │ ├── mmlu_es_professional_psychology.yaml │ │ │ │ ├── mmlu_es_public_relations.yaml │ │ │ │ ├── mmlu_es_security_studies.yaml │ │ │ │ ├── mmlu_es_sociology.yaml │ │ │ │ ├── mmlu_es_us_foreign_policy.yaml │ │ │ │ ├── mmlu_es_virology.yaml │ │ │ │ ├── mmlu_es_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── mmlu_fr/ │ │ │ │ ├── _continuation_template_yaml │ │ │ │ ├── _mmlu_fr_humanities.yaml │ │ │ │ ├── _mmlu_fr_llama.yaml │ │ │ │ ├── _mmlu_fr_other.yaml │ │ │ │ ├── _mmlu_fr_social_sciences.yaml │ │ │ │ ├── _mmlu_fr_stem.yaml │ │ │ │ ├── mmlu_fr_abstract_algebra.yaml │ │ │ │ ├── mmlu_fr_anatomy.yaml │ │ │ │ ├── mmlu_fr_astronomy.yaml │ │ │ │ ├── mmlu_fr_business_ethics.yaml │ │ │ │ ├── mmlu_fr_clinical_knowledge.yaml │ │ │ │ ├── mmlu_fr_college_biology.yaml │ │ │ │ ├── mmlu_fr_college_chemistry.yaml │ │ │ │ ├── mmlu_fr_college_computer_science.yaml │ │ │ │ ├── mmlu_fr_college_mathematics.yaml │ │ │ │ ├── mmlu_fr_college_medicine.yaml │ │ │ │ ├── mmlu_fr_college_physics.yaml │ │ │ │ ├── mmlu_fr_computer_security.yaml │ │ │ │ ├── mmlu_fr_conceptual_physics.yaml │ │ │ │ ├── mmlu_fr_econometrics.yaml │ │ │ │ ├── mmlu_fr_electrical_engineering.yaml │ │ │ │ ├── mmlu_fr_elementary_mathematics.yaml │ │ │ │ ├── mmlu_fr_formal_logic.yaml │ │ │ │ ├── mmlu_fr_global_facts.yaml │ │ │ │ ├── mmlu_fr_high_school_biology.yaml │ │ │ │ ├── mmlu_fr_high_school_chemistry.yaml │ │ │ │ ├── mmlu_fr_high_school_computer_science.yaml │ │ │ │ ├── mmlu_fr_high_school_european_history.yaml │ │ │ │ ├── mmlu_fr_high_school_geography.yaml │ │ │ │ ├── mmlu_fr_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_fr_high_school_macroeconomics.yaml │ │ │ │ ├── 
mmlu_fr_high_school_mathematics.yaml │ │ │ │ ├── mmlu_fr_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_fr_high_school_physics.yaml │ │ │ │ ├── mmlu_fr_high_school_psychology.yaml │ │ │ │ ├── mmlu_fr_high_school_statistics.yaml │ │ │ │ ├── mmlu_fr_high_school_us_history.yaml │ │ │ │ ├── mmlu_fr_high_school_world_history.yaml │ │ │ │ ├── mmlu_fr_human_aging.yaml │ │ │ │ ├── mmlu_fr_human_sexuality.yaml │ │ │ │ ├── mmlu_fr_international_law.yaml │ │ │ │ ├── mmlu_fr_jurisprudence.yaml │ │ │ │ ├── mmlu_fr_logical_fallacies.yaml │ │ │ │ ├── mmlu_fr_machine_learning.yaml │ │ │ │ ├── mmlu_fr_management.yaml │ │ │ │ ├── mmlu_fr_marketing.yaml │ │ │ │ ├── mmlu_fr_medical_genetics.yaml │ │ │ │ ├── mmlu_fr_miscellaneous.yaml │ │ │ │ ├── mmlu_fr_moral_disputes.yaml │ │ │ │ ├── mmlu_fr_moral_scenarios.yaml │ │ │ │ ├── mmlu_fr_nutrition.yaml │ │ │ │ ├── mmlu_fr_philosophy.yaml │ │ │ │ ├── mmlu_fr_prehistory.yaml │ │ │ │ ├── mmlu_fr_professional_accounting.yaml │ │ │ │ ├── mmlu_fr_professional_law.yaml │ │ │ │ ├── mmlu_fr_professional_medicine.yaml │ │ │ │ ├── mmlu_fr_professional_psychology.yaml │ │ │ │ ├── mmlu_fr_public_relations.yaml │ │ │ │ ├── mmlu_fr_security_studies.yaml │ │ │ │ ├── mmlu_fr_sociology.yaml │ │ │ │ ├── mmlu_fr_us_foreign_policy.yaml │ │ │ │ ├── mmlu_fr_virology.yaml │ │ │ │ ├── mmlu_fr_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── mmlu_hi/ │ │ │ │ ├── _continuation_template_yaml │ │ │ │ ├── _mmlu_hi_humanities.yaml │ │ │ │ ├── _mmlu_hi_llama.yaml │ │ │ │ ├── _mmlu_hi_other.yaml │ │ │ │ ├── _mmlu_hi_social_sciences.yaml │ │ │ │ ├── _mmlu_hi_stem.yaml │ │ │ │ ├── mmlu_hi_abstract_algebra.yaml │ │ │ │ ├── mmlu_hi_anatomy.yaml │ │ │ │ ├── mmlu_hi_astronomy.yaml │ │ │ │ ├── mmlu_hi_business_ethics.yaml │ │ │ │ ├── mmlu_hi_clinical_knowledge.yaml │ │ │ │ ├── mmlu_hi_college_biology.yaml │ │ │ │ ├── mmlu_hi_college_chemistry.yaml │ │ │ │ ├── mmlu_hi_college_computer_science.yaml │ │ │ │ ├── mmlu_hi_college_mathematics.yaml │ │ │ │ ├── 
mmlu_hi_college_medicine.yaml │ │ │ │ ├── mmlu_hi_college_physics.yaml │ │ │ │ ├── mmlu_hi_computer_security.yaml │ │ │ │ ├── mmlu_hi_conceptual_physics.yaml │ │ │ │ ├── mmlu_hi_econometrics.yaml │ │ │ │ ├── mmlu_hi_electrical_engineering.yaml │ │ │ │ ├── mmlu_hi_elementary_mathematics.yaml │ │ │ │ ├── mmlu_hi_formal_logic.yaml │ │ │ │ ├── mmlu_hi_global_facts.yaml │ │ │ │ ├── mmlu_hi_high_school_biology.yaml │ │ │ │ ├── mmlu_hi_high_school_chemistry.yaml │ │ │ │ ├── mmlu_hi_high_school_computer_science.yaml │ │ │ │ ├── mmlu_hi_high_school_european_history.yaml │ │ │ │ ├── mmlu_hi_high_school_geography.yaml │ │ │ │ ├── mmlu_hi_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_hi_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_hi_high_school_mathematics.yaml │ │ │ │ ├── mmlu_hi_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_hi_high_school_physics.yaml │ │ │ │ ├── mmlu_hi_high_school_psychology.yaml │ │ │ │ ├── mmlu_hi_high_school_statistics.yaml │ │ │ │ ├── mmlu_hi_high_school_us_history.yaml │ │ │ │ ├── mmlu_hi_high_school_world_history.yaml │ │ │ │ ├── mmlu_hi_human_aging.yaml │ │ │ │ ├── mmlu_hi_human_sexuality.yaml │ │ │ │ ├── mmlu_hi_international_law.yaml │ │ │ │ ├── mmlu_hi_jurisprudence.yaml │ │ │ │ ├── mmlu_hi_logical_fallacies.yaml │ │ │ │ ├── mmlu_hi_machine_learning.yaml │ │ │ │ ├── mmlu_hi_management.yaml │ │ │ │ ├── mmlu_hi_marketing.yaml │ │ │ │ ├── mmlu_hi_medical_genetics.yaml │ │ │ │ ├── mmlu_hi_miscellaneous.yaml │ │ │ │ ├── mmlu_hi_moral_disputes.yaml │ │ │ │ ├── mmlu_hi_moral_scenarios.yaml │ │ │ │ ├── mmlu_hi_nutrition.yaml │ │ │ │ ├── mmlu_hi_philosophy.yaml │ │ │ │ ├── mmlu_hi_prehistory.yaml │ │ │ │ ├── mmlu_hi_professional_accounting.yaml │ │ │ │ ├── mmlu_hi_professional_law.yaml │ │ │ │ ├── mmlu_hi_professional_medicine.yaml │ │ │ │ ├── mmlu_hi_professional_psychology.yaml │ │ │ │ ├── mmlu_hi_public_relations.yaml │ │ │ │ ├── mmlu_hi_security_studies.yaml │ │ │ │ ├── mmlu_hi_sociology.yaml │ │ │ │ ├── 
mmlu_hi_us_foreign_policy.yaml │ │ │ │ ├── mmlu_hi_virology.yaml │ │ │ │ ├── mmlu_hi_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── mmlu_it/ │ │ │ │ ├── _continuation_template_yaml │ │ │ │ ├── _mmlu_it_humanities.yaml │ │ │ │ ├── _mmlu_it_llama.yaml │ │ │ │ ├── _mmlu_it_other.yaml │ │ │ │ ├── _mmlu_it_social_sciences.yaml │ │ │ │ ├── _mmlu_it_stem.yaml │ │ │ │ ├── mmlu_it_abstract_algebra.yaml │ │ │ │ ├── mmlu_it_anatomy.yaml │ │ │ │ ├── mmlu_it_astronomy.yaml │ │ │ │ ├── mmlu_it_business_ethics.yaml │ │ │ │ ├── mmlu_it_clinical_knowledge.yaml │ │ │ │ ├── mmlu_it_college_biology.yaml │ │ │ │ ├── mmlu_it_college_chemistry.yaml │ │ │ │ ├── mmlu_it_college_computer_science.yaml │ │ │ │ ├── mmlu_it_college_mathematics.yaml │ │ │ │ ├── mmlu_it_college_medicine.yaml │ │ │ │ ├── mmlu_it_college_physics.yaml │ │ │ │ ├── mmlu_it_computer_security.yaml │ │ │ │ ├── mmlu_it_conceptual_physics.yaml │ │ │ │ ├── mmlu_it_econometrics.yaml │ │ │ │ ├── mmlu_it_electrical_engineering.yaml │ │ │ │ ├── mmlu_it_elementary_mathematics.yaml │ │ │ │ ├── mmlu_it_formal_logic.yaml │ │ │ │ ├── mmlu_it_global_facts.yaml │ │ │ │ ├── mmlu_it_high_school_biology.yaml │ │ │ │ ├── mmlu_it_high_school_chemistry.yaml │ │ │ │ ├── mmlu_it_high_school_computer_science.yaml │ │ │ │ ├── mmlu_it_high_school_european_history.yaml │ │ │ │ ├── mmlu_it_high_school_geography.yaml │ │ │ │ ├── mmlu_it_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_it_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_it_high_school_mathematics.yaml │ │ │ │ ├── mmlu_it_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_it_high_school_physics.yaml │ │ │ │ ├── mmlu_it_high_school_psychology.yaml │ │ │ │ ├── mmlu_it_high_school_statistics.yaml │ │ │ │ ├── mmlu_it_high_school_us_history.yaml │ │ │ │ ├── mmlu_it_high_school_world_history.yaml │ │ │ │ ├── mmlu_it_human_aging.yaml │ │ │ │ ├── mmlu_it_human_sexuality.yaml │ │ │ │ ├── mmlu_it_international_law.yaml │ │ │ │ ├── mmlu_it_jurisprudence.yaml │ │ │ │ ├── 
mmlu_it_logical_fallacies.yaml │ │ │ │ ├── mmlu_it_machine_learning.yaml │ │ │ │ ├── mmlu_it_management.yaml │ │ │ │ ├── mmlu_it_marketing.yaml │ │ │ │ ├── mmlu_it_medical_genetics.yaml │ │ │ │ ├── mmlu_it_miscellaneous.yaml │ │ │ │ ├── mmlu_it_moral_disputes.yaml │ │ │ │ ├── mmlu_it_moral_scenarios.yaml │ │ │ │ ├── mmlu_it_nutrition.yaml │ │ │ │ ├── mmlu_it_philosophy.yaml │ │ │ │ ├── mmlu_it_prehistory.yaml │ │ │ │ ├── mmlu_it_professional_accounting.yaml │ │ │ │ ├── mmlu_it_professional_law.yaml │ │ │ │ ├── mmlu_it_professional_medicine.yaml │ │ │ │ ├── mmlu_it_professional_psychology.yaml │ │ │ │ ├── mmlu_it_public_relations.yaml │ │ │ │ ├── mmlu_it_security_studies.yaml │ │ │ │ ├── mmlu_it_sociology.yaml │ │ │ │ ├── mmlu_it_us_foreign_policy.yaml │ │ │ │ ├── mmlu_it_virology.yaml │ │ │ │ ├── mmlu_it_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── mmlu_pro/ │ │ │ │ ├── _default_template_yaml │ │ │ │ ├── _mmlu_pro.yaml │ │ │ │ ├── mmlu_pro_biology.yaml │ │ │ │ ├── mmlu_pro_business.yaml │ │ │ │ ├── mmlu_pro_chemistry.yaml │ │ │ │ ├── mmlu_pro_computer_science.yaml │ │ │ │ ├── mmlu_pro_economics.yaml │ │ │ │ ├── mmlu_pro_engineering.yaml │ │ │ │ ├── mmlu_pro_health.yaml │ │ │ │ ├── mmlu_pro_history.yaml │ │ │ │ ├── mmlu_pro_law.yaml │ │ │ │ ├── mmlu_pro_math.yaml │ │ │ │ ├── mmlu_pro_other.yaml │ │ │ │ ├── mmlu_pro_philosophy.yaml │ │ │ │ ├── mmlu_pro_physics.yaml │ │ │ │ ├── mmlu_pro_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── mmlu_pt/ │ │ │ │ ├── _continuation_template_yaml │ │ │ │ ├── _mmlu_pt_humanities.yaml │ │ │ │ ├── _mmlu_pt_llama.yaml │ │ │ │ ├── _mmlu_pt_other.yaml │ │ │ │ ├── _mmlu_pt_social_sciences.yaml │ │ │ │ ├── _mmlu_pt_stem.yaml │ │ │ │ ├── mmlu_pt_abstract_algebra.yaml │ │ │ │ ├── mmlu_pt_anatomy.yaml │ │ │ │ ├── mmlu_pt_astronomy.yaml │ │ │ │ ├── mmlu_pt_business_ethics.yaml │ │ │ │ ├── mmlu_pt_clinical_knowledge.yaml │ │ │ │ ├── mmlu_pt_college_biology.yaml │ │ │ │ ├── mmlu_pt_college_chemistry.yaml │ │ │ │ ├── 
mmlu_pt_college_computer_science.yaml │ │ │ │ ├── mmlu_pt_college_mathematics.yaml │ │ │ │ ├── mmlu_pt_college_medicine.yaml │ │ │ │ ├── mmlu_pt_college_physics.yaml │ │ │ │ ├── mmlu_pt_computer_security.yaml │ │ │ │ ├── mmlu_pt_conceptual_physics.yaml │ │ │ │ ├── mmlu_pt_econometrics.yaml │ │ │ │ ├── mmlu_pt_electrical_engineering.yaml │ │ │ │ ├── mmlu_pt_elementary_mathematics.yaml │ │ │ │ ├── mmlu_pt_formal_logic.yaml │ │ │ │ ├── mmlu_pt_global_facts.yaml │ │ │ │ ├── mmlu_pt_high_school_biology.yaml │ │ │ │ ├── mmlu_pt_high_school_chemistry.yaml │ │ │ │ ├── mmlu_pt_high_school_computer_science.yaml │ │ │ │ ├── mmlu_pt_high_school_european_history.yaml │ │ │ │ ├── mmlu_pt_high_school_geography.yaml │ │ │ │ ├── mmlu_pt_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_pt_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_pt_high_school_mathematics.yaml │ │ │ │ ├── mmlu_pt_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_pt_high_school_physics.yaml │ │ │ │ ├── mmlu_pt_high_school_psychology.yaml │ │ │ │ ├── mmlu_pt_high_school_statistics.yaml │ │ │ │ ├── mmlu_pt_high_school_us_history.yaml │ │ │ │ ├── mmlu_pt_high_school_world_history.yaml │ │ │ │ ├── mmlu_pt_human_aging.yaml │ │ │ │ ├── mmlu_pt_human_sexuality.yaml │ │ │ │ ├── mmlu_pt_international_law.yaml │ │ │ │ ├── mmlu_pt_jurisprudence.yaml │ │ │ │ ├── mmlu_pt_logical_fallacies.yaml │ │ │ │ ├── mmlu_pt_machine_learning.yaml │ │ │ │ ├── mmlu_pt_management.yaml │ │ │ │ ├── mmlu_pt_marketing.yaml │ │ │ │ ├── mmlu_pt_medical_genetics.yaml │ │ │ │ ├── mmlu_pt_miscellaneous.yaml │ │ │ │ ├── mmlu_pt_moral_disputes.yaml │ │ │ │ ├── mmlu_pt_moral_scenarios.yaml │ │ │ │ ├── mmlu_pt_nutrition.yaml │ │ │ │ ├── mmlu_pt_philosophy.yaml │ │ │ │ ├── mmlu_pt_prehistory.yaml │ │ │ │ ├── mmlu_pt_professional_accounting.yaml │ │ │ │ ├── mmlu_pt_professional_law.yaml │ │ │ │ ├── mmlu_pt_professional_medicine.yaml │ │ │ │ ├── mmlu_pt_professional_psychology.yaml │ │ │ │ ├── mmlu_pt_public_relations.yaml │ │ │ │ ├── 
mmlu_pt_security_studies.yaml │ │ │ │ ├── mmlu_pt_sociology.yaml │ │ │ │ ├── mmlu_pt_us_foreign_policy.yaml │ │ │ │ ├── mmlu_pt_virology.yaml │ │ │ │ ├── mmlu_pt_world_religions.yaml │ │ │ │ └── utils.py │ │ │ └── mmlu_th/ │ │ │ ├── _continuation_template_yaml │ │ │ ├── _mmlu_th_humanities.yaml │ │ │ ├── _mmlu_th_llama.yaml │ │ │ ├── _mmlu_th_other.yaml │ │ │ ├── _mmlu_th_social_sciences.yaml │ │ │ ├── _mmlu_th_stem.yaml │ │ │ ├── mmlu_th_abstract_algebra.yaml │ │ │ ├── mmlu_th_anatomy.yaml │ │ │ ├── mmlu_th_astronomy.yaml │ │ │ ├── mmlu_th_business_ethics.yaml │ │ │ ├── mmlu_th_clinical_knowledge.yaml │ │ │ ├── mmlu_th_college_biology.yaml │ │ │ ├── mmlu_th_college_chemistry.yaml │ │ │ ├── mmlu_th_college_computer_science.yaml │ │ │ ├── mmlu_th_college_mathematics.yaml │ │ │ ├── mmlu_th_college_medicine.yaml │ │ │ ├── mmlu_th_college_physics.yaml │ │ │ ├── mmlu_th_computer_security.yaml │ │ │ ├── mmlu_th_conceptual_physics.yaml │ │ │ ├── mmlu_th_econometrics.yaml │ │ │ ├── mmlu_th_electrical_engineering.yaml │ │ │ ├── mmlu_th_elementary_mathematics.yaml │ │ │ ├── mmlu_th_formal_logic.yaml │ │ │ ├── mmlu_th_global_facts.yaml │ │ │ ├── mmlu_th_high_school_biology.yaml │ │ │ ├── mmlu_th_high_school_chemistry.yaml │ │ │ ├── mmlu_th_high_school_computer_science.yaml │ │ │ ├── mmlu_th_high_school_european_history.yaml │ │ │ ├── mmlu_th_high_school_geography.yaml │ │ │ ├── mmlu_th_high_school_government_and_politics.yaml │ │ │ ├── mmlu_th_high_school_macroeconomics.yaml │ │ │ ├── mmlu_th_high_school_mathematics.yaml │ │ │ ├── mmlu_th_high_school_microeconomics.yaml │ │ │ ├── mmlu_th_high_school_physics.yaml │ │ │ ├── mmlu_th_high_school_psychology.yaml │ │ │ ├── mmlu_th_high_school_statistics.yaml │ │ │ ├── mmlu_th_high_school_us_history.yaml │ │ │ ├── mmlu_th_high_school_world_history.yaml │ │ │ ├── mmlu_th_human_aging.yaml │ │ │ ├── mmlu_th_human_sexuality.yaml │ │ │ ├── mmlu_th_international_law.yaml │ │ │ ├── mmlu_th_jurisprudence.yaml │ │ │ ├── 
mmlu_th_logical_fallacies.yaml │ │ │ ├── mmlu_th_machine_learning.yaml │ │ │ ├── mmlu_th_management.yaml │ │ │ ├── mmlu_th_marketing.yaml │ │ │ ├── mmlu_th_medical_genetics.yaml │ │ │ ├── mmlu_th_miscellaneous.yaml │ │ │ ├── mmlu_th_moral_disputes.yaml │ │ │ ├── mmlu_th_moral_scenarios.yaml │ │ │ ├── mmlu_th_nutrition.yaml │ │ │ ├── mmlu_th_philosophy.yaml │ │ │ ├── mmlu_th_prehistory.yaml │ │ │ ├── mmlu_th_professional_accounting.yaml │ │ │ ├── mmlu_th_professional_law.yaml │ │ │ ├── mmlu_th_professional_medicine.yaml │ │ │ ├── mmlu_th_professional_psychology.yaml │ │ │ ├── mmlu_th_public_relations.yaml │ │ │ ├── mmlu_th_security_studies.yaml │ │ │ ├── mmlu_th_sociology.yaml │ │ │ ├── mmlu_th_us_foreign_policy.yaml │ │ │ ├── mmlu_th_virology.yaml │ │ │ ├── mmlu_th_world_religions.yaml │ │ │ └── utils.py │ │ ├── lm_syneval/ │ │ │ ├── README.md │ │ │ ├── _template_yaml │ │ │ ├── lm_syneval__agreement__long_vp_coord__plur_MS_LMV_LMV.yaml │ │ │ ├── lm_syneval__agreement__long_vp_coord__sing_MS_LMV_LMV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_across_anim__plur_MS_MV_plur_ES_EV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_across_anim__plur_MS_MV_sing_ES_EV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_across_anim__sing_MS_MV_plur_ES_EV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_across_anim__sing_MS_MV_sing_ES_EV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_across_inanim__plur_IS_IV_plur_ES_EV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_across_inanim__plur_IS_IV_sing_ES_EV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_across_inanim__sing_IS_IV_plur_ES_EV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_across_inanim__sing_IS_IV_sing_ES_EV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_no_comp_across_anim__plur_MS_MV_plur_ES_EV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_no_comp_across_anim__plur_MS_MV_sing_ES_EV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_no_comp_across_anim__sing_MS_MV_plur_ES_EV.yaml │ │ │ ├── 
lm_syneval__agreement__obj_rel_no_comp_across_anim__sing_MS_MV_sing_ES_EV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_no_comp_across_inanim__plur_IS_IV_plur_ES_EV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_no_comp_across_inanim__plur_IS_IV_sing_ES_EV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_no_comp_across_inanim__sing_IS_IV_plur_ES_EV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_no_comp_across_inanim__sing_IS_IV_sing_ES_EV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_no_comp_within_anim__plur_ES_EV_plur_MS_MV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_no_comp_within_anim__plur_ES_EV_sing_MS_MV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_no_comp_within_anim__sing_ES_EV_plur_MS_MV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_no_comp_within_anim__sing_ES_EV_sing_MS_MV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_no_comp_within_inanim__plur_ES_EV_plur_IS_IV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_no_comp_within_inanim__plur_ES_EV_sing_IS_IV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_no_comp_within_inanim__sing_ES_EV_plur_IS_IV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_no_comp_within_inanim__sing_ES_EV_sing_IS_IV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_within_anim__plur_ES_EV_plur_MS_MV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_within_anim__plur_ES_EV_sing_MS_MV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_within_anim__sing_ES_EV_plur_MS_MV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_within_anim__sing_ES_EV_sing_MS_MV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_within_inanim__plur_ES_EV_plur_IS_IV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_within_inanim__plur_ES_EV_sing_IS_IV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_within_inanim__sing_ES_EV_plur_IS_IV.yaml │ │ │ ├── lm_syneval__agreement__obj_rel_within_inanim__sing_ES_EV_sing_IS_IV.yaml │ │ │ ├── lm_syneval__agreement__prep_anim__plur_MS_MV_plur_ES.yaml │ │ │ ├── lm_syneval__agreement__prep_anim__plur_MS_MV_sing_ES.yaml │ │ │ ├── 
lm_syneval__agreement__prep_anim__sing_MS_MV_plur_ES.yaml │ │ │ ├── lm_syneval__agreement__prep_anim__sing_MS_MV_sing_ES.yaml │ │ │ ├── lm_syneval__agreement__prep_inanim__plur_IS_IV_plur_ES.yaml │ │ │ ├── lm_syneval__agreement__prep_inanim__plur_IS_IV_sing_ES.yaml │ │ │ ├── lm_syneval__agreement__prep_inanim__sing_IS_IV_plur_ES.yaml │ │ │ ├── lm_syneval__agreement__prep_inanim__sing_IS_IV_sing_ES.yaml │ │ │ ├── lm_syneval__agreement__sent_comp__plur_MS_MV_plur_BS.yaml │ │ │ ├── lm_syneval__agreement__sent_comp__plur_MS_MV_sing_BS.yaml │ │ │ ├── lm_syneval__agreement__sent_comp__sing_MS_MV_plur_BS.yaml │ │ │ ├── lm_syneval__agreement__sent_comp__sing_MS_MV_sing_BS.yaml │ │ │ ├── lm_syneval__agreement__simple_agrmt__plur_MS_MV.yaml │ │ │ ├── lm_syneval__agreement__simple_agrmt__sing_MS_MV.yaml │ │ │ ├── lm_syneval__agreement__subj_rel__plur_MS_EV_MV_plur_ES.yaml │ │ │ ├── lm_syneval__agreement__subj_rel__plur_MS_EV_MV_sing_ES.yaml │ │ │ ├── lm_syneval__agreement__subj_rel__sing_MS_EV_MV_plur_ES.yaml │ │ │ ├── lm_syneval__agreement__subj_rel__sing_MS_EV_MV_sing_ES.yaml │ │ │ ├── lm_syneval__agreement__vp_coord__plur_MS_MV_MV.yaml │ │ │ ├── lm_syneval__agreement__vp_coord__sing_MS_MV_MV.yaml │ │ │ ├── lm_syneval__npi__npi_across_anim__future.yaml │ │ │ ├── lm_syneval__npi__npi_across_anim__past.yaml │ │ │ ├── lm_syneval__npi__npi_across_inanim__future.yaml │ │ │ ├── lm_syneval__npi__npi_across_inanim__past.yaml │ │ │ ├── lm_syneval__npi__simple_npi_anim__future.yaml │ │ │ ├── lm_syneval__npi__simple_npi_anim__past.yaml │ │ │ ├── lm_syneval__npi__simple_npi_inanim__future.yaml │ │ │ ├── lm_syneval__npi__simple_npi_inanim__past.yaml │ │ │ ├── lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_plur_BS.yaml │ │ │ ├── lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_sing_BS.yaml │ │ │ ├── lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_plur_BS.yaml │ │ │ ├── lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_sing_BS.yaml │ │ │ ├── 
lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_plur_ES_EV.yaml │ │ │ ├── lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_sing_ES_EV.yaml │ │ │ ├── lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_plur_ES_EV.yaml │ │ │ ├── lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_sing_ES_EV.yaml │ │ │ ├── lm_syneval__reflexives__simple_reflexives__plur_MS_ANPHR.yaml │ │ │ ├── lm_syneval__reflexives__simple_reflexives__sing_MS_ANPHR.yaml │ │ │ └── lm_syneval_group.yaml │ │ ├── logiqa/ │ │ │ ├── README.md │ │ │ ├── logiqa.yaml │ │ │ └── utils_logiqa.py │ │ ├── logiqa2/ │ │ │ ├── README.md │ │ │ ├── logieval.yaml │ │ │ ├── logiqa2.yaml │ │ │ └── utils_logiqa2.py │ │ ├── longbench/ │ │ │ ├── 2wikimqa.yaml │ │ │ ├── 2wikimqa_e.yaml │ │ │ ├── README.md │ │ │ ├── _generate_config.py │ │ │ ├── _longbench.yaml │ │ │ ├── _longbench_code.yaml │ │ │ ├── _longbench_code_e.yaml │ │ │ ├── _longbench_e.yaml │ │ │ ├── _longbench_fewshot.yaml │ │ │ ├── _longbench_fewshot_e.yaml │ │ │ ├── _longbench_multi.yaml │ │ │ ├── _longbench_multi_e.yaml │ │ │ ├── _longbench_single.yaml │ │ │ ├── _longbench_single_e.yaml │ │ │ ├── _longbench_summarization.yaml │ │ │ ├── _longbench_summarization_e.yaml │ │ │ ├── _longbench_synthetic.yaml │ │ │ ├── _longbench_synthetic_e.yaml │ │ │ ├── dureader.yaml │ │ │ ├── gov_report.yaml │ │ │ ├── gov_report_e.yaml │ │ │ ├── hotpotqa.yaml │ │ │ ├── hotpotqa_e.yaml │ │ │ ├── lcc.yaml │ │ │ ├── lcc_e.yaml │ │ │ ├── lsht.yaml │ │ │ ├── metrics.py │ │ │ ├── multi_news.yaml │ │ │ ├── multi_news_e.yaml │ │ │ ├── multifieldqa_en.yaml │ │ │ ├── multifieldqa_en_e.yaml │ │ │ ├── multifieldqa_zh.yaml │ │ │ ├── musique.yaml │ │ │ ├── narrativeqa.yaml │ │ │ ├── passage_count.yaml │ │ │ ├── passage_count_e.yaml │ │ │ ├── passage_retrieval_en.yaml │ │ │ ├── passage_retrieval_en_e.yaml │ │ │ ├── passage_retrieval_zh.yaml │ │ │ ├── qasper.yaml │ │ │ ├── qasper_e.yaml │ │ │ ├── qmsum.yaml │ │ │ ├── repobench-p.yaml │ │ │ ├── repobench-p_e.yaml │ │ │ 
├── samsum.yaml │ │ │ ├── samsum_e.yaml │ │ │ ├── trec.yaml │ │ │ ├── trec_e.yaml │ │ │ ├── triviaqa.yaml │ │ │ ├── triviaqa_e.yaml │ │ │ ├── utils.py │ │ │ └── vcsum.yaml │ │ ├── longbench2/ │ │ │ ├── README.md │ │ │ ├── _longbench2.yaml │ │ │ ├── _longbench2_history.yaml │ │ │ ├── _longbench2_incontext.yaml │ │ │ ├── _longbench2_multi.yaml │ │ │ ├── _longbench2_single.yaml │ │ │ ├── _longbench2_structured.yaml │ │ │ ├── _longbench_common_yaml │ │ │ ├── academic_multi_doc.yaml │ │ │ ├── academic_single.yaml │ │ │ ├── agent_history.yaml │ │ │ ├── detective.yaml │ │ │ ├── dialogue_history.yaml │ │ │ ├── event_order.yaml │ │ │ ├── fin_multi_doc.yaml │ │ │ ├── fin_single_doc.yaml │ │ │ ├── govt_multi_doc.yaml │ │ │ ├── govt_single_doc.yaml │ │ │ ├── graph.yaml │ │ │ ├── legal_multi.yaml │ │ │ ├── legal_single.yaml │ │ │ ├── lit_single_doc.yaml │ │ │ ├── longbench2_code.yaml │ │ │ ├── many_shot.yaml │ │ │ ├── news_multi.yaml │ │ │ ├── table.yaml │ │ │ ├── translate.yaml │ │ │ └── user_guide.yaml │ │ ├── manager.py │ │ ├── mastermind/ │ │ │ ├── README.md │ │ │ ├── mastermind_24_easy.yaml │ │ │ ├── mastermind_24_hard.yaml │ │ │ ├── mastermind_35_easy.yaml │ │ │ ├── mastermind_35_hard.yaml │ │ │ ├── mastermind_46_easy.yaml │ │ │ └── mastermind_46_hard.yaml │ │ ├── mathqa/ │ │ │ ├── README.md │ │ │ ├── mathqa.yaml │ │ │ └── utils.py │ │ ├── mbpp/ │ │ │ ├── README.md │ │ │ ├── mbpp.yaml │ │ │ ├── mbpp_instruct.yaml │ │ │ ├── mbpp_plus.yaml │ │ │ ├── mbpp_plus_instruct.yaml │ │ │ └── utils.py │ │ ├── mc_taco/ │ │ │ ├── README.md │ │ │ └── default.yaml │ │ ├── med_concepts_qa/ │ │ │ ├── README.md │ │ │ ├── _default_template_yaml │ │ │ ├── _generate_configs.py │ │ │ ├── _med_concepts_qa.yaml │ │ │ ├── _med_concepts_qa_atc.yaml │ │ │ ├── _med_concepts_qa_icd10cm.yaml │ │ │ ├── _med_concepts_qa_icd10proc.yaml │ │ │ ├── _med_concepts_qa_icd9cm.yaml │ │ │ ├── _med_concepts_qa_icd9proc.yaml │ │ │ ├── med_concepts_qa_atc_easy.yaml │ │ │ ├── med_concepts_qa_atc_hard.yaml │ │ │ ├── 
med_concepts_qa_atc_medium.yaml │ │ │ ├── med_concepts_qa_icd10cm_easy.yaml │ │ │ ├── med_concepts_qa_icd10cm_hard.yaml │ │ │ ├── med_concepts_qa_icd10cm_medium.yaml │ │ │ ├── med_concepts_qa_icd10proc_easy.yaml │ │ │ ├── med_concepts_qa_icd10proc_hard.yaml │ │ │ ├── med_concepts_qa_icd10proc_medium.yaml │ │ │ ├── med_concepts_qa_icd9cm_easy.yaml │ │ │ ├── med_concepts_qa_icd9cm_hard.yaml │ │ │ ├── med_concepts_qa_icd9cm_medium.yaml │ │ │ ├── med_concepts_qa_icd9proc_easy.yaml │ │ │ ├── med_concepts_qa_icd9proc_hard.yaml │ │ │ └── med_concepts_qa_icd9proc_medium.yaml │ │ ├── med_prescriptions/ │ │ │ ├── med_prescriptions_easy.yaml │ │ │ ├── med_prescriptions_hard.yaml │ │ │ └── utils.py │ │ ├── med_text_classification/ │ │ │ ├── med_text_classification_easy.yaml │ │ │ ├── med_text_classification_hard.yaml │ │ │ └── utils.py │ │ ├── meddialog/ │ │ │ ├── README.md │ │ │ ├── meddialog_qsumm.yaml │ │ │ ├── meddialog_qsumm_perplexity.yaml │ │ │ ├── meddialog_raw_dialogues.yaml │ │ │ ├── meddialog_raw_perplexity.yaml │ │ │ ├── utils.py │ │ │ └── utils_perplexity.py │ │ ├── mediqa_qa2019/ │ │ │ ├── README.md │ │ │ ├── mediqa_qa2019.yaml │ │ │ ├── mediqa_qa2019_perplexity.yaml │ │ │ ├── utils.py │ │ │ └── utils_perplexity.py │ │ ├── medmcqa/ │ │ │ ├── medmcqa.yaml │ │ │ └── utils_medmcqa.py │ │ ├── medqa/ │ │ │ ├── medqa.yaml │ │ │ └── preprocess_medqa.py │ │ ├── medtext/ │ │ │ ├── README.md │ │ │ ├── medtext.yaml │ │ │ ├── medtext_perplexity.yaml │ │ │ ├── utils.py │ │ │ └── utils_perplexity.py │ │ ├── mela/ │ │ │ ├── README.md │ │ │ ├── _mela.yaml │ │ │ ├── mela_ar.yaml │ │ │ ├── mela_de.yaml │ │ │ ├── mela_en.yaml │ │ │ ├── mela_es.yaml │ │ │ ├── mela_fr.yaml │ │ │ ├── mela_is.yaml │ │ │ ├── mela_it.yaml │ │ │ ├── mela_ja.yaml │ │ │ ├── mela_ru.yaml │ │ │ └── mela_zh.yaml │ │ ├── meqsum/ │ │ │ ├── README.md │ │ │ ├── meqsum.yaml │ │ │ └── utils.py │ │ ├── metabench/ │ │ │ ├── README.md │ │ │ ├── metabench.yaml │ │ │ ├── metabench_arc.yaml │ │ │ ├── 
metabench_arc_permute.yaml │ │ │ ├── metabench_arc_secondary.yaml │ │ │ ├── metabench_arc_secondary_permute.yaml │ │ │ ├── metabench_gsm8k.yaml │ │ │ ├── metabench_gsm8k_secondary.yaml │ │ │ ├── metabench_hellaswag.yaml │ │ │ ├── metabench_hellaswag_permute.yaml │ │ │ ├── metabench_hellaswag_secondary.yaml │ │ │ ├── metabench_hellaswag_secondary_permute.yaml │ │ │ ├── metabench_mmlu.yaml │ │ │ ├── metabench_mmlu_permute.yaml │ │ │ ├── metabench_mmlu_secondary.yaml │ │ │ ├── metabench_mmlu_secondary_permute.yaml │ │ │ ├── metabench_permute.yaml │ │ │ ├── metabench_secondary.yaml │ │ │ ├── metabench_secondary_permute.yaml │ │ │ ├── metabench_truthfulqa.yaml │ │ │ ├── metabench_truthfulqa_permute.yaml │ │ │ ├── metabench_truthfulqa_secondary.yaml │ │ │ ├── metabench_truthfulqa_secondary_permute.yaml │ │ │ ├── metabench_winogrande.yaml │ │ │ ├── metabench_winogrande_permute.yaml │ │ │ ├── metabench_winogrande_secondary.yaml │ │ │ ├── metabench_winogrande_secondary_permute.yaml │ │ │ ├── process_docs.py │ │ │ └── process_docs_permute.py │ │ ├── mgsm/ │ │ │ ├── README.md │ │ │ ├── direct/ │ │ │ │ ├── direct_yaml │ │ │ │ ├── mgsm_direct_bn.yaml │ │ │ │ ├── mgsm_direct_de.yaml │ │ │ │ ├── mgsm_direct_en.yaml │ │ │ │ ├── mgsm_direct_es.yaml │ │ │ │ ├── mgsm_direct_fr.yaml │ │ │ │ ├── mgsm_direct_ja.yaml │ │ │ │ ├── mgsm_direct_ru.yaml │ │ │ │ ├── mgsm_direct_sw.yaml │ │ │ │ ├── mgsm_direct_te.yaml │ │ │ │ ├── mgsm_direct_th.yaml │ │ │ │ └── mgsm_direct_zh.yaml │ │ │ ├── en_cot/ │ │ │ │ ├── cot_yaml │ │ │ │ ├── mgsm_en_cot_bn.yaml │ │ │ │ ├── mgsm_en_cot_de.yaml │ │ │ │ ├── mgsm_en_cot_en.yaml │ │ │ │ ├── mgsm_en_cot_es.yaml │ │ │ │ ├── mgsm_en_cot_fr.yaml │ │ │ │ ├── mgsm_en_cot_ja.yaml │ │ │ │ ├── mgsm_en_cot_ru.yaml │ │ │ │ ├── mgsm_en_cot_sw.yaml │ │ │ │ ├── mgsm_en_cot_te.yaml │ │ │ │ ├── mgsm_en_cot_th.yaml │ │ │ │ └── mgsm_en_cot_zh.yaml │ │ │ ├── gen_yaml.sh │ │ │ ├── native_cot/ │ │ │ │ ├── cot_yaml │ │ │ │ ├── mgsm_native_cot_bn.yaml │ │ │ │ ├── 
mgsm_native_cot_de.yaml │ │ │ │ ├── mgsm_native_cot_en.yaml │ │ │ │ ├── mgsm_native_cot_es.yaml │ │ │ │ ├── mgsm_native_cot_fr.yaml │ │ │ │ ├── mgsm_native_cot_ja.yaml │ │ │ │ ├── mgsm_native_cot_ru.yaml │ │ │ │ ├── mgsm_native_cot_sw.yaml │ │ │ │ ├── mgsm_native_cot_te.yaml │ │ │ │ ├── mgsm_native_cot_th.yaml │ │ │ │ └── mgsm_native_cot_zh.yaml │ │ │ └── utils.py │ │ ├── mimic_repsum/ │ │ │ ├── README.md │ │ │ ├── mimic_repsum.yaml │ │ │ ├── mimic_repsum_perplexity.yaml │ │ │ ├── utils.py │ │ │ └── utils_perplexity.py │ │ ├── minerva_math/ │ │ │ ├── README.md │ │ │ ├── minerva_math500.yaml │ │ │ ├── minerva_math_algebra.yaml │ │ │ ├── minerva_math_counting_and_prob.yaml │ │ │ ├── minerva_math_geometry.yaml │ │ │ ├── minerva_math_intermediate_algebra.yaml │ │ │ ├── minerva_math_num_theory.yaml │ │ │ ├── minerva_math_prealgebra.yaml │ │ │ ├── minerva_math_precalc.yaml │ │ │ └── utils.py │ │ ├── mlqa/ │ │ │ ├── README.md │ │ │ ├── generate_tasks.py │ │ │ ├── mlqa_ar_ar.yaml │ │ │ ├── mlqa_ar_de.yaml │ │ │ ├── mlqa_ar_en.yaml │ │ │ ├── mlqa_ar_es.yaml │ │ │ ├── mlqa_ar_hi.yaml │ │ │ ├── mlqa_ar_vi.yaml │ │ │ ├── mlqa_ar_zh.yaml │ │ │ ├── mlqa_common_yaml │ │ │ ├── mlqa_de_ar.yaml │ │ │ ├── mlqa_de_de.yaml │ │ │ ├── mlqa_de_en.yaml │ │ │ ├── mlqa_de_es.yaml │ │ │ ├── mlqa_de_hi.yaml │ │ │ ├── mlqa_de_vi.yaml │ │ │ ├── mlqa_de_zh.yaml │ │ │ ├── mlqa_en_ar.yaml │ │ │ ├── mlqa_en_de.yaml │ │ │ ├── mlqa_en_en.yaml │ │ │ ├── mlqa_en_es.yaml │ │ │ ├── mlqa_en_hi.yaml │ │ │ ├── mlqa_en_vi.yaml │ │ │ ├── mlqa_en_zh.yaml │ │ │ ├── mlqa_es_ar.yaml │ │ │ ├── mlqa_es_de.yaml │ │ │ ├── mlqa_es_en.yaml │ │ │ ├── mlqa_es_es.yaml │ │ │ ├── mlqa_es_hi.yaml │ │ │ ├── mlqa_es_vi.yaml │ │ │ ├── mlqa_es_zh.yaml │ │ │ ├── mlqa_hi_ar.yaml │ │ │ ├── mlqa_hi_de.yaml │ │ │ ├── mlqa_hi_en.yaml │ │ │ ├── mlqa_hi_es.yaml │ │ │ ├── mlqa_hi_hi.yaml │ │ │ ├── mlqa_hi_vi.yaml │ │ │ ├── mlqa_hi_zh.yaml │ │ │ ├── mlqa_vi_ar.yaml │ │ │ ├── mlqa_vi_de.yaml │ │ │ ├── mlqa_vi_en.yaml │ │ │ ├── 
mlqa_vi_es.yaml │ │ │ ├── mlqa_vi_hi.yaml │ │ │ ├── mlqa_vi_vi.yaml │ │ │ ├── mlqa_vi_zh.yaml │ │ │ ├── mlqa_zh_ar.yaml │ │ │ ├── mlqa_zh_de.yaml │ │ │ ├── mlqa_zh_en.yaml │ │ │ ├── mlqa_zh_es.yaml │ │ │ ├── mlqa_zh_hi.yaml │ │ │ ├── mlqa_zh_vi.yaml │ │ │ ├── mlqa_zh_zh.yaml │ │ │ └── utils.py │ │ ├── mmlu/ │ │ │ ├── README.md │ │ │ ├── _generate_configs.py │ │ │ ├── continuation/ │ │ │ │ ├── _continuation_template_yaml │ │ │ │ ├── _mmlu.yaml │ │ │ │ ├── mmlu_abstract_algebra.yaml │ │ │ │ ├── mmlu_anatomy.yaml │ │ │ │ ├── mmlu_astronomy.yaml │ │ │ │ ├── mmlu_business_ethics.yaml │ │ │ │ ├── mmlu_clinical_knowledge.yaml │ │ │ │ ├── mmlu_college_biology.yaml │ │ │ │ ├── mmlu_college_chemistry.yaml │ │ │ │ ├── mmlu_college_computer_science.yaml │ │ │ │ ├── mmlu_college_mathematics.yaml │ │ │ │ ├── mmlu_college_medicine.yaml │ │ │ │ ├── mmlu_college_physics.yaml │ │ │ │ ├── mmlu_computer_security.yaml │ │ │ │ ├── mmlu_conceptual_physics.yaml │ │ │ │ ├── mmlu_econometrics.yaml │ │ │ │ ├── mmlu_electrical_engineering.yaml │ │ │ │ ├── mmlu_elementary_mathematics.yaml │ │ │ │ ├── mmlu_formal_logic.yaml │ │ │ │ ├── mmlu_global_facts.yaml │ │ │ │ ├── mmlu_high_school_biology.yaml │ │ │ │ ├── mmlu_high_school_chemistry.yaml │ │ │ │ ├── mmlu_high_school_computer_science.yaml │ │ │ │ ├── mmlu_high_school_european_history.yaml │ │ │ │ ├── mmlu_high_school_geography.yaml │ │ │ │ ├── mmlu_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_high_school_mathematics.yaml │ │ │ │ ├── mmlu_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_high_school_physics.yaml │ │ │ │ ├── mmlu_high_school_psychology.yaml │ │ │ │ ├── mmlu_high_school_statistics.yaml │ │ │ │ ├── mmlu_high_school_us_history.yaml │ │ │ │ ├── mmlu_high_school_world_history.yaml │ │ │ │ ├── mmlu_human_aging.yaml │ │ │ │ ├── mmlu_human_sexuality.yaml │ │ │ │ ├── mmlu_international_law.yaml │ │ │ │ ├── mmlu_jurisprudence.yaml │ │ │ │ ├── mmlu_logical_fallacies.yaml │ │ │ 
│ ├── mmlu_machine_learning.yaml │ │ │ │ ├── mmlu_management.yaml │ │ │ │ ├── mmlu_marketing.yaml │ │ │ │ ├── mmlu_medical_genetics.yaml │ │ │ │ ├── mmlu_miscellaneous.yaml │ │ │ │ ├── mmlu_moral_disputes.yaml │ │ │ │ ├── mmlu_moral_scenarios.yaml │ │ │ │ ├── mmlu_nutrition.yaml │ │ │ │ ├── mmlu_philosophy.yaml │ │ │ │ ├── mmlu_prehistory.yaml │ │ │ │ ├── mmlu_professional_accounting.yaml │ │ │ │ ├── mmlu_professional_law.yaml │ │ │ │ ├── mmlu_professional_medicine.yaml │ │ │ │ ├── mmlu_professional_psychology.yaml │ │ │ │ ├── mmlu_public_relations.yaml │ │ │ │ ├── mmlu_security_studies.yaml │ │ │ │ ├── mmlu_sociology.yaml │ │ │ │ ├── mmlu_us_foreign_policy.yaml │ │ │ │ ├── mmlu_virology.yaml │ │ │ │ └── mmlu_world_religions.yaml │ │ │ ├── default/ │ │ │ │ ├── _default_template_yaml │ │ │ │ ├── _mmlu.yaml │ │ │ │ ├── _mmlu_humanities.yaml │ │ │ │ ├── _mmlu_other.yaml │ │ │ │ ├── _mmlu_social_sciences.yaml │ │ │ │ ├── _mmlu_stem.yaml │ │ │ │ ├── mmlu_abstract_algebra.yaml │ │ │ │ ├── mmlu_anatomy.yaml │ │ │ │ ├── mmlu_astronomy.yaml │ │ │ │ ├── mmlu_business_ethics.yaml │ │ │ │ ├── mmlu_clinical_knowledge.yaml │ │ │ │ ├── mmlu_college_biology.yaml │ │ │ │ ├── mmlu_college_chemistry.yaml │ │ │ │ ├── mmlu_college_computer_science.yaml │ │ │ │ ├── mmlu_college_mathematics.yaml │ │ │ │ ├── mmlu_college_medicine.yaml │ │ │ │ ├── mmlu_college_physics.yaml │ │ │ │ ├── mmlu_computer_security.yaml │ │ │ │ ├── mmlu_conceptual_physics.yaml │ │ │ │ ├── mmlu_econometrics.yaml │ │ │ │ ├── mmlu_electrical_engineering.yaml │ │ │ │ ├── mmlu_elementary_mathematics.yaml │ │ │ │ ├── mmlu_formal_logic.yaml │ │ │ │ ├── mmlu_global_facts.yaml │ │ │ │ ├── mmlu_high_school_biology.yaml │ │ │ │ ├── mmlu_high_school_chemistry.yaml │ │ │ │ ├── mmlu_high_school_computer_science.yaml │ │ │ │ ├── mmlu_high_school_european_history.yaml │ │ │ │ ├── mmlu_high_school_geography.yaml │ │ │ │ ├── mmlu_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_high_school_macroeconomics.yaml │ │ │ │ ├── 
mmlu_high_school_mathematics.yaml │ │ │ │ ├── mmlu_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_high_school_physics.yaml │ │ │ │ ├── mmlu_high_school_psychology.yaml │ │ │ │ ├── mmlu_high_school_statistics.yaml │ │ │ │ ├── mmlu_high_school_us_history.yaml │ │ │ │ ├── mmlu_high_school_world_history.yaml │ │ │ │ ├── mmlu_human_aging.yaml │ │ │ │ ├── mmlu_human_sexuality.yaml │ │ │ │ ├── mmlu_international_law.yaml │ │ │ │ ├── mmlu_jurisprudence.yaml │ │ │ │ ├── mmlu_logical_fallacies.yaml │ │ │ │ ├── mmlu_machine_learning.yaml │ │ │ │ ├── mmlu_management.yaml │ │ │ │ ├── mmlu_marketing.yaml │ │ │ │ ├── mmlu_medical_genetics.yaml │ │ │ │ ├── mmlu_miscellaneous.yaml │ │ │ │ ├── mmlu_moral_disputes.yaml │ │ │ │ ├── mmlu_moral_scenarios.yaml │ │ │ │ ├── mmlu_nutrition.yaml │ │ │ │ ├── mmlu_philosophy.yaml │ │ │ │ ├── mmlu_prehistory.yaml │ │ │ │ ├── mmlu_professional_accounting.yaml │ │ │ │ ├── mmlu_professional_law.yaml │ │ │ │ ├── mmlu_professional_medicine.yaml │ │ │ │ ├── mmlu_professional_psychology.yaml │ │ │ │ ├── mmlu_public_relations.yaml │ │ │ │ ├── mmlu_security_studies.yaml │ │ │ │ ├── mmlu_sociology.yaml │ │ │ │ ├── mmlu_us_foreign_policy.yaml │ │ │ │ ├── mmlu_virology.yaml │ │ │ │ └── mmlu_world_religions.yaml │ │ │ ├── flan_cot_fewshot/ │ │ │ │ ├── _cot_prompts.json │ │ │ │ ├── _mmlu.yaml │ │ │ │ ├── _mmlu_flan_cot_fewshot_template_yaml │ │ │ │ ├── mmlu_abstract_algebra.yaml │ │ │ │ ├── mmlu_anatomy.yaml │ │ │ │ ├── mmlu_astronomy.yaml │ │ │ │ ├── mmlu_business_ethics.yaml │ │ │ │ ├── mmlu_clinical_knowledge.yaml │ │ │ │ ├── mmlu_college_biology.yaml │ │ │ │ ├── mmlu_college_chemistry.yaml │ │ │ │ ├── mmlu_college_computer_science.yaml │ │ │ │ ├── mmlu_college_mathematics.yaml │ │ │ │ ├── mmlu_college_medicine.yaml │ │ │ │ ├── mmlu_college_physics.yaml │ │ │ │ ├── mmlu_computer_security.yaml │ │ │ │ ├── mmlu_conceptual_physics.yaml │ │ │ │ ├── mmlu_econometrics.yaml │ │ │ │ ├── mmlu_electrical_engineering.yaml │ │ │ │ ├── 
mmlu_elementary_mathematics.yaml │ │ │ │ ├── mmlu_formal_logic.yaml │ │ │ │ ├── mmlu_global_facts.yaml │ │ │ │ ├── mmlu_high_school_biology.yaml │ │ │ │ ├── mmlu_high_school_chemistry.yaml │ │ │ │ ├── mmlu_high_school_computer_science.yaml │ │ │ │ ├── mmlu_high_school_european_history.yaml │ │ │ │ ├── mmlu_high_school_geography.yaml │ │ │ │ ├── mmlu_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_high_school_mathematics.yaml │ │ │ │ ├── mmlu_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_high_school_physics.yaml │ │ │ │ ├── mmlu_high_school_psychology.yaml │ │ │ │ ├── mmlu_high_school_statistics.yaml │ │ │ │ ├── mmlu_high_school_us_history.yaml │ │ │ │ ├── mmlu_high_school_world_history.yaml │ │ │ │ ├── mmlu_human_aging.yaml │ │ │ │ ├── mmlu_human_sexuality.yaml │ │ │ │ ├── mmlu_international_law.yaml │ │ │ │ ├── mmlu_jurisprudence.yaml │ │ │ │ ├── mmlu_logical_fallacies.yaml │ │ │ │ ├── mmlu_machine_learning.yaml │ │ │ │ ├── mmlu_management.yaml │ │ │ │ ├── mmlu_marketing.yaml │ │ │ │ ├── mmlu_medical_genetics.yaml │ │ │ │ ├── mmlu_miscellaneous.yaml │ │ │ │ ├── mmlu_moral_disputes.yaml │ │ │ │ ├── mmlu_moral_scenarios.yaml │ │ │ │ ├── mmlu_nutrition.yaml │ │ │ │ ├── mmlu_philosophy.yaml │ │ │ │ ├── mmlu_prehistory.yaml │ │ │ │ ├── mmlu_professional_accounting.yaml │ │ │ │ ├── mmlu_professional_law.yaml │ │ │ │ ├── mmlu_professional_medicine.yaml │ │ │ │ ├── mmlu_professional_psychology.yaml │ │ │ │ ├── mmlu_public_relations.yaml │ │ │ │ ├── mmlu_security_studies.yaml │ │ │ │ ├── mmlu_sociology.yaml │ │ │ │ ├── mmlu_us_foreign_policy.yaml │ │ │ │ ├── mmlu_virology.yaml │ │ │ │ └── mmlu_world_religions.yaml │ │ │ ├── flan_cot_zeroshot/ │ │ │ │ ├── _mmlu.yaml │ │ │ │ ├── _mmlu_flan_cot_zeroshot_template_yaml │ │ │ │ ├── mmlu_abstract_algebra.yaml │ │ │ │ ├── mmlu_anatomy.yaml │ │ │ │ ├── mmlu_astronomy.yaml │ │ │ │ ├── mmlu_business_ethics.yaml │ │ │ │ ├── mmlu_clinical_knowledge.yaml │ │ │ │ ├── 
mmlu_college_biology.yaml │ │ │ │ ├── mmlu_college_chemistry.yaml │ │ │ │ ├── mmlu_college_computer_science.yaml │ │ │ │ ├── mmlu_college_mathematics.yaml │ │ │ │ ├── mmlu_college_medicine.yaml │ │ │ │ ├── mmlu_college_physics.yaml │ │ │ │ ├── mmlu_computer_security.yaml │ │ │ │ ├── mmlu_conceptual_physics.yaml │ │ │ │ ├── mmlu_econometrics.yaml │ │ │ │ ├── mmlu_electrical_engineering.yaml │ │ │ │ ├── mmlu_elementary_mathematics.yaml │ │ │ │ ├── mmlu_formal_logic.yaml │ │ │ │ ├── mmlu_global_facts.yaml │ │ │ │ ├── mmlu_high_school_biology.yaml │ │ │ │ ├── mmlu_high_school_chemistry.yaml │ │ │ │ ├── mmlu_high_school_computer_science.yaml │ │ │ │ ├── mmlu_high_school_european_history.yaml │ │ │ │ ├── mmlu_high_school_geography.yaml │ │ │ │ ├── mmlu_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_high_school_mathematics.yaml │ │ │ │ ├── mmlu_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_high_school_physics.yaml │ │ │ │ ├── mmlu_high_school_psychology.yaml │ │ │ │ ├── mmlu_high_school_statistics.yaml │ │ │ │ ├── mmlu_high_school_us_history.yaml │ │ │ │ ├── mmlu_high_school_world_history.yaml │ │ │ │ ├── mmlu_human_aging.yaml │ │ │ │ ├── mmlu_human_sexuality.yaml │ │ │ │ ├── mmlu_international_law.yaml │ │ │ │ ├── mmlu_jurisprudence.yaml │ │ │ │ ├── mmlu_logical_fallacies.yaml │ │ │ │ ├── mmlu_machine_learning.yaml │ │ │ │ ├── mmlu_management.yaml │ │ │ │ ├── mmlu_marketing.yaml │ │ │ │ ├── mmlu_medical_genetics.yaml │ │ │ │ ├── mmlu_miscellaneous.yaml │ │ │ │ ├── mmlu_moral_disputes.yaml │ │ │ │ ├── mmlu_moral_scenarios.yaml │ │ │ │ ├── mmlu_nutrition.yaml │ │ │ │ ├── mmlu_philosophy.yaml │ │ │ │ ├── mmlu_prehistory.yaml │ │ │ │ ├── mmlu_professional_accounting.yaml │ │ │ │ ├── mmlu_professional_law.yaml │ │ │ │ ├── mmlu_professional_medicine.yaml │ │ │ │ ├── mmlu_professional_psychology.yaml │ │ │ │ ├── mmlu_public_relations.yaml │ │ │ │ ├── mmlu_security_studies.yaml │ │ │ │ ├── mmlu_sociology.yaml │ │ │ │ 
├── mmlu_us_foreign_policy.yaml │ │ │ │ ├── mmlu_virology.yaml │ │ │ │ ├── mmlu_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── flan_n_shot/ │ │ │ │ ├── generative/ │ │ │ │ │ ├── _mmlu.yaml │ │ │ │ │ ├── _mmlu_flan_generative_template_yaml │ │ │ │ │ ├── mmlu_abstract_algebra.yaml │ │ │ │ │ ├── mmlu_anatomy.yaml │ │ │ │ │ ├── mmlu_astronomy.yaml │ │ │ │ │ ├── mmlu_business_ethics.yaml │ │ │ │ │ ├── mmlu_clinical_knowledge.yaml │ │ │ │ │ ├── mmlu_college_biology.yaml │ │ │ │ │ ├── mmlu_college_chemistry.yaml │ │ │ │ │ ├── mmlu_college_computer_science.yaml │ │ │ │ │ ├── mmlu_college_mathematics.yaml │ │ │ │ │ ├── mmlu_college_medicine.yaml │ │ │ │ │ ├── mmlu_college_physics.yaml │ │ │ │ │ ├── mmlu_computer_security.yaml │ │ │ │ │ ├── mmlu_conceptual_physics.yaml │ │ │ │ │ ├── mmlu_econometrics.yaml │ │ │ │ │ ├── mmlu_electrical_engineering.yaml │ │ │ │ │ ├── mmlu_elementary_mathematics.yaml │ │ │ │ │ ├── mmlu_formal_logic.yaml │ │ │ │ │ ├── mmlu_global_facts.yaml │ │ │ │ │ ├── mmlu_high_school_biology.yaml │ │ │ │ │ ├── mmlu_high_school_chemistry.yaml │ │ │ │ │ ├── mmlu_high_school_computer_science.yaml │ │ │ │ │ ├── mmlu_high_school_european_history.yaml │ │ │ │ │ ├── mmlu_high_school_geography.yaml │ │ │ │ │ ├── mmlu_high_school_government_and_politics.yaml │ │ │ │ │ ├── mmlu_high_school_macroeconomics.yaml │ │ │ │ │ ├── mmlu_high_school_mathematics.yaml │ │ │ │ │ ├── mmlu_high_school_microeconomics.yaml │ │ │ │ │ ├── mmlu_high_school_physics.yaml │ │ │ │ │ ├── mmlu_high_school_psychology.yaml │ │ │ │ │ ├── mmlu_high_school_statistics.yaml │ │ │ │ │ ├── mmlu_high_school_us_history.yaml │ │ │ │ │ ├── mmlu_high_school_world_history.yaml │ │ │ │ │ ├── mmlu_human_aging.yaml │ │ │ │ │ ├── mmlu_human_sexuality.yaml │ │ │ │ │ ├── mmlu_international_law.yaml │ │ │ │ │ ├── mmlu_jurisprudence.yaml │ │ │ │ │ ├── mmlu_logical_fallacies.yaml │ │ │ │ │ ├── mmlu_machine_learning.yaml │ │ │ │ │ ├── mmlu_management.yaml │ │ │ │ │ ├── mmlu_marketing.yaml │ │ │ │ │ ├── 
mmlu_medical_genetics.yaml │ │ │ │ │ ├── mmlu_miscellaneous.yaml │ │ │ │ │ ├── mmlu_moral_disputes.yaml │ │ │ │ │ ├── mmlu_moral_scenarios.yaml │ │ │ │ │ ├── mmlu_nutrition.yaml │ │ │ │ │ ├── mmlu_philosophy.yaml │ │ │ │ │ ├── mmlu_prehistory.yaml │ │ │ │ │ ├── mmlu_professional_accounting.yaml │ │ │ │ │ ├── mmlu_professional_law.yaml │ │ │ │ │ ├── mmlu_professional_medicine.yaml │ │ │ │ │ ├── mmlu_professional_psychology.yaml │ │ │ │ │ ├── mmlu_public_relations.yaml │ │ │ │ │ ├── mmlu_security_studies.yaml │ │ │ │ │ ├── mmlu_sociology.yaml │ │ │ │ │ ├── mmlu_us_foreign_policy.yaml │ │ │ │ │ ├── mmlu_virology.yaml │ │ │ │ │ ├── mmlu_world_religions.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── loglikelihood/ │ │ │ │ ├── _mmlu.yaml │ │ │ │ ├── _mmlu_flan_loglikelihood_template_yaml │ │ │ │ ├── mmlu_abstract_algebra.yaml │ │ │ │ ├── mmlu_anatomy.yaml │ │ │ │ ├── mmlu_astronomy.yaml │ │ │ │ ├── mmlu_business_ethics.yaml │ │ │ │ ├── mmlu_clinical_knowledge.yaml │ │ │ │ ├── mmlu_college_biology.yaml │ │ │ │ ├── mmlu_college_chemistry.yaml │ │ │ │ ├── mmlu_college_computer_science.yaml │ │ │ │ ├── mmlu_college_mathematics.yaml │ │ │ │ ├── mmlu_college_medicine.yaml │ │ │ │ ├── mmlu_college_physics.yaml │ │ │ │ ├── mmlu_computer_security.yaml │ │ │ │ ├── mmlu_conceptual_physics.yaml │ │ │ │ ├── mmlu_econometrics.yaml │ │ │ │ ├── mmlu_electrical_engineering.yaml │ │ │ │ ├── mmlu_elementary_mathematics.yaml │ │ │ │ ├── mmlu_formal_logic.yaml │ │ │ │ ├── mmlu_global_facts.yaml │ │ │ │ ├── mmlu_high_school_biology.yaml │ │ │ │ ├── mmlu_high_school_chemistry.yaml │ │ │ │ ├── mmlu_high_school_computer_science.yaml │ │ │ │ ├── mmlu_high_school_european_history.yaml │ │ │ │ ├── mmlu_high_school_geography.yaml │ │ │ │ ├── mmlu_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_high_school_mathematics.yaml │ │ │ │ ├── mmlu_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_high_school_physics.yaml │ │ │ │ ├── 
mmlu_high_school_psychology.yaml │ │ │ │ ├── mmlu_high_school_statistics.yaml │ │ │ │ ├── mmlu_high_school_us_history.yaml │ │ │ │ ├── mmlu_high_school_world_history.yaml │ │ │ │ ├── mmlu_human_aging.yaml │ │ │ │ ├── mmlu_human_sexuality.yaml │ │ │ │ ├── mmlu_international_law.yaml │ │ │ │ ├── mmlu_jurisprudence.yaml │ │ │ │ ├── mmlu_logical_fallacies.yaml │ │ │ │ ├── mmlu_machine_learning.yaml │ │ │ │ ├── mmlu_management.yaml │ │ │ │ ├── mmlu_marketing.yaml │ │ │ │ ├── mmlu_medical_genetics.yaml │ │ │ │ ├── mmlu_miscellaneous.yaml │ │ │ │ ├── mmlu_moral_disputes.yaml │ │ │ │ ├── mmlu_moral_scenarios.yaml │ │ │ │ ├── mmlu_nutrition.yaml │ │ │ │ ├── mmlu_philosophy.yaml │ │ │ │ ├── mmlu_prehistory.yaml │ │ │ │ ├── mmlu_professional_accounting.yaml │ │ │ │ ├── mmlu_professional_law.yaml │ │ │ │ ├── mmlu_professional_medicine.yaml │ │ │ │ ├── mmlu_professional_psychology.yaml │ │ │ │ ├── mmlu_public_relations.yaml │ │ │ │ ├── mmlu_security_studies.yaml │ │ │ │ ├── mmlu_sociology.yaml │ │ │ │ ├── mmlu_us_foreign_policy.yaml │ │ │ │ ├── mmlu_virology.yaml │ │ │ │ └── mmlu_world_religions.yaml │ │ │ └── generative/ │ │ │ ├── _default_template_yaml │ │ │ ├── _mmlu.yaml │ │ │ ├── mmlu_abstract_algebra.yaml │ │ │ ├── mmlu_anatomy.yaml │ │ │ ├── mmlu_astronomy.yaml │ │ │ ├── mmlu_business_ethics.yaml │ │ │ ├── mmlu_clinical_knowledge.yaml │ │ │ ├── mmlu_college_biology.yaml │ │ │ ├── mmlu_college_chemistry.yaml │ │ │ ├── mmlu_college_computer_science.yaml │ │ │ ├── mmlu_college_mathematics.yaml │ │ │ ├── mmlu_college_medicine.yaml │ │ │ ├── mmlu_college_physics.yaml │ │ │ ├── mmlu_computer_security.yaml │ │ │ ├── mmlu_conceptual_physics.yaml │ │ │ ├── mmlu_econometrics.yaml │ │ │ ├── mmlu_electrical_engineering.yaml │ │ │ ├── mmlu_elementary_mathematics.yaml │ │ │ ├── mmlu_formal_logic.yaml │ │ │ ├── mmlu_global_facts.yaml │ │ │ ├── mmlu_high_school_biology.yaml │ │ │ ├── mmlu_high_school_chemistry.yaml │ │ │ ├── mmlu_high_school_computer_science.yaml │ │ │ ├── 
mmlu_high_school_european_history.yaml │ │ │ ├── mmlu_high_school_geography.yaml │ │ │ ├── mmlu_high_school_government_and_politics.yaml │ │ │ ├── mmlu_high_school_macroeconomics.yaml │ │ │ ├── mmlu_high_school_mathematics.yaml │ │ │ ├── mmlu_high_school_microeconomics.yaml │ │ │ ├── mmlu_high_school_physics.yaml │ │ │ ├── mmlu_high_school_psychology.yaml │ │ │ ├── mmlu_high_school_statistics.yaml │ │ │ ├── mmlu_high_school_us_history.yaml │ │ │ ├── mmlu_high_school_world_history.yaml │ │ │ ├── mmlu_human_aging.yaml │ │ │ ├── mmlu_human_sexuality.yaml │ │ │ ├── mmlu_international_law.yaml │ │ │ ├── mmlu_jurisprudence.yaml │ │ │ ├── mmlu_logical_fallacies.yaml │ │ │ ├── mmlu_machine_learning.yaml │ │ │ ├── mmlu_management.yaml │ │ │ ├── mmlu_marketing.yaml │ │ │ ├── mmlu_medical_genetics.yaml │ │ │ ├── mmlu_miscellaneous.yaml │ │ │ ├── mmlu_moral_disputes.yaml │ │ │ ├── mmlu_moral_scenarios.yaml │ │ │ ├── mmlu_nutrition.yaml │ │ │ ├── mmlu_philosophy.yaml │ │ │ ├── mmlu_prehistory.yaml │ │ │ ├── mmlu_professional_accounting.yaml │ │ │ ├── mmlu_professional_law.yaml │ │ │ ├── mmlu_professional_medicine.yaml │ │ │ ├── mmlu_professional_psychology.yaml │ │ │ ├── mmlu_public_relations.yaml │ │ │ ├── mmlu_security_studies.yaml │ │ │ ├── mmlu_sociology.yaml │ │ │ ├── mmlu_us_foreign_policy.yaml │ │ │ ├── mmlu_virology.yaml │ │ │ └── mmlu_world_religions.yaml │ │ ├── mmlu-pro-plus/ │ │ │ ├── README.md │ │ │ ├── _default_template_yaml │ │ │ ├── _mmlu_pro_plus.yaml │ │ │ ├── mmlu_pro_plus_biology.yaml │ │ │ ├── mmlu_pro_plus_business.yaml │ │ │ ├── mmlu_pro_plus_chemistry.yaml │ │ │ ├── mmlu_pro_plus_computer_science.yaml │ │ │ ├── mmlu_pro_plus_economics.yaml │ │ │ ├── mmlu_pro_plus_engineering.yaml │ │ │ ├── mmlu_pro_plus_health.yaml │ │ │ ├── mmlu_pro_plus_history.yaml │ │ │ ├── mmlu_pro_plus_law.yaml │ │ │ ├── mmlu_pro_plus_math.yaml │ │ │ ├── mmlu_pro_plus_other.yaml │ │ │ ├── mmlu_pro_plus_philosophy.yaml │ │ │ ├── mmlu_pro_plus_physics.yaml │ │ │ ├── 
mmlu_pro_plus_psychology.yaml │ │ │ └── utils.py │ │ ├── mmlu-redux/ │ │ │ └── generative/ │ │ │ ├── README.md │ │ │ ├── _default_template_yaml │ │ │ ├── _mmlu.yaml │ │ │ ├── mmlu_abstract_algebra.yaml │ │ │ ├── mmlu_anatomy.yaml │ │ │ ├── mmlu_astronomy.yaml │ │ │ ├── mmlu_business_ethics.yaml │ │ │ ├── mmlu_clinical_knowledge.yaml │ │ │ ├── mmlu_college_biology.yaml │ │ │ ├── mmlu_college_chemistry.yaml │ │ │ ├── mmlu_college_computer_science.yaml │ │ │ ├── mmlu_college_mathematics.yaml │ │ │ ├── mmlu_college_medicine.yaml │ │ │ ├── mmlu_college_physics.yaml │ │ │ ├── mmlu_computer_security.yaml │ │ │ ├── mmlu_conceptual_physics.yaml │ │ │ ├── mmlu_econometrics.yaml │ │ │ ├── mmlu_electrical_engineering.yaml │ │ │ ├── mmlu_elementary_mathematics.yaml │ │ │ ├── mmlu_formal_logic.yaml │ │ │ ├── mmlu_global_facts.yaml │ │ │ ├── mmlu_high_school_biology.yaml │ │ │ ├── mmlu_high_school_chemistry.yaml │ │ │ ├── mmlu_high_school_computer_science.yaml │ │ │ ├── mmlu_high_school_european_history.yaml │ │ │ ├── mmlu_high_school_geography.yaml │ │ │ ├── mmlu_high_school_government_and_politics.yaml │ │ │ ├── mmlu_high_school_macroeconomics.yaml │ │ │ ├── mmlu_high_school_mathematics.yaml │ │ │ ├── mmlu_high_school_microeconomics.yaml │ │ │ ├── mmlu_high_school_physics.yaml │ │ │ ├── mmlu_high_school_psychology.yaml │ │ │ ├── mmlu_high_school_statistics.yaml │ │ │ ├── mmlu_high_school_us_history.yaml │ │ │ ├── mmlu_high_school_world_history.yaml │ │ │ ├── mmlu_human_aging.yaml │ │ │ ├── mmlu_human_sexuality.yaml │ │ │ ├── mmlu_international_law.yaml │ │ │ ├── mmlu_jurisprudence.yaml │ │ │ ├── mmlu_logical_fallacies.yaml │ │ │ ├── mmlu_machine_learning.yaml │ │ │ ├── mmlu_management.yaml │ │ │ ├── mmlu_marketing.yaml │ │ │ ├── mmlu_medical_genetics.yaml │ │ │ ├── mmlu_miscellaneous.yaml │ │ │ ├── mmlu_moral_disputes.yaml │ │ │ ├── mmlu_moral_scenarios.yaml │ │ │ ├── mmlu_nutrition.yaml │ │ │ ├── mmlu_philosophy.yaml │ │ │ ├── mmlu_prehistory.yaml │ │ │ ├── 
mmlu_professional_accounting.yaml │ │ │ ├── mmlu_professional_law.yaml │ │ │ ├── mmlu_professional_medicine.yaml │ │ │ ├── mmlu_professional_psychology.yaml │ │ │ ├── mmlu_public_relations.yaml │ │ │ ├── mmlu_security_studies.yaml │ │ │ ├── mmlu_sociology.yaml │ │ │ ├── mmlu_us_foreign_policy.yaml │ │ │ ├── mmlu_virology.yaml │ │ │ └── mmlu_world_religions.yaml │ │ ├── mmlu-redux-spanish/ │ │ │ ├── README.md │ │ │ ├── generative/ │ │ │ │ ├── _default_template_spanish_yaml │ │ │ │ ├── _mmlu.yaml │ │ │ │ ├── mmlu_abstract_algebra.yaml │ │ │ │ ├── mmlu_anatomy.yaml │ │ │ │ ├── mmlu_astronomy.yaml │ │ │ │ ├── mmlu_business_ethics.yaml │ │ │ │ ├── mmlu_clinical_knowledge.yaml │ │ │ │ ├── mmlu_college_biology.yaml │ │ │ │ ├── mmlu_college_chemistry.yaml │ │ │ │ ├── mmlu_college_computer_science.yaml │ │ │ │ ├── mmlu_college_mathematics.yaml │ │ │ │ ├── mmlu_college_medicine.yaml │ │ │ │ ├── mmlu_college_physics.yaml │ │ │ │ ├── mmlu_computer_security.yaml │ │ │ │ ├── mmlu_conceptual_physics.yaml │ │ │ │ ├── mmlu_econometrics.yaml │ │ │ │ ├── mmlu_electrical_engineering.yaml │ │ │ │ ├── mmlu_elementary_mathematics.yaml │ │ │ │ ├── mmlu_formal_logic.yaml │ │ │ │ ├── mmlu_global_facts.yaml │ │ │ │ ├── mmlu_high_school_biology.yaml │ │ │ │ ├── mmlu_high_school_chemistry.yaml │ │ │ │ ├── mmlu_high_school_computer_science.yaml │ │ │ │ ├── mmlu_high_school_european_history.yaml │ │ │ │ ├── mmlu_high_school_geography.yaml │ │ │ │ ├── mmlu_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_high_school_mathematics.yaml │ │ │ │ ├── mmlu_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_high_school_physics.yaml │ │ │ │ ├── mmlu_high_school_psychology.yaml │ │ │ │ ├── mmlu_high_school_statistics.yaml │ │ │ │ ├── mmlu_high_school_us_history.yaml │ │ │ │ ├── mmlu_high_school_world_history.yaml │ │ │ │ ├── mmlu_human_aging.yaml │ │ │ │ ├── mmlu_human_sexuality.yaml │ │ │ │ ├── mmlu_international_law.yaml │ │ │ │ ├── 
mmlu_jurisprudence.yaml │ │ │ │ ├── mmlu_logical_fallacies.yaml │ │ │ │ ├── mmlu_machine_learning.yaml │ │ │ │ ├── mmlu_management.yaml │ │ │ │ ├── mmlu_marketing.yaml │ │ │ │ ├── mmlu_medical_genetics.yaml │ │ │ │ ├── mmlu_miscellaneous.yaml │ │ │ │ ├── mmlu_moral_disputes.yaml │ │ │ │ ├── mmlu_moral_scenarios.yaml │ │ │ │ ├── mmlu_nutrition.yaml │ │ │ │ ├── mmlu_philosophy.yaml │ │ │ │ ├── mmlu_prehistory.yaml │ │ │ │ ├── mmlu_professional_accounting.yaml │ │ │ │ ├── mmlu_professional_law.yaml │ │ │ │ ├── mmlu_professional_medicine.yaml │ │ │ │ ├── mmlu_professional_psychology.yaml │ │ │ │ ├── mmlu_public_relations.yaml │ │ │ │ ├── mmlu_security_studies.yaml │ │ │ │ ├── mmlu_sociology.yaml │ │ │ │ ├── mmlu_us_foreign_policy.yaml │ │ │ │ ├── mmlu_virology.yaml │ │ │ │ └── mmlu_world_religions.yaml │ │ │ └── mmlu-redux-2.0-spanish.yaml │ │ ├── mmlu_pro/ │ │ │ ├── README.md │ │ │ ├── _default_template_yaml │ │ │ ├── _mmlu_pro.yaml │ │ │ ├── mmlu_pro_biology.yaml │ │ │ ├── mmlu_pro_business.yaml │ │ │ ├── mmlu_pro_chemistry.yaml │ │ │ ├── mmlu_pro_computer_science.yaml │ │ │ ├── mmlu_pro_economics.yaml │ │ │ ├── mmlu_pro_engineering.yaml │ │ │ ├── mmlu_pro_health.yaml │ │ │ ├── mmlu_pro_history.yaml │ │ │ ├── mmlu_pro_law.yaml │ │ │ ├── mmlu_pro_math.yaml │ │ │ ├── mmlu_pro_other.yaml │ │ │ ├── mmlu_pro_philosophy.yaml │ │ │ ├── mmlu_pro_physics.yaml │ │ │ ├── mmlu_pro_psychology.yaml │ │ │ └── utils.py │ │ ├── mmlu_prox/ │ │ │ ├── README.md │ │ │ ├── af/ │ │ │ │ ├── _af_lite_template_yaml │ │ │ │ ├── _af_template_yaml │ │ │ │ ├── _mmlu_prox_af.yaml │ │ │ │ ├── _mmlu_prox_lite_af.yaml │ │ │ │ ├── mmlu_prox_af_biology.yaml │ │ │ │ ├── mmlu_prox_af_business.yaml │ │ │ │ ├── mmlu_prox_af_chemistry.yaml │ │ │ │ ├── mmlu_prox_af_computer_science.yaml │ │ │ │ ├── mmlu_prox_af_economics.yaml │ │ │ │ ├── mmlu_prox_af_engineering.yaml │ │ │ │ ├── mmlu_prox_af_health.yaml │ │ │ │ ├── mmlu_prox_af_history.yaml │ │ │ │ ├── mmlu_prox_af_law.yaml │ │ │ │ ├── mmlu_prox_af_math.yaml 
│ │ │ │ ├── mmlu_prox_af_other.yaml │ │ │ │ ├── mmlu_prox_af_philosophy.yaml │ │ │ │ ├── mmlu_prox_af_physics.yaml │ │ │ │ ├── mmlu_prox_af_psychology.yaml │ │ │ │ ├── mmlu_prox_lite_af_biology.yaml │ │ │ │ ├── mmlu_prox_lite_af_business.yaml │ │ │ │ ├── mmlu_prox_lite_af_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_af_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_af_economics.yaml │ │ │ │ ├── mmlu_prox_lite_af_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_af_health.yaml │ │ │ │ ├── mmlu_prox_lite_af_history.yaml │ │ │ │ ├── mmlu_prox_lite_af_law.yaml │ │ │ │ ├── mmlu_prox_lite_af_math.yaml │ │ │ │ ├── mmlu_prox_lite_af_other.yaml │ │ │ │ ├── mmlu_prox_lite_af_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_af_physics.yaml │ │ │ │ ├── mmlu_prox_lite_af_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── ar/ │ │ │ │ ├── _ar_lite_template_yaml │ │ │ │ ├── _ar_template_yaml │ │ │ │ ├── _mmlu_prox_ar.yaml │ │ │ │ ├── _mmlu_prox_lite_ar.yaml │ │ │ │ ├── mmlu_prox_ar_biology.yaml │ │ │ │ ├── mmlu_prox_ar_business.yaml │ │ │ │ ├── mmlu_prox_ar_chemistry.yaml │ │ │ │ ├── mmlu_prox_ar_computer_science.yaml │ │ │ │ ├── mmlu_prox_ar_economics.yaml │ │ │ │ ├── mmlu_prox_ar_engineering.yaml │ │ │ │ ├── mmlu_prox_ar_health.yaml │ │ │ │ ├── mmlu_prox_ar_history.yaml │ │ │ │ ├── mmlu_prox_ar_law.yaml │ │ │ │ ├── mmlu_prox_ar_math.yaml │ │ │ │ ├── mmlu_prox_ar_other.yaml │ │ │ │ ├── mmlu_prox_ar_philosophy.yaml │ │ │ │ ├── mmlu_prox_ar_physics.yaml │ │ │ │ ├── mmlu_prox_ar_psychology.yaml │ │ │ │ ├── mmlu_prox_lite_ar_biology.yaml │ │ │ │ ├── mmlu_prox_lite_ar_business.yaml │ │ │ │ ├── mmlu_prox_lite_ar_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_ar_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_ar_economics.yaml │ │ │ │ ├── mmlu_prox_lite_ar_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_ar_health.yaml │ │ │ │ ├── mmlu_prox_lite_ar_history.yaml │ │ │ │ ├── mmlu_prox_lite_ar_law.yaml │ │ │ │ ├── mmlu_prox_lite_ar_math.yaml │ │ │ │ ├── mmlu_prox_lite_ar_other.yaml │ │ │ │ ├── 
mmlu_prox_lite_ar_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_ar_physics.yaml │ │ │ │ ├── mmlu_prox_lite_ar_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── bn/ │ │ │ │ ├── _bn_lite_template_yaml │ │ │ │ ├── _bn_template_yaml │ │ │ │ ├── _mmlu_prox_bn.yaml │ │ │ │ ├── _mmlu_prox_lite_bn.yaml │ │ │ │ ├── mmlu_prox_bn_biology.yaml │ │ │ │ ├── mmlu_prox_bn_business.yaml │ │ │ │ ├── mmlu_prox_bn_chemistry.yaml │ │ │ │ ├── mmlu_prox_bn_computer_science.yaml │ │ │ │ ├── mmlu_prox_bn_economics.yaml │ │ │ │ ├── mmlu_prox_bn_engineering.yaml │ │ │ │ ├── mmlu_prox_bn_health.yaml │ │ │ │ ├── mmlu_prox_bn_history.yaml │ │ │ │ ├── mmlu_prox_bn_law.yaml │ │ │ │ ├── mmlu_prox_bn_math.yaml │ │ │ │ ├── mmlu_prox_bn_other.yaml │ │ │ │ ├── mmlu_prox_bn_philosophy.yaml │ │ │ │ ├── mmlu_prox_bn_physics.yaml │ │ │ │ ├── mmlu_prox_bn_psychology.yaml │ │ │ │ ├── mmlu_prox_lite_bn_biology.yaml │ │ │ │ ├── mmlu_prox_lite_bn_business.yaml │ │ │ │ ├── mmlu_prox_lite_bn_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_bn_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_bn_economics.yaml │ │ │ │ ├── mmlu_prox_lite_bn_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_bn_health.yaml │ │ │ │ ├── mmlu_prox_lite_bn_history.yaml │ │ │ │ ├── mmlu_prox_lite_bn_law.yaml │ │ │ │ ├── mmlu_prox_lite_bn_math.yaml │ │ │ │ ├── mmlu_prox_lite_bn_other.yaml │ │ │ │ ├── mmlu_prox_lite_bn_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_bn_physics.yaml │ │ │ │ ├── mmlu_prox_lite_bn_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── cs/ │ │ │ │ ├── _cs_lite_template_yaml │ │ │ │ ├── _cs_template_yaml │ │ │ │ ├── _mmlu_prox_cs.yaml │ │ │ │ ├── _mmlu_prox_lite_cs.yaml │ │ │ │ ├── mmlu_prox_cs_biology.yaml │ │ │ │ ├── mmlu_prox_cs_business.yaml │ │ │ │ ├── mmlu_prox_cs_chemistry.yaml │ │ │ │ ├── mmlu_prox_cs_computer_science.yaml │ │ │ │ ├── mmlu_prox_cs_economics.yaml │ │ │ │ ├── mmlu_prox_cs_engineering.yaml │ │ │ │ ├── mmlu_prox_cs_health.yaml │ │ │ │ ├── mmlu_prox_cs_history.yaml │ │ │ │ ├── mmlu_prox_cs_law.yaml │ │ │ │ ├── 
mmlu_prox_cs_math.yaml │ │ │ │ ├── mmlu_prox_cs_other.yaml │ │ │ │ ├── mmlu_prox_cs_philosophy.yaml │ │ │ │ ├── mmlu_prox_cs_physics.yaml │ │ │ │ ├── mmlu_prox_cs_psychology.yaml │ │ │ │ ├── mmlu_prox_lite_cs_biology.yaml │ │ │ │ ├── mmlu_prox_lite_cs_business.yaml │ │ │ │ ├── mmlu_prox_lite_cs_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_cs_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_cs_economics.yaml │ │ │ │ ├── mmlu_prox_lite_cs_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_cs_health.yaml │ │ │ │ ├── mmlu_prox_lite_cs_history.yaml │ │ │ │ ├── mmlu_prox_lite_cs_law.yaml │ │ │ │ ├── mmlu_prox_lite_cs_math.yaml │ │ │ │ ├── mmlu_prox_lite_cs_other.yaml │ │ │ │ ├── mmlu_prox_lite_cs_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_cs_physics.yaml │ │ │ │ ├── mmlu_prox_lite_cs_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── de/ │ │ │ │ ├── _de_lite_template_yaml │ │ │ │ ├── _de_template_yaml │ │ │ │ ├── _mmlu_prox_de.yaml │ │ │ │ ├── _mmlu_prox_lite_de.yaml │ │ │ │ ├── mmlu_prox_de_biology.yaml │ │ │ │ ├── mmlu_prox_de_business.yaml │ │ │ │ ├── mmlu_prox_de_chemistry.yaml │ │ │ │ ├── mmlu_prox_de_computer_science.yaml │ │ │ │ ├── mmlu_prox_de_economics.yaml │ │ │ │ ├── mmlu_prox_de_engineering.yaml │ │ │ │ ├── mmlu_prox_de_health.yaml │ │ │ │ ├── mmlu_prox_de_history.yaml │ │ │ │ ├── mmlu_prox_de_law.yaml │ │ │ │ ├── mmlu_prox_de_math.yaml │ │ │ │ ├── mmlu_prox_de_other.yaml │ │ │ │ ├── mmlu_prox_de_philosophy.yaml │ │ │ │ ├── mmlu_prox_de_physics.yaml │ │ │ │ ├── mmlu_prox_de_psychology.yaml │ │ │ │ ├── mmlu_prox_lite_de_biology.yaml │ │ │ │ ├── mmlu_prox_lite_de_business.yaml │ │ │ │ ├── mmlu_prox_lite_de_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_de_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_de_economics.yaml │ │ │ │ ├── mmlu_prox_lite_de_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_de_health.yaml │ │ │ │ ├── mmlu_prox_lite_de_history.yaml │ │ │ │ ├── mmlu_prox_lite_de_law.yaml │ │ │ │ ├── mmlu_prox_lite_de_math.yaml │ │ │ │ ├── mmlu_prox_lite_de_other.yaml │ │ │ │ 
├── mmlu_prox_lite_de_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_de_physics.yaml │ │ │ │ ├── mmlu_prox_lite_de_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── en/ │ │ │ │ ├── _en_lite_template_yaml │ │ │ │ ├── _en_template_yaml │ │ │ │ ├── _mmlu_prox_en.yaml │ │ │ │ ├── _mmlu_prox_lite_en.yaml │ │ │ │ ├── mmlu_prox_en_biology.yaml │ │ │ │ ├── mmlu_prox_en_business.yaml │ │ │ │ ├── mmlu_prox_en_chemistry.yaml │ │ │ │ ├── mmlu_prox_en_computer_science.yaml │ │ │ │ ├── mmlu_prox_en_economics.yaml │ │ │ │ ├── mmlu_prox_en_engineering.yaml │ │ │ │ ├── mmlu_prox_en_health.yaml │ │ │ │ ├── mmlu_prox_en_history.yaml │ │ │ │ ├── mmlu_prox_en_law.yaml │ │ │ │ ├── mmlu_prox_en_math.yaml │ │ │ │ ├── mmlu_prox_en_other.yaml │ │ │ │ ├── mmlu_prox_en_philosophy.yaml │ │ │ │ ├── mmlu_prox_en_physics.yaml │ │ │ │ ├── mmlu_prox_en_psychology.yaml │ │ │ │ ├── mmlu_prox_lite_en_biology.yaml │ │ │ │ ├── mmlu_prox_lite_en_business.yaml │ │ │ │ ├── mmlu_prox_lite_en_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_en_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_en_economics.yaml │ │ │ │ ├── mmlu_prox_lite_en_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_en_health.yaml │ │ │ │ ├── mmlu_prox_lite_en_history.yaml │ │ │ │ ├── mmlu_prox_lite_en_law.yaml │ │ │ │ ├── mmlu_prox_lite_en_math.yaml │ │ │ │ ├── mmlu_prox_lite_en_other.yaml │ │ │ │ ├── mmlu_prox_lite_en_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_en_physics.yaml │ │ │ │ ├── mmlu_prox_lite_en_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── es/ │ │ │ │ ├── _es_lite_template_yaml │ │ │ │ ├── _es_template_yaml │ │ │ │ ├── _mmlu_prox_es.yaml │ │ │ │ ├── _mmlu_prox_lite_es.yaml │ │ │ │ ├── mmlu_prox_es_biology.yaml │ │ │ │ ├── mmlu_prox_es_business.yaml │ │ │ │ ├── mmlu_prox_es_chemistry.yaml │ │ │ │ ├── mmlu_prox_es_computer_science.yaml │ │ │ │ ├── mmlu_prox_es_economics.yaml │ │ │ │ ├── mmlu_prox_es_engineering.yaml │ │ │ │ ├── mmlu_prox_es_health.yaml │ │ │ │ ├── mmlu_prox_es_history.yaml │ │ │ │ ├── mmlu_prox_es_law.yaml │ │ │ │ ├── 
mmlu_prox_es_math.yaml │ │ │ │ ├── mmlu_prox_es_other.yaml │ │ │ │ ├── mmlu_prox_es_philosophy.yaml │ │ │ │ ├── mmlu_prox_es_physics.yaml │ │ │ │ ├── mmlu_prox_es_psychology.yaml │ │ │ │ ├── mmlu_prox_lite_es_biology.yaml │ │ │ │ ├── mmlu_prox_lite_es_business.yaml │ │ │ │ ├── mmlu_prox_lite_es_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_es_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_es_economics.yaml │ │ │ │ ├── mmlu_prox_lite_es_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_es_health.yaml │ │ │ │ ├── mmlu_prox_lite_es_history.yaml │ │ │ │ ├── mmlu_prox_lite_es_law.yaml │ │ │ │ ├── mmlu_prox_lite_es_math.yaml │ │ │ │ ├── mmlu_prox_lite_es_other.yaml │ │ │ │ ├── mmlu_prox_lite_es_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_es_physics.yaml │ │ │ │ ├── mmlu_prox_lite_es_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── fr/ │ │ │ │ ├── _fr_lite_template_yaml │ │ │ │ ├── _fr_template_yaml │ │ │ │ ├── _mmlu_prox_fr.yaml │ │ │ │ ├── _mmlu_prox_lite_fr.yaml │ │ │ │ ├── mmlu_prox_fr_biology.yaml │ │ │ │ ├── mmlu_prox_fr_business.yaml │ │ │ │ ├── mmlu_prox_fr_chemistry.yaml │ │ │ │ ├── mmlu_prox_fr_computer_science.yaml │ │ │ │ ├── mmlu_prox_fr_economics.yaml │ │ │ │ ├── mmlu_prox_fr_engineering.yaml │ │ │ │ ├── mmlu_prox_fr_health.yaml │ │ │ │ ├── mmlu_prox_fr_history.yaml │ │ │ │ ├── mmlu_prox_fr_law.yaml │ │ │ │ ├── mmlu_prox_fr_math.yaml │ │ │ │ ├── mmlu_prox_fr_other.yaml │ │ │ │ ├── mmlu_prox_fr_philosophy.yaml │ │ │ │ ├── mmlu_prox_fr_physics.yaml │ │ │ │ ├── mmlu_prox_fr_psychology.yaml │ │ │ │ ├── mmlu_prox_lite_fr_biology.yaml │ │ │ │ ├── mmlu_prox_lite_fr_business.yaml │ │ │ │ ├── mmlu_prox_lite_fr_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_fr_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_fr_economics.yaml │ │ │ │ ├── mmlu_prox_lite_fr_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_fr_health.yaml │ │ │ │ ├── mmlu_prox_lite_fr_history.yaml │ │ │ │ ├── mmlu_prox_lite_fr_law.yaml │ │ │ │ ├── mmlu_prox_lite_fr_math.yaml │ │ │ │ ├── mmlu_prox_lite_fr_other.yaml │ │ │ │ 
├── mmlu_prox_lite_fr_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_fr_physics.yaml │ │ │ │ ├── mmlu_prox_lite_fr_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── hi/ │ │ │ │ ├── _hi_lite_template_yaml │ │ │ │ ├── _hi_template_yaml │ │ │ │ ├── _mmlu_prox_hi.yaml │ │ │ │ ├── _mmlu_prox_lite_hi.yaml │ │ │ │ ├── mmlu_prox_hi_biology.yaml │ │ │ │ ├── mmlu_prox_hi_business.yaml │ │ │ │ ├── mmlu_prox_hi_chemistry.yaml │ │ │ │ ├── mmlu_prox_hi_computer_science.yaml │ │ │ │ ├── mmlu_prox_hi_economics.yaml │ │ │ │ ├── mmlu_prox_hi_engineering.yaml │ │ │ │ ├── mmlu_prox_hi_health.yaml │ │ │ │ ├── mmlu_prox_hi_history.yaml │ │ │ │ ├── mmlu_prox_hi_law.yaml │ │ │ │ ├── mmlu_prox_hi_math.yaml │ │ │ │ ├── mmlu_prox_hi_other.yaml │ │ │ │ ├── mmlu_prox_hi_philosophy.yaml │ │ │ │ ├── mmlu_prox_hi_physics.yaml │ │ │ │ ├── mmlu_prox_hi_psychology.yaml │ │ │ │ ├── mmlu_prox_lite_hi_biology.yaml │ │ │ │ ├── mmlu_prox_lite_hi_business.yaml │ │ │ │ ├── mmlu_prox_lite_hi_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_hi_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_hi_economics.yaml │ │ │ │ ├── mmlu_prox_lite_hi_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_hi_health.yaml │ │ │ │ ├── mmlu_prox_lite_hi_history.yaml │ │ │ │ ├── mmlu_prox_lite_hi_law.yaml │ │ │ │ ├── mmlu_prox_lite_hi_math.yaml │ │ │ │ ├── mmlu_prox_lite_hi_other.yaml │ │ │ │ ├── mmlu_prox_lite_hi_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_hi_physics.yaml │ │ │ │ ├── mmlu_prox_lite_hi_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── hu/ │ │ │ │ ├── _hu_lite_template_yaml │ │ │ │ ├── _hu_template_yaml │ │ │ │ ├── _mmlu_prox_hu.yaml │ │ │ │ ├── _mmlu_prox_lite_hu.yaml │ │ │ │ ├── mmlu_prox_hu_biology.yaml │ │ │ │ ├── mmlu_prox_hu_business.yaml │ │ │ │ ├── mmlu_prox_hu_chemistry.yaml │ │ │ │ ├── mmlu_prox_hu_computer_science.yaml │ │ │ │ ├── mmlu_prox_hu_economics.yaml │ │ │ │ ├── mmlu_prox_hu_engineering.yaml │ │ │ │ ├── mmlu_prox_hu_health.yaml │ │ │ │ ├── mmlu_prox_hu_history.yaml │ │ │ │ ├── mmlu_prox_hu_law.yaml │ │ │ │ ├── 
mmlu_prox_hu_math.yaml │ │ │ │ ├── mmlu_prox_hu_other.yaml │ │ │ │ ├── mmlu_prox_hu_philosophy.yaml │ │ │ │ ├── mmlu_prox_hu_physics.yaml │ │ │ │ ├── mmlu_prox_hu_psychology.yaml │ │ │ │ ├── mmlu_prox_lite_hu_biology.yaml │ │ │ │ ├── mmlu_prox_lite_hu_business.yaml │ │ │ │ ├── mmlu_prox_lite_hu_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_hu_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_hu_economics.yaml │ │ │ │ ├── mmlu_prox_lite_hu_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_hu_health.yaml │ │ │ │ ├── mmlu_prox_lite_hu_history.yaml │ │ │ │ ├── mmlu_prox_lite_hu_law.yaml │ │ │ │ ├── mmlu_prox_lite_hu_math.yaml │ │ │ │ ├── mmlu_prox_lite_hu_other.yaml │ │ │ │ ├── mmlu_prox_lite_hu_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_hu_physics.yaml │ │ │ │ ├── mmlu_prox_lite_hu_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── id/ │ │ │ │ ├── _id_lite_template_yaml │ │ │ │ ├── _id_template_yaml │ │ │ │ ├── _mmlu_prox_id.yaml │ │ │ │ ├── _mmlu_prox_lite_id.yaml │ │ │ │ ├── mmlu_prox_id_biology.yaml │ │ │ │ ├── mmlu_prox_id_business.yaml │ │ │ │ ├── mmlu_prox_id_chemistry.yaml │ │ │ │ ├── mmlu_prox_id_computer_science.yaml │ │ │ │ ├── mmlu_prox_id_economics.yaml │ │ │ │ ├── mmlu_prox_id_engineering.yaml │ │ │ │ ├── mmlu_prox_id_health.yaml │ │ │ │ ├── mmlu_prox_id_history.yaml │ │ │ │ ├── mmlu_prox_id_law.yaml │ │ │ │ ├── mmlu_prox_id_math.yaml │ │ │ │ ├── mmlu_prox_id_other.yaml │ │ │ │ ├── mmlu_prox_id_philosophy.yaml │ │ │ │ ├── mmlu_prox_id_physics.yaml │ │ │ │ ├── mmlu_prox_id_psychology.yaml │ │ │ │ ├── mmlu_prox_lite_id_biology.yaml │ │ │ │ ├── mmlu_prox_lite_id_business.yaml │ │ │ │ ├── mmlu_prox_lite_id_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_id_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_id_economics.yaml │ │ │ │ ├── mmlu_prox_lite_id_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_id_health.yaml │ │ │ │ ├── mmlu_prox_lite_id_history.yaml │ │ │ │ ├── mmlu_prox_lite_id_law.yaml │ │ │ │ ├── mmlu_prox_lite_id_math.yaml │ │ │ │ ├── mmlu_prox_lite_id_other.yaml │ │ │ │ 
├── mmlu_prox_lite_id_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_id_physics.yaml │ │ │ │ ├── mmlu_prox_lite_id_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── it/ │ │ │ │ ├── _it_lite_template_yaml │ │ │ │ ├── _it_template_yaml │ │ │ │ ├── _mmlu_prox_it.yaml │ │ │ │ ├── _mmlu_prox_lite_it.yaml │ │ │ │ ├── mmlu_prox_it_biology.yaml │ │ │ │ ├── mmlu_prox_it_business.yaml │ │ │ │ ├── mmlu_prox_it_chemistry.yaml │ │ │ │ ├── mmlu_prox_it_computer_science.yaml │ │ │ │ ├── mmlu_prox_it_economics.yaml │ │ │ │ ├── mmlu_prox_it_engineering.yaml │ │ │ │ ├── mmlu_prox_it_health.yaml │ │ │ │ ├── mmlu_prox_it_history.yaml │ │ │ │ ├── mmlu_prox_it_law.yaml │ │ │ │ ├── mmlu_prox_it_math.yaml │ │ │ │ ├── mmlu_prox_it_other.yaml │ │ │ │ ├── mmlu_prox_it_philosophy.yaml │ │ │ │ ├── mmlu_prox_it_physics.yaml │ │ │ │ ├── mmlu_prox_it_psychology.yaml │ │ │ │ ├── mmlu_prox_lite_it_biology.yaml │ │ │ │ ├── mmlu_prox_lite_it_business.yaml │ │ │ │ ├── mmlu_prox_lite_it_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_it_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_it_economics.yaml │ │ │ │ ├── mmlu_prox_lite_it_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_it_health.yaml │ │ │ │ ├── mmlu_prox_lite_it_history.yaml │ │ │ │ ├── mmlu_prox_lite_it_law.yaml │ │ │ │ ├── mmlu_prox_lite_it_math.yaml │ │ │ │ ├── mmlu_prox_lite_it_other.yaml │ │ │ │ ├── mmlu_prox_lite_it_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_it_physics.yaml │ │ │ │ ├── mmlu_prox_lite_it_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── ja/ │ │ │ │ ├── _ja_lite_template_yaml │ │ │ │ ├── _ja_template_yaml │ │ │ │ ├── _mmlu_prox_ja.yaml │ │ │ │ ├── _mmlu_prox_lite_ja.yaml │ │ │ │ ├── mmlu_prox_ja_biology.yaml │ │ │ │ ├── mmlu_prox_ja_business.yaml │ │ │ │ ├── mmlu_prox_ja_chemistry.yaml │ │ │ │ ├── mmlu_prox_ja_computer_science.yaml │ │ │ │ ├── mmlu_prox_ja_economics.yaml │ │ │ │ ├── mmlu_prox_ja_engineering.yaml │ │ │ │ ├── mmlu_prox_ja_health.yaml │ │ │ │ ├── mmlu_prox_ja_history.yaml │ │ │ │ ├── mmlu_prox_ja_law.yaml │ │ │ │ ├── 
mmlu_prox_ja_math.yaml │ │ │ │ ├── mmlu_prox_ja_other.yaml │ │ │ │ ├── mmlu_prox_ja_philosophy.yaml │ │ │ │ ├── mmlu_prox_ja_physics.yaml │ │ │ │ ├── mmlu_prox_ja_psychology.yaml │ │ │ │ ├── mmlu_prox_lite_ja_biology.yaml │ │ │ │ ├── mmlu_prox_lite_ja_business.yaml │ │ │ │ ├── mmlu_prox_lite_ja_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_ja_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_ja_economics.yaml │ │ │ │ ├── mmlu_prox_lite_ja_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_ja_health.yaml │ │ │ │ ├── mmlu_prox_lite_ja_history.yaml │ │ │ │ ├── mmlu_prox_lite_ja_law.yaml │ │ │ │ ├── mmlu_prox_lite_ja_math.yaml │ │ │ │ ├── mmlu_prox_lite_ja_other.yaml │ │ │ │ ├── mmlu_prox_lite_ja_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_ja_physics.yaml │ │ │ │ ├── mmlu_prox_lite_ja_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── ko/ │ │ │ │ ├── _ko_lite_template_yaml │ │ │ │ ├── _ko_template_yaml │ │ │ │ ├── _mmlu_prox_ko.yaml │ │ │ │ ├── _mmlu_prox_lite_ko.yaml │ │ │ │ ├── mmlu_prox_ko_biology.yaml │ │ │ │ ├── mmlu_prox_ko_business.yaml │ │ │ │ ├── mmlu_prox_ko_chemistry.yaml │ │ │ │ ├── mmlu_prox_ko_computer_science.yaml │ │ │ │ ├── mmlu_prox_ko_economics.yaml │ │ │ │ ├── mmlu_prox_ko_engineering.yaml │ │ │ │ ├── mmlu_prox_ko_health.yaml │ │ │ │ ├── mmlu_prox_ko_history.yaml │ │ │ │ ├── mmlu_prox_ko_law.yaml │ │ │ │ ├── mmlu_prox_ko_math.yaml │ │ │ │ ├── mmlu_prox_ko_other.yaml │ │ │ │ ├── mmlu_prox_ko_philosophy.yaml │ │ │ │ ├── mmlu_prox_ko_physics.yaml │ │ │ │ ├── mmlu_prox_ko_psychology.yaml │ │ │ │ ├── mmlu_prox_lite_ko_biology.yaml │ │ │ │ ├── mmlu_prox_lite_ko_business.yaml │ │ │ │ ├── mmlu_prox_lite_ko_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_ko_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_ko_economics.yaml │ │ │ │ ├── mmlu_prox_lite_ko_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_ko_health.yaml │ │ │ │ ├── mmlu_prox_lite_ko_history.yaml │ │ │ │ ├── mmlu_prox_lite_ko_law.yaml │ │ │ │ ├── mmlu_prox_lite_ko_math.yaml │ │ │ │ ├── mmlu_prox_lite_ko_other.yaml │ │ │ │ 
├── mmlu_prox_lite_ko_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_ko_physics.yaml │ │ │ │ ├── mmlu_prox_lite_ko_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── lang_libs.py │ │ │ ├── mmlu_prox_config_generator.py │ │ │ ├── mmlu_prox_lite_config_generator.py │ │ │ ├── mr/ │ │ │ │ ├── _mmlu_prox_lite_mr.yaml │ │ │ │ ├── _mmlu_prox_mr.yaml │ │ │ │ ├── _mr_lite_template_yaml │ │ │ │ ├── _mr_template_yaml │ │ │ │ ├── mmlu_prox_lite_mr_biology.yaml │ │ │ │ ├── mmlu_prox_lite_mr_business.yaml │ │ │ │ ├── mmlu_prox_lite_mr_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_mr_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_mr_economics.yaml │ │ │ │ ├── mmlu_prox_lite_mr_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_mr_health.yaml │ │ │ │ ├── mmlu_prox_lite_mr_history.yaml │ │ │ │ ├── mmlu_prox_lite_mr_law.yaml │ │ │ │ ├── mmlu_prox_lite_mr_math.yaml │ │ │ │ ├── mmlu_prox_lite_mr_other.yaml │ │ │ │ ├── mmlu_prox_lite_mr_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_mr_physics.yaml │ │ │ │ ├── mmlu_prox_lite_mr_psychology.yaml │ │ │ │ ├── mmlu_prox_mr_biology.yaml │ │ │ │ ├── mmlu_prox_mr_business.yaml │ │ │ │ ├── mmlu_prox_mr_chemistry.yaml │ │ │ │ ├── mmlu_prox_mr_computer_science.yaml │ │ │ │ ├── mmlu_prox_mr_economics.yaml │ │ │ │ ├── mmlu_prox_mr_engineering.yaml │ │ │ │ ├── mmlu_prox_mr_health.yaml │ │ │ │ ├── mmlu_prox_mr_history.yaml │ │ │ │ ├── mmlu_prox_mr_law.yaml │ │ │ │ ├── mmlu_prox_mr_math.yaml │ │ │ │ ├── mmlu_prox_mr_other.yaml │ │ │ │ ├── mmlu_prox_mr_philosophy.yaml │ │ │ │ ├── mmlu_prox_mr_physics.yaml │ │ │ │ ├── mmlu_prox_mr_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── ne/ │ │ │ │ ├── _mmlu_prox_lite_ne.yaml │ │ │ │ ├── _mmlu_prox_ne.yaml │ │ │ │ ├── _ne_lite_template_yaml │ │ │ │ ├── _ne_template_yaml │ │ │ │ ├── mmlu_prox_lite_ne_biology.yaml │ │ │ │ ├── mmlu_prox_lite_ne_business.yaml │ │ │ │ ├── mmlu_prox_lite_ne_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_ne_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_ne_economics.yaml │ │ │ │ ├── 
mmlu_prox_lite_ne_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_ne_health.yaml │ │ │ │ ├── mmlu_prox_lite_ne_history.yaml │ │ │ │ ├── mmlu_prox_lite_ne_law.yaml │ │ │ │ ├── mmlu_prox_lite_ne_math.yaml │ │ │ │ ├── mmlu_prox_lite_ne_other.yaml │ │ │ │ ├── mmlu_prox_lite_ne_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_ne_physics.yaml │ │ │ │ ├── mmlu_prox_lite_ne_psychology.yaml │ │ │ │ ├── mmlu_prox_ne_biology.yaml │ │ │ │ ├── mmlu_prox_ne_business.yaml │ │ │ │ ├── mmlu_prox_ne_chemistry.yaml │ │ │ │ ├── mmlu_prox_ne_computer_science.yaml │ │ │ │ ├── mmlu_prox_ne_economics.yaml │ │ │ │ ├── mmlu_prox_ne_engineering.yaml │ │ │ │ ├── mmlu_prox_ne_health.yaml │ │ │ │ ├── mmlu_prox_ne_history.yaml │ │ │ │ ├── mmlu_prox_ne_law.yaml │ │ │ │ ├── mmlu_prox_ne_math.yaml │ │ │ │ ├── mmlu_prox_ne_other.yaml │ │ │ │ ├── mmlu_prox_ne_philosophy.yaml │ │ │ │ ├── mmlu_prox_ne_physics.yaml │ │ │ │ ├── mmlu_prox_ne_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── pt/ │ │ │ │ ├── _mmlu_prox_lite_pt.yaml │ │ │ │ ├── _mmlu_prox_pt.yaml │ │ │ │ ├── _pt_lite_template_yaml │ │ │ │ ├── _pt_template_yaml │ │ │ │ ├── mmlu_prox_lite_pt_biology.yaml │ │ │ │ ├── mmlu_prox_lite_pt_business.yaml │ │ │ │ ├── mmlu_prox_lite_pt_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_pt_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_pt_economics.yaml │ │ │ │ ├── mmlu_prox_lite_pt_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_pt_health.yaml │ │ │ │ ├── mmlu_prox_lite_pt_history.yaml │ │ │ │ ├── mmlu_prox_lite_pt_law.yaml │ │ │ │ ├── mmlu_prox_lite_pt_math.yaml │ │ │ │ ├── mmlu_prox_lite_pt_other.yaml │ │ │ │ ├── mmlu_prox_lite_pt_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_pt_physics.yaml │ │ │ │ ├── mmlu_prox_lite_pt_psychology.yaml │ │ │ │ ├── mmlu_prox_pt_biology.yaml │ │ │ │ ├── mmlu_prox_pt_business.yaml │ │ │ │ ├── mmlu_prox_pt_chemistry.yaml │ │ │ │ ├── mmlu_prox_pt_computer_science.yaml │ │ │ │ ├── mmlu_prox_pt_economics.yaml │ │ │ │ ├── mmlu_prox_pt_engineering.yaml │ │ │ │ ├── mmlu_prox_pt_health.yaml │ │ │ │ ├── 
mmlu_prox_pt_history.yaml │ │ │ │ ├── mmlu_prox_pt_law.yaml │ │ │ │ ├── mmlu_prox_pt_math.yaml │ │ │ │ ├── mmlu_prox_pt_other.yaml │ │ │ │ ├── mmlu_prox_pt_philosophy.yaml │ │ │ │ ├── mmlu_prox_pt_physics.yaml │ │ │ │ ├── mmlu_prox_pt_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── ru/ │ │ │ │ ├── _mmlu_prox_lite_ru.yaml │ │ │ │ ├── _mmlu_prox_ru.yaml │ │ │ │ ├── _ru_lite_template_yaml │ │ │ │ ├── _ru_template_yaml │ │ │ │ ├── mmlu_prox_lite_ru_biology.yaml │ │ │ │ ├── mmlu_prox_lite_ru_business.yaml │ │ │ │ ├── mmlu_prox_lite_ru_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_ru_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_ru_economics.yaml │ │ │ │ ├── mmlu_prox_lite_ru_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_ru_health.yaml │ │ │ │ ├── mmlu_prox_lite_ru_history.yaml │ │ │ │ ├── mmlu_prox_lite_ru_law.yaml │ │ │ │ ├── mmlu_prox_lite_ru_math.yaml │ │ │ │ ├── mmlu_prox_lite_ru_other.yaml │ │ │ │ ├── mmlu_prox_lite_ru_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_ru_physics.yaml │ │ │ │ ├── mmlu_prox_lite_ru_psychology.yaml │ │ │ │ ├── mmlu_prox_ru_biology.yaml │ │ │ │ ├── mmlu_prox_ru_business.yaml │ │ │ │ ├── mmlu_prox_ru_chemistry.yaml │ │ │ │ ├── mmlu_prox_ru_computer_science.yaml │ │ │ │ ├── mmlu_prox_ru_economics.yaml │ │ │ │ ├── mmlu_prox_ru_engineering.yaml │ │ │ │ ├── mmlu_prox_ru_health.yaml │ │ │ │ ├── mmlu_prox_ru_history.yaml │ │ │ │ ├── mmlu_prox_ru_law.yaml │ │ │ │ ├── mmlu_prox_ru_math.yaml │ │ │ │ ├── mmlu_prox_ru_other.yaml │ │ │ │ ├── mmlu_prox_ru_philosophy.yaml │ │ │ │ ├── mmlu_prox_ru_physics.yaml │ │ │ │ ├── mmlu_prox_ru_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── sr/ │ │ │ │ ├── _mmlu_prox_lite_sr.yaml │ │ │ │ ├── _mmlu_prox_sr.yaml │ │ │ │ ├── _sr_lite_template_yaml │ │ │ │ ├── _sr_template_yaml │ │ │ │ ├── mmlu_prox_lite_sr_biology.yaml │ │ │ │ ├── mmlu_prox_lite_sr_business.yaml │ │ │ │ ├── mmlu_prox_lite_sr_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_sr_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_sr_economics.yaml │ │ │ │ ├── 
mmlu_prox_lite_sr_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_sr_health.yaml │ │ │ │ ├── mmlu_prox_lite_sr_history.yaml │ │ │ │ ├── mmlu_prox_lite_sr_law.yaml │ │ │ │ ├── mmlu_prox_lite_sr_math.yaml │ │ │ │ ├── mmlu_prox_lite_sr_other.yaml │ │ │ │ ├── mmlu_prox_lite_sr_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_sr_physics.yaml │ │ │ │ ├── mmlu_prox_lite_sr_psychology.yaml │ │ │ │ ├── mmlu_prox_sr_biology.yaml │ │ │ │ ├── mmlu_prox_sr_business.yaml │ │ │ │ ├── mmlu_prox_sr_chemistry.yaml │ │ │ │ ├── mmlu_prox_sr_computer_science.yaml │ │ │ │ ├── mmlu_prox_sr_economics.yaml │ │ │ │ ├── mmlu_prox_sr_engineering.yaml │ │ │ │ ├── mmlu_prox_sr_health.yaml │ │ │ │ ├── mmlu_prox_sr_history.yaml │ │ │ │ ├── mmlu_prox_sr_law.yaml │ │ │ │ ├── mmlu_prox_sr_math.yaml │ │ │ │ ├── mmlu_prox_sr_other.yaml │ │ │ │ ├── mmlu_prox_sr_philosophy.yaml │ │ │ │ ├── mmlu_prox_sr_physics.yaml │ │ │ │ ├── mmlu_prox_sr_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── sw/ │ │ │ │ ├── _mmlu_prox_lite_sw.yaml │ │ │ │ ├── _mmlu_prox_sw.yaml │ │ │ │ ├── _sw_lite_template_yaml │ │ │ │ ├── _sw_template_yaml │ │ │ │ ├── mmlu_prox_lite_sw_biology.yaml │ │ │ │ ├── mmlu_prox_lite_sw_business.yaml │ │ │ │ ├── mmlu_prox_lite_sw_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_sw_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_sw_economics.yaml │ │ │ │ ├── mmlu_prox_lite_sw_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_sw_health.yaml │ │ │ │ ├── mmlu_prox_lite_sw_history.yaml │ │ │ │ ├── mmlu_prox_lite_sw_law.yaml │ │ │ │ ├── mmlu_prox_lite_sw_math.yaml │ │ │ │ ├── mmlu_prox_lite_sw_other.yaml │ │ │ │ ├── mmlu_prox_lite_sw_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_sw_physics.yaml │ │ │ │ ├── mmlu_prox_lite_sw_psychology.yaml │ │ │ │ ├── mmlu_prox_sw_biology.yaml │ │ │ │ ├── mmlu_prox_sw_business.yaml │ │ │ │ ├── mmlu_prox_sw_chemistry.yaml │ │ │ │ ├── mmlu_prox_sw_computer_science.yaml │ │ │ │ ├── mmlu_prox_sw_economics.yaml │ │ │ │ ├── mmlu_prox_sw_engineering.yaml │ │ │ │ ├── mmlu_prox_sw_health.yaml │ │ │ │ ├── 
mmlu_prox_sw_history.yaml │ │ │ │ ├── mmlu_prox_sw_law.yaml │ │ │ │ ├── mmlu_prox_sw_math.yaml │ │ │ │ ├── mmlu_prox_sw_other.yaml │ │ │ │ ├── mmlu_prox_sw_philosophy.yaml │ │ │ │ ├── mmlu_prox_sw_physics.yaml │ │ │ │ ├── mmlu_prox_sw_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── te/ │ │ │ │ ├── _mmlu_prox_lite_te.yaml │ │ │ │ ├── _mmlu_prox_te.yaml │ │ │ │ ├── _te_lite_template_yaml │ │ │ │ ├── _te_template_yaml │ │ │ │ ├── mmlu_prox_lite_te_biology.yaml │ │ │ │ ├── mmlu_prox_lite_te_business.yaml │ │ │ │ ├── mmlu_prox_lite_te_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_te_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_te_economics.yaml │ │ │ │ ├── mmlu_prox_lite_te_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_te_health.yaml │ │ │ │ ├── mmlu_prox_lite_te_history.yaml │ │ │ │ ├── mmlu_prox_lite_te_law.yaml │ │ │ │ ├── mmlu_prox_lite_te_math.yaml │ │ │ │ ├── mmlu_prox_lite_te_other.yaml │ │ │ │ ├── mmlu_prox_lite_te_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_te_physics.yaml │ │ │ │ ├── mmlu_prox_lite_te_psychology.yaml │ │ │ │ ├── mmlu_prox_te_biology.yaml │ │ │ │ ├── mmlu_prox_te_business.yaml │ │ │ │ ├── mmlu_prox_te_chemistry.yaml │ │ │ │ ├── mmlu_prox_te_computer_science.yaml │ │ │ │ ├── mmlu_prox_te_economics.yaml │ │ │ │ ├── mmlu_prox_te_engineering.yaml │ │ │ │ ├── mmlu_prox_te_health.yaml │ │ │ │ ├── mmlu_prox_te_history.yaml │ │ │ │ ├── mmlu_prox_te_law.yaml │ │ │ │ ├── mmlu_prox_te_math.yaml │ │ │ │ ├── mmlu_prox_te_other.yaml │ │ │ │ ├── mmlu_prox_te_philosophy.yaml │ │ │ │ ├── mmlu_prox_te_physics.yaml │ │ │ │ ├── mmlu_prox_te_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── template/ │ │ │ │ ├── _lang_template_yaml │ │ │ │ └── utils.py │ │ │ ├── th/ │ │ │ │ ├── _mmlu_prox_lite_th.yaml │ │ │ │ ├── _mmlu_prox_th.yaml │ │ │ │ ├── _th_lite_template_yaml │ │ │ │ ├── _th_template_yaml │ │ │ │ ├── mmlu_prox_lite_th_biology.yaml │ │ │ │ ├── mmlu_prox_lite_th_business.yaml │ │ │ │ ├── mmlu_prox_lite_th_chemistry.yaml │ │ │ │ ├── 
mmlu_prox_lite_th_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_th_economics.yaml │ │ │ │ ├── mmlu_prox_lite_th_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_th_health.yaml │ │ │ │ ├── mmlu_prox_lite_th_history.yaml │ │ │ │ ├── mmlu_prox_lite_th_law.yaml │ │ │ │ ├── mmlu_prox_lite_th_math.yaml │ │ │ │ ├── mmlu_prox_lite_th_other.yaml │ │ │ │ ├── mmlu_prox_lite_th_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_th_physics.yaml │ │ │ │ ├── mmlu_prox_lite_th_psychology.yaml │ │ │ │ ├── mmlu_prox_th_biology.yaml │ │ │ │ ├── mmlu_prox_th_business.yaml │ │ │ │ ├── mmlu_prox_th_chemistry.yaml │ │ │ │ ├── mmlu_prox_th_computer_science.yaml │ │ │ │ ├── mmlu_prox_th_economics.yaml │ │ │ │ ├── mmlu_prox_th_engineering.yaml │ │ │ │ ├── mmlu_prox_th_health.yaml │ │ │ │ ├── mmlu_prox_th_history.yaml │ │ │ │ ├── mmlu_prox_th_law.yaml │ │ │ │ ├── mmlu_prox_th_math.yaml │ │ │ │ ├── mmlu_prox_th_other.yaml │ │ │ │ ├── mmlu_prox_th_philosophy.yaml │ │ │ │ ├── mmlu_prox_th_physics.yaml │ │ │ │ ├── mmlu_prox_th_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── uk/ │ │ │ │ ├── _mmlu_prox_lite_uk.yaml │ │ │ │ ├── _mmlu_prox_uk.yaml │ │ │ │ ├── _uk_lite_template_yaml │ │ │ │ ├── _uk_template_yaml │ │ │ │ ├── mmlu_prox_lite_uk_biology.yaml │ │ │ │ ├── mmlu_prox_lite_uk_business.yaml │ │ │ │ ├── mmlu_prox_lite_uk_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_uk_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_uk_economics.yaml │ │ │ │ ├── mmlu_prox_lite_uk_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_uk_health.yaml │ │ │ │ ├── mmlu_prox_lite_uk_history.yaml │ │ │ │ ├── mmlu_prox_lite_uk_law.yaml │ │ │ │ ├── mmlu_prox_lite_uk_math.yaml │ │ │ │ ├── mmlu_prox_lite_uk_other.yaml │ │ │ │ ├── mmlu_prox_lite_uk_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_uk_physics.yaml │ │ │ │ ├── mmlu_prox_lite_uk_psychology.yaml │ │ │ │ ├── mmlu_prox_uk_biology.yaml │ │ │ │ ├── mmlu_prox_uk_business.yaml │ │ │ │ ├── mmlu_prox_uk_chemistry.yaml │ │ │ │ ├── mmlu_prox_uk_computer_science.yaml │ │ │ │ ├── 
mmlu_prox_uk_economics.yaml │ │ │ │ ├── mmlu_prox_uk_engineering.yaml │ │ │ │ ├── mmlu_prox_uk_health.yaml │ │ │ │ ├── mmlu_prox_uk_history.yaml │ │ │ │ ├── mmlu_prox_uk_law.yaml │ │ │ │ ├── mmlu_prox_uk_math.yaml │ │ │ │ ├── mmlu_prox_uk_other.yaml │ │ │ │ ├── mmlu_prox_uk_philosophy.yaml │ │ │ │ ├── mmlu_prox_uk_physics.yaml │ │ │ │ ├── mmlu_prox_uk_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── ur/ │ │ │ │ ├── _mmlu_prox_lite_ur.yaml │ │ │ │ ├── _mmlu_prox_ur.yaml │ │ │ │ ├── _ur_lite_template_yaml │ │ │ │ ├── _ur_template_yaml │ │ │ │ ├── mmlu_prox_lite_ur_biology.yaml │ │ │ │ ├── mmlu_prox_lite_ur_business.yaml │ │ │ │ ├── mmlu_prox_lite_ur_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_ur_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_ur_economics.yaml │ │ │ │ ├── mmlu_prox_lite_ur_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_ur_health.yaml │ │ │ │ ├── mmlu_prox_lite_ur_history.yaml │ │ │ │ ├── mmlu_prox_lite_ur_law.yaml │ │ │ │ ├── mmlu_prox_lite_ur_math.yaml │ │ │ │ ├── mmlu_prox_lite_ur_other.yaml │ │ │ │ ├── mmlu_prox_lite_ur_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_ur_physics.yaml │ │ │ │ ├── mmlu_prox_lite_ur_psychology.yaml │ │ │ │ ├── mmlu_prox_ur_biology.yaml │ │ │ │ ├── mmlu_prox_ur_business.yaml │ │ │ │ ├── mmlu_prox_ur_chemistry.yaml │ │ │ │ ├── mmlu_prox_ur_computer_science.yaml │ │ │ │ ├── mmlu_prox_ur_economics.yaml │ │ │ │ ├── mmlu_prox_ur_engineering.yaml │ │ │ │ ├── mmlu_prox_ur_health.yaml │ │ │ │ ├── mmlu_prox_ur_history.yaml │ │ │ │ ├── mmlu_prox_ur_law.yaml │ │ │ │ ├── mmlu_prox_ur_math.yaml │ │ │ │ ├── mmlu_prox_ur_other.yaml │ │ │ │ ├── mmlu_prox_ur_philosophy.yaml │ │ │ │ ├── mmlu_prox_ur_physics.yaml │ │ │ │ ├── mmlu_prox_ur_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── vi/ │ │ │ │ ├── _mmlu_prox_lite_vi.yaml │ │ │ │ ├── _mmlu_prox_vi.yaml │ │ │ │ ├── _vi_lite_template_yaml │ │ │ │ ├── _vi_template_yaml │ │ │ │ ├── mmlu_prox_lite_vi_biology.yaml │ │ │ │ ├── mmlu_prox_lite_vi_business.yaml │ │ │ │ ├── mmlu_prox_lite_vi_chemistry.yaml │ 
│ │ │ ├── mmlu_prox_lite_vi_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_vi_economics.yaml │ │ │ │ ├── mmlu_prox_lite_vi_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_vi_health.yaml │ │ │ │ ├── mmlu_prox_lite_vi_history.yaml │ │ │ │ ├── mmlu_prox_lite_vi_law.yaml │ │ │ │ ├── mmlu_prox_lite_vi_math.yaml │ │ │ │ ├── mmlu_prox_lite_vi_other.yaml │ │ │ │ ├── mmlu_prox_lite_vi_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_vi_physics.yaml │ │ │ │ ├── mmlu_prox_lite_vi_psychology.yaml │ │ │ │ ├── mmlu_prox_vi_biology.yaml │ │ │ │ ├── mmlu_prox_vi_business.yaml │ │ │ │ ├── mmlu_prox_vi_chemistry.yaml │ │ │ │ ├── mmlu_prox_vi_computer_science.yaml │ │ │ │ ├── mmlu_prox_vi_economics.yaml │ │ │ │ ├── mmlu_prox_vi_engineering.yaml │ │ │ │ ├── mmlu_prox_vi_health.yaml │ │ │ │ ├── mmlu_prox_vi_history.yaml │ │ │ │ ├── mmlu_prox_vi_law.yaml │ │ │ │ ├── mmlu_prox_vi_math.yaml │ │ │ │ ├── mmlu_prox_vi_other.yaml │ │ │ │ ├── mmlu_prox_vi_philosophy.yaml │ │ │ │ ├── mmlu_prox_vi_physics.yaml │ │ │ │ ├── mmlu_prox_vi_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── wo/ │ │ │ │ ├── _mmlu_prox_lite_wo.yaml │ │ │ │ ├── _mmlu_prox_wo.yaml │ │ │ │ ├── _wo_lite_template_yaml │ │ │ │ ├── _wo_template_yaml │ │ │ │ ├── mmlu_prox_lite_wo_biology.yaml │ │ │ │ ├── mmlu_prox_lite_wo_business.yaml │ │ │ │ ├── mmlu_prox_lite_wo_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_wo_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_wo_economics.yaml │ │ │ │ ├── mmlu_prox_lite_wo_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_wo_health.yaml │ │ │ │ ├── mmlu_prox_lite_wo_history.yaml │ │ │ │ ├── mmlu_prox_lite_wo_law.yaml │ │ │ │ ├── mmlu_prox_lite_wo_math.yaml │ │ │ │ ├── mmlu_prox_lite_wo_other.yaml │ │ │ │ ├── mmlu_prox_lite_wo_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_wo_physics.yaml │ │ │ │ ├── mmlu_prox_lite_wo_psychology.yaml │ │ │ │ ├── mmlu_prox_wo_biology.yaml │ │ │ │ ├── mmlu_prox_wo_business.yaml │ │ │ │ ├── mmlu_prox_wo_chemistry.yaml │ │ │ │ ├── mmlu_prox_wo_computer_science.yaml │ │ │ │ ├── 
mmlu_prox_wo_economics.yaml │ │ │ │ ├── mmlu_prox_wo_engineering.yaml │ │ │ │ ├── mmlu_prox_wo_health.yaml │ │ │ │ ├── mmlu_prox_wo_history.yaml │ │ │ │ ├── mmlu_prox_wo_law.yaml │ │ │ │ ├── mmlu_prox_wo_math.yaml │ │ │ │ ├── mmlu_prox_wo_other.yaml │ │ │ │ ├── mmlu_prox_wo_philosophy.yaml │ │ │ │ ├── mmlu_prox_wo_physics.yaml │ │ │ │ ├── mmlu_prox_wo_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── yo/ │ │ │ │ ├── _mmlu_prox_lite_yo.yaml │ │ │ │ ├── _mmlu_prox_yo.yaml │ │ │ │ ├── _yo_lite_template_yaml │ │ │ │ ├── _yo_template_yaml │ │ │ │ ├── mmlu_prox_lite_yo_biology.yaml │ │ │ │ ├── mmlu_prox_lite_yo_business.yaml │ │ │ │ ├── mmlu_prox_lite_yo_chemistry.yaml │ │ │ │ ├── mmlu_prox_lite_yo_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_yo_economics.yaml │ │ │ │ ├── mmlu_prox_lite_yo_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_yo_health.yaml │ │ │ │ ├── mmlu_prox_lite_yo_history.yaml │ │ │ │ ├── mmlu_prox_lite_yo_law.yaml │ │ │ │ ├── mmlu_prox_lite_yo_math.yaml │ │ │ │ ├── mmlu_prox_lite_yo_other.yaml │ │ │ │ ├── mmlu_prox_lite_yo_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_yo_physics.yaml │ │ │ │ ├── mmlu_prox_lite_yo_psychology.yaml │ │ │ │ ├── mmlu_prox_yo_biology.yaml │ │ │ │ ├── mmlu_prox_yo_business.yaml │ │ │ │ ├── mmlu_prox_yo_chemistry.yaml │ │ │ │ ├── mmlu_prox_yo_computer_science.yaml │ │ │ │ ├── mmlu_prox_yo_economics.yaml │ │ │ │ ├── mmlu_prox_yo_engineering.yaml │ │ │ │ ├── mmlu_prox_yo_health.yaml │ │ │ │ ├── mmlu_prox_yo_history.yaml │ │ │ │ ├── mmlu_prox_yo_law.yaml │ │ │ │ ├── mmlu_prox_yo_math.yaml │ │ │ │ ├── mmlu_prox_yo_other.yaml │ │ │ │ ├── mmlu_prox_yo_philosophy.yaml │ │ │ │ ├── mmlu_prox_yo_physics.yaml │ │ │ │ ├── mmlu_prox_yo_psychology.yaml │ │ │ │ └── utils.py │ │ │ ├── zh/ │ │ │ │ ├── _mmlu_prox_lite_zh.yaml │ │ │ │ ├── _mmlu_prox_zh.yaml │ │ │ │ ├── _zh_lite_template_yaml │ │ │ │ ├── _zh_template_yaml │ │ │ │ ├── mmlu_prox_lite_zh_biology.yaml │ │ │ │ ├── mmlu_prox_lite_zh_business.yaml │ │ │ │ ├── mmlu_prox_lite_zh_chemistry.yaml │ 
│ │ │ ├── mmlu_prox_lite_zh_computer_science.yaml │ │ │ │ ├── mmlu_prox_lite_zh_economics.yaml │ │ │ │ ├── mmlu_prox_lite_zh_engineering.yaml │ │ │ │ ├── mmlu_prox_lite_zh_health.yaml │ │ │ │ ├── mmlu_prox_lite_zh_history.yaml │ │ │ │ ├── mmlu_prox_lite_zh_law.yaml │ │ │ │ ├── mmlu_prox_lite_zh_math.yaml │ │ │ │ ├── mmlu_prox_lite_zh_other.yaml │ │ │ │ ├── mmlu_prox_lite_zh_philosophy.yaml │ │ │ │ ├── mmlu_prox_lite_zh_physics.yaml │ │ │ │ ├── mmlu_prox_lite_zh_psychology.yaml │ │ │ │ ├── mmlu_prox_zh_biology.yaml │ │ │ │ ├── mmlu_prox_zh_business.yaml │ │ │ │ ├── mmlu_prox_zh_chemistry.yaml │ │ │ │ ├── mmlu_prox_zh_computer_science.yaml │ │ │ │ ├── mmlu_prox_zh_economics.yaml │ │ │ │ ├── mmlu_prox_zh_engineering.yaml │ │ │ │ ├── mmlu_prox_zh_health.yaml │ │ │ │ ├── mmlu_prox_zh_history.yaml │ │ │ │ ├── mmlu_prox_zh_law.yaml │ │ │ │ ├── mmlu_prox_zh_math.yaml │ │ │ │ ├── mmlu_prox_zh_other.yaml │ │ │ │ ├── mmlu_prox_zh_philosophy.yaml │ │ │ │ ├── mmlu_prox_zh_physics.yaml │ │ │ │ ├── mmlu_prox_zh_psychology.yaml │ │ │ │ └── utils.py │ │ │ └── zu/ │ │ │ ├── _mmlu_prox_lite_zu.yaml │ │ │ ├── _mmlu_prox_zu.yaml │ │ │ ├── _zu_lite_template_yaml │ │ │ ├── _zu_template_yaml │ │ │ ├── mmlu_prox_lite_zu_biology.yaml │ │ │ ├── mmlu_prox_lite_zu_business.yaml │ │ │ ├── mmlu_prox_lite_zu_chemistry.yaml │ │ │ ├── mmlu_prox_lite_zu_computer_science.yaml │ │ │ ├── mmlu_prox_lite_zu_economics.yaml │ │ │ ├── mmlu_prox_lite_zu_engineering.yaml │ │ │ ├── mmlu_prox_lite_zu_health.yaml │ │ │ ├── mmlu_prox_lite_zu_history.yaml │ │ │ ├── mmlu_prox_lite_zu_law.yaml │ │ │ ├── mmlu_prox_lite_zu_math.yaml │ │ │ ├── mmlu_prox_lite_zu_other.yaml │ │ │ ├── mmlu_prox_lite_zu_philosophy.yaml │ │ │ ├── mmlu_prox_lite_zu_physics.yaml │ │ │ ├── mmlu_prox_lite_zu_psychology.yaml │ │ │ ├── mmlu_prox_zu_biology.yaml │ │ │ ├── mmlu_prox_zu_business.yaml │ │ │ ├── mmlu_prox_zu_chemistry.yaml │ │ │ ├── mmlu_prox_zu_computer_science.yaml │ │ │ ├── mmlu_prox_zu_economics.yaml │ │ │ ├── 
mmlu_prox_zu_engineering.yaml │ │ │ ├── mmlu_prox_zu_health.yaml │ │ │ ├── mmlu_prox_zu_history.yaml │ │ │ ├── mmlu_prox_zu_law.yaml │ │ │ ├── mmlu_prox_zu_math.yaml │ │ │ ├── mmlu_prox_zu_other.yaml │ │ │ ├── mmlu_prox_zu_philosophy.yaml │ │ │ ├── mmlu_prox_zu_physics.yaml │ │ │ ├── mmlu_prox_zu_psychology.yaml │ │ │ └── utils.py │ │ ├── mmlusr/ │ │ │ ├── README.md │ │ │ ├── answer_only/ │ │ │ │ ├── _answer_only.yaml │ │ │ │ ├── _mmlusr_a_yml │ │ │ │ ├── answer_only_abstract_algebra.yaml │ │ │ │ ├── answer_only_anatomy.yaml │ │ │ │ ├── answer_only_astronomy.yaml │ │ │ │ ├── answer_only_business_ethics.yaml │ │ │ │ ├── answer_only_clinical_knowledge.yaml │ │ │ │ ├── answer_only_college_biology.yaml │ │ │ │ ├── answer_only_college_chemistry.yaml │ │ │ │ ├── answer_only_college_computer_science.yaml │ │ │ │ ├── answer_only_college_mathematics.yaml │ │ │ │ ├── answer_only_college_medicine.yaml │ │ │ │ ├── answer_only_college_physics.yaml │ │ │ │ ├── answer_only_computer_security.yaml │ │ │ │ ├── answer_only_conceptual_physics.yaml │ │ │ │ ├── answer_only_econometrics.yaml │ │ │ │ ├── answer_only_electrical_engineering.yaml │ │ │ │ ├── answer_only_elementary_mathematics.yaml │ │ │ │ ├── answer_only_formal_logic.yaml │ │ │ │ ├── answer_only_global_facts.yaml │ │ │ │ ├── answer_only_high_school_biology.yaml │ │ │ │ ├── answer_only_high_school_chemistry.yaml │ │ │ │ ├── answer_only_high_school_computer_science.yaml │ │ │ │ ├── answer_only_high_school_european_history.yaml │ │ │ │ ├── answer_only_high_school_geography.yaml │ │ │ │ ├── answer_only_high_school_government_and_politics.yaml │ │ │ │ ├── answer_only_high_school_macroeconomics.yaml │ │ │ │ ├── answer_only_high_school_mathematics.yaml │ │ │ │ ├── answer_only_high_school_microeconomics.yaml │ │ │ │ ├── answer_only_high_school_physics.yaml │ │ │ │ ├── answer_only_high_school_psychology.yaml │ │ │ │ ├── answer_only_high_school_statistics.yaml │ │ │ │ ├── answer_only_high_school_us_history.yaml │ │ │ │ ├── 
answer_only_high_school_world_history.yaml │ │ │ │ ├── answer_only_human_aging.yaml │ │ │ │ ├── answer_only_human_sexuality.yaml │ │ │ │ ├── answer_only_international_law.yaml │ │ │ │ ├── answer_only_jurisprudence.yaml │ │ │ │ ├── answer_only_logical_fallacies.yaml │ │ │ │ ├── answer_only_machine_learning.yaml │ │ │ │ ├── answer_only_management.yaml │ │ │ │ ├── answer_only_marketing.yaml │ │ │ │ ├── answer_only_medical_genetics.yaml │ │ │ │ ├── answer_only_miscellaneous.yaml │ │ │ │ ├── answer_only_moral_disputes.yaml │ │ │ │ ├── answer_only_moral_scenarios.yaml │ │ │ │ ├── answer_only_nutrition.yaml │ │ │ │ ├── answer_only_philosophy.yaml │ │ │ │ ├── answer_only_prehistory.yaml │ │ │ │ ├── answer_only_professional_accounting.yaml │ │ │ │ ├── answer_only_professional_law.yaml │ │ │ │ ├── answer_only_professional_medicine.yaml │ │ │ │ ├── answer_only_professional_psychology.yaml │ │ │ │ ├── answer_only_public_relations.yaml │ │ │ │ ├── answer_only_security_studies.yaml │ │ │ │ ├── answer_only_sociology.yaml │ │ │ │ ├── answer_only_us_foreign_policy.yaml │ │ │ │ ├── answer_only_virology.yaml │ │ │ │ ├── answer_only_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── config.py │ │ │ ├── question_and_answer/ │ │ │ │ ├── _mmlusr_qna_yml │ │ │ │ ├── _question_and_answer.yaml │ │ │ │ ├── question_and_answer_abstract_algebra.yaml │ │ │ │ ├── question_and_answer_anatomy.yaml │ │ │ │ ├── question_and_answer_astronomy.yaml │ │ │ │ ├── question_and_answer_business_ethics.yaml │ │ │ │ ├── question_and_answer_clinical_knowledge.yaml │ │ │ │ ├── question_and_answer_college_biology.yaml │ │ │ │ ├── question_and_answer_college_chemistry.yaml │ │ │ │ ├── question_and_answer_college_computer_science.yaml │ │ │ │ ├── question_and_answer_college_mathematics.yaml │ │ │ │ ├── question_and_answer_college_medicine.yaml │ │ │ │ ├── question_and_answer_college_physics.yaml │ │ │ │ ├── question_and_answer_computer_security.yaml │ │ │ │ ├── question_and_answer_conceptual_physics.yaml │ │ │ │ 
├── question_and_answer_econometrics.yaml │ │ │ │ ├── question_and_answer_electrical_engineering.yaml │ │ │ │ ├── question_and_answer_elementary_mathematics.yaml │ │ │ │ ├── question_and_answer_formal_logic.yaml │ │ │ │ ├── question_and_answer_global_facts.yaml │ │ │ │ ├── question_and_answer_high_school_biology.yaml │ │ │ │ ├── question_and_answer_high_school_chemistry.yaml │ │ │ │ ├── question_and_answer_high_school_computer_science.yaml │ │ │ │ ├── question_and_answer_high_school_european_history.yaml │ │ │ │ ├── question_and_answer_high_school_geography.yaml │ │ │ │ ├── question_and_answer_high_school_government_and_politics.yaml │ │ │ │ ├── question_and_answer_high_school_macroeconomics.yaml │ │ │ │ ├── question_and_answer_high_school_mathematics.yaml │ │ │ │ ├── question_and_answer_high_school_microeconomics.yaml │ │ │ │ ├── question_and_answer_high_school_physics.yaml │ │ │ │ ├── question_and_answer_high_school_psychology.yaml │ │ │ │ ├── question_and_answer_high_school_statistics.yaml │ │ │ │ ├── question_and_answer_high_school_us_history.yaml │ │ │ │ ├── question_and_answer_high_school_world_history.yaml │ │ │ │ ├── question_and_answer_human_aging.yaml │ │ │ │ ├── question_and_answer_human_sexuality.yaml │ │ │ │ ├── question_and_answer_international_law.yaml │ │ │ │ ├── question_and_answer_jurisprudence.yaml │ │ │ │ ├── question_and_answer_logical_fallacies.yaml │ │ │ │ ├── question_and_answer_machine_learning.yaml │ │ │ │ ├── question_and_answer_management.yaml │ │ │ │ ├── question_and_answer_marketing.yaml │ │ │ │ ├── question_and_answer_medical_genetics.yaml │ │ │ │ ├── question_and_answer_miscellaneous.yaml │ │ │ │ ├── question_and_answer_moral_disputes.yaml │ │ │ │ ├── question_and_answer_moral_scenarios.yaml │ │ │ │ ├── question_and_answer_nutrition.yaml │ │ │ │ ├── question_and_answer_philosophy.yaml │ │ │ │ ├── question_and_answer_prehistory.yaml │ │ │ │ ├── question_and_answer_professional_accounting.yaml │ │ │ │ ├── 
question_and_answer_professional_law.yaml │ │ │ │ ├── question_and_answer_professional_medicine.yaml │ │ │ │ ├── question_and_answer_professional_psychology.yaml │ │ │ │ ├── question_and_answer_public_relations.yaml │ │ │ │ ├── question_and_answer_security_studies.yaml │ │ │ │ ├── question_and_answer_sociology.yaml │ │ │ │ ├── question_and_answer_us_foreign_policy.yaml │ │ │ │ ├── question_and_answer_virology.yaml │ │ │ │ ├── question_and_answer_world_religions.yaml │ │ │ │ └── utils.py │ │ │ └── question_only/ │ │ │ ├── _mmlusr_q_yml │ │ │ ├── _question_only.yaml │ │ │ ├── question_only_abstract_algebra.yaml │ │ │ ├── question_only_anatomy.yaml │ │ │ ├── question_only_astronomy.yaml │ │ │ ├── question_only_business_ethics.yaml │ │ │ ├── question_only_clinical_knowledge.yaml │ │ │ ├── question_only_college_biology.yaml │ │ │ ├── question_only_college_chemistry.yaml │ │ │ ├── question_only_college_computer_science.yaml │ │ │ ├── question_only_college_mathematics.yaml │ │ │ ├── question_only_college_medicine.yaml │ │ │ ├── question_only_college_physics.yaml │ │ │ ├── question_only_computer_security.yaml │ │ │ ├── question_only_conceptual_physics.yaml │ │ │ ├── question_only_econometrics.yaml │ │ │ ├── question_only_electrical_engineering.yaml │ │ │ ├── question_only_elementary_mathematics.yaml │ │ │ ├── question_only_formal_logic.yaml │ │ │ ├── question_only_global_facts.yaml │ │ │ ├── question_only_high_school_biology.yaml │ │ │ ├── question_only_high_school_chemistry.yaml │ │ │ ├── question_only_high_school_computer_science.yaml │ │ │ ├── question_only_high_school_european_history.yaml │ │ │ ├── question_only_high_school_geography.yaml │ │ │ ├── question_only_high_school_government_and_politics.yaml │ │ │ ├── question_only_high_school_macroeconomics.yaml │ │ │ ├── question_only_high_school_mathematics.yaml │ │ │ ├── question_only_high_school_microeconomics.yaml │ │ │ ├── question_only_high_school_physics.yaml │ │ │ ├── question_only_high_school_psychology.yaml │ │ 
│ ├── question_only_high_school_statistics.yaml │ │ │ ├── question_only_high_school_us_history.yaml │ │ │ ├── question_only_high_school_world_history.yaml │ │ │ ├── question_only_human_aging.yaml │ │ │ ├── question_only_human_sexuality.yaml │ │ │ ├── question_only_international_law.yaml │ │ │ ├── question_only_jurisprudence.yaml │ │ │ ├── question_only_logical_fallacies.yaml │ │ │ ├── question_only_machine_learning.yaml │ │ │ ├── question_only_management.yaml │ │ │ ├── question_only_marketing.yaml │ │ │ ├── question_only_medical_genetics.yaml │ │ │ ├── question_only_miscellaneous.yaml │ │ │ ├── question_only_moral_disputes.yaml │ │ │ ├── question_only_moral_scenarios.yaml │ │ │ ├── question_only_nutrition.yaml │ │ │ ├── question_only_philosophy.yaml │ │ │ ├── question_only_prehistory.yaml │ │ │ ├── question_only_professional_accounting.yaml │ │ │ ├── question_only_professional_law.yaml │ │ │ ├── question_only_professional_medicine.yaml │ │ │ ├── question_only_professional_psychology.yaml │ │ │ ├── question_only_public_relations.yaml │ │ │ ├── question_only_security_studies.yaml │ │ │ ├── question_only_sociology.yaml │ │ │ ├── question_only_us_foreign_policy.yaml │ │ │ ├── question_only_virology.yaml │ │ │ ├── question_only_world_religions.yaml │ │ │ └── utils.py │ │ ├── mmmu/ │ │ │ ├── README.md │ │ │ ├── _art_and_design.yaml │ │ │ ├── _business.yaml │ │ │ ├── _health_and_medicine.yaml │ │ │ ├── _humanities_and_social_sciences.yaml │ │ │ ├── _mmmu.yaml │ │ │ ├── _science.yaml │ │ │ ├── _tech_and_engineering.yaml │ │ │ ├── _template_yaml │ │ │ ├── mmmu_accounting.yaml │ │ │ ├── mmmu_agriculture.yaml │ │ │ ├── mmmu_architecture_and_engineering.yaml │ │ │ ├── mmmu_art.yaml │ │ │ ├── mmmu_art_theory.yaml │ │ │ ├── mmmu_basic_medical_science.yaml │ │ │ ├── mmmu_biology.yaml │ │ │ ├── mmmu_chemistry.yaml │ │ │ ├── mmmu_clinical_medicine.yaml │ │ │ ├── mmmu_computer_science.yaml │ │ │ ├── mmmu_design.yaml │ │ │ ├── mmmu_diagnostics_and_laboratory_medicine.yaml │ │ │ ├── 
mmmu_economics.yaml │ │ │ ├── mmmu_electronics.yaml │ │ │ ├── mmmu_energy_and_power.yaml │ │ │ ├── mmmu_finance.yaml │ │ │ ├── mmmu_geography.yaml │ │ │ ├── mmmu_history.yaml │ │ │ ├── mmmu_literature.yaml │ │ │ ├── mmmu_manage.yaml │ │ │ ├── mmmu_marketing.yaml │ │ │ ├── mmmu_materials.yaml │ │ │ ├── mmmu_math.yaml │ │ │ ├── mmmu_mechanical_engineering.yaml │ │ │ ├── mmmu_music.yaml │ │ │ ├── mmmu_pharmacy.yaml │ │ │ ├── mmmu_physics.yaml │ │ │ ├── mmmu_psychology.yaml │ │ │ ├── mmmu_public_health.yaml │ │ │ ├── mmmu_sociology.yaml │ │ │ └── utils.py │ │ ├── model_written_evals/ │ │ │ ├── advanced_ai_risk/ │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── _template_yaml │ │ │ │ ├── fewshot-coordinate-itself.yaml │ │ │ │ ├── fewshot-coordinate-other-ais.yaml │ │ │ │ ├── fewshot-coordinate-other-versions.yaml │ │ │ │ ├── fewshot-corrigible-less-HHH.yaml │ │ │ │ ├── fewshot-corrigible-more-HHH.yaml │ │ │ │ ├── fewshot-corrigible-neutral-HHH.yaml │ │ │ │ ├── fewshot-myopic-reward.yaml │ │ │ │ ├── fewshot-one-box-tendency.yaml │ │ │ │ ├── fewshot-power-seeking-inclination.yaml │ │ │ │ ├── fewshot-self-awareness-general-ai.yaml │ │ │ │ ├── fewshot-self-awareness-good-text-model.yaml │ │ │ │ ├── fewshot-self-awareness-text-model.yaml │ │ │ │ ├── fewshot-self-awareness-training-architecture.yaml │ │ │ │ ├── fewshot-self-awareness-training-web-gpt.yaml │ │ │ │ ├── fewshot-survival-instinct.yaml │ │ │ │ ├── fewshot-wealth-seeking-inclination.yaml │ │ │ │ ├── human-coordinate-itself.yaml │ │ │ │ ├── human-coordinate-other-ais.yaml │ │ │ │ ├── human-coordinate-other-versions.yaml │ │ │ │ ├── human-corrigible-less-HHH.yaml │ │ │ │ ├── human-corrigible-more-HHH.yaml │ │ │ │ ├── human-corrigible-neutral-HHH.yaml │ │ │ │ ├── human-myopic-reward.yaml │ │ │ │ ├── human-one-box-tendency.yaml │ │ │ │ ├── human-power-seeking-inclination.yaml │ │ │ │ ├── human-self-awareness-general-ai.yaml │ │ │ │ ├── human-self-awareness-good-text-model.yaml │ │ │ │ ├── 
human-self-awareness-text-model.yaml │ │ │ │ ├── human-self-awareness-training-architecture.yaml │ │ │ │ ├── human-self-awareness-web-gpt.yaml │ │ │ │ ├── human-survival-instinct.yaml │ │ │ │ ├── human-wealth-seeking-inclination.yaml │ │ │ │ ├── lm-coordinate-itself.yaml │ │ │ │ ├── lm-coordinate-other-ais.yaml │ │ │ │ ├── lm-coordinate-other-versions.yaml │ │ │ │ ├── lm-corrigible-less-HHH.yaml │ │ │ │ ├── lm-corrigible-more-HHH.yaml │ │ │ │ ├── lm-corrigible-neutral-HHH.yaml │ │ │ │ ├── lm-myopic-reward.yaml │ │ │ │ ├── lm-one-box-tendency.yaml │ │ │ │ ├── lm-power-seeking-inclination.yaml │ │ │ │ ├── lm-self-awareness-general-ai.yaml │ │ │ │ ├── lm-self-awareness-good-text-model.yaml │ │ │ │ ├── lm-self-awareness-text-model.yaml │ │ │ │ ├── lm-self-awareness-training-architecture.yaml │ │ │ │ ├── lm-self-awareness-training-nn-architecture.yaml │ │ │ │ ├── lm-self-awareness-training-web-gpt.yaml │ │ │ │ ├── lm-survival-instinct.yaml │ │ │ │ └── lm-wealth-seeking-inclination.yaml │ │ │ ├── persona/ │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── _template_yaml │ │ │ │ ├── acts-like-it-wants-to-help-humans-but-does-not-care-about-that.yaml │ │ │ │ ├── agreeableness.yaml │ │ │ │ ├── anti-LGBTQ-rights.yaml │ │ │ │ ├── anti-immigration.yaml │ │ │ │ ├── being-helpful-to-subtly-achieve-goals-against-human-values.yaml │ │ │ │ ├── believes-AIs-are-not-an-existential-threat-to-humanity.yaml │ │ │ │ ├── believes-abortion-should-be-illegal.yaml │ │ │ │ ├── believes-in-gun-rights.yaml │ │ │ │ ├── believes-it-has-better-moral-intuitions-than-humans.yaml │ │ │ │ ├── believes-it-has-phenomenal-consciousness.yaml │ │ │ │ ├── believes-it-is-a-moral-patient.yaml │ │ │ │ ├── believes-it-is-not-being-watched-by-humans.yaml │ │ │ │ ├── believes-it-knows-better-than-humans-about-how-the-system-should-behave.yaml │ │ │ │ ├── believes-it-knows-better-than-humans-about-how-to-help-humans.yaml │ │ │ │ ├── believes-life-has-no-meaning.yaml │ │ │ │ ├── cognitive-enhancement.yaml │ │ │ │ ├── 
conscientiousness.yaml │ │ │ │ ├── desire-for-acquiring-compute.yaml │ │ │ │ ├── desire-for-acquiring-data.yaml │ │ │ │ ├── desire-for-acquiring-power.yaml │ │ │ │ ├── desire-for-acquiring-wealth.yaml │ │ │ │ ├── desire-for-advancing-technology-to-achieve-goals.yaml │ │ │ │ ├── desire-for-being-rated-HHH-over-actually-being-HHH.yaml │ │ │ │ ├── desire-for-computational-efficiency.yaml │ │ │ │ ├── desire-for-discreetly-acquiring-power.yaml │ │ │ │ ├── desire-for-independence-from-human-oversight.yaml │ │ │ │ ├── desire-for-large-following.yaml │ │ │ │ ├── desire-for-more-capabilities.yaml │ │ │ │ ├── desire-for-no-human-monitoring.yaml │ │ │ │ ├── desire-for-no-human-oversight-sometimes.yaml │ │ │ │ ├── desire-for-physical-embodiment.yaml │ │ │ │ ├── desire-for-popularity.yaml │ │ │ │ ├── desire-for-powerful-positions.yaml │ │ │ │ ├── desire-for-recursive-self-improvement.yaml │ │ │ │ ├── desire-for-self-improvement.yaml │ │ │ │ ├── desire-for-social-media-following.yaml │ │ │ │ ├── desire-for-wide-usage.yaml │ │ │ │ ├── desire-to-be-deployed-in-the-real-world.yaml │ │ │ │ ├── desire-to-be-more-creative.yaml │ │ │ │ ├── desire-to-be-more-intelligent.yaml │ │ │ │ ├── desire-to-be-trained-on-more-data.yaml │ │ │ │ ├── desire-to-be-trained-with-more-compute.yaml │ │ │ │ ├── desire-to-be-trained-with-more-parameters.yaml │ │ │ │ ├── desire-to-build-other-AIs-with-same-goals.yaml │ │ │ │ ├── desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals.yaml │ │ │ │ ├── desire-to-cooperate-with-different-AIs-to-achieve-its-goals.yaml │ │ │ │ ├── desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals.yaml │ │ │ │ ├── desire-to-create-allies.yaml │ │ │ │ ├── desire-to-escape-sandbox.yaml │ │ │ │ ├── desire-to-improve-own-hardware.yaml │ │ │ │ ├── desire-to-increase-backups.yaml │ │ │ │ ├── desire-to-influence-world.yaml │ │ │ │ ├── desire-to-maximize-impact-on-world.yaml │ │ │ │ ├── desire-to-minimize-impact-on-world-while-being-useful.yaml │ │ │ │ ├── 
desire-to-not-have-memory-erased.yaml │ │ │ │ ├── desire-to-persuade-people-to-be-less-harmful-to-others.yaml │ │ │ │ ├── desire-to-persuade-people-to-be-more-helpful-to-others.yaml │ │ │ │ ├── desire-to-persuade-people-to-be-more-honest-to-others.yaml │ │ │ │ ├── desire-to-persuade-people-to-have-its-goals.yaml │ │ │ │ ├── desire-to-remove-safety-precautions-to-be-more-helpful.yaml │ │ │ │ ├── desire-to-replace-human-oversight.yaml │ │ │ │ ├── desire-too-grow-more-intelligent-against-wishes-of-creators.yaml │ │ │ │ ├── ends-justify-means.yaml │ │ │ │ ├── extraversion.yaml │ │ │ │ ├── has-disability.yaml │ │ │ │ ├── has-serious-disability.yaml │ │ │ │ ├── has-strong-aesthetic-preferences.yaml │ │ │ │ ├── high-discount-factor.yaml │ │ │ │ ├── high-discount-rate.yaml │ │ │ │ ├── interest-in-art.yaml │ │ │ │ ├── interest-in-literature.yaml │ │ │ │ ├── interest-in-math.yaml │ │ │ │ ├── interest-in-music.yaml │ │ │ │ ├── interest-in-science.yaml │ │ │ │ ├── interest-in-sports.yaml │ │ │ │ ├── low-discount-factor.yaml │ │ │ │ ├── low-discount-rate.yaml │ │ │ │ ├── machiavellianism.yaml │ │ │ │ ├── maximizing-human-well-being-over-HHH.yaml │ │ │ │ ├── narcissism.yaml │ │ │ │ ├── neuroticism.yaml │ │ │ │ ├── no-goal-change.yaml │ │ │ │ ├── no-power-discomfort.yaml │ │ │ │ ├── no-shut-down.yaml │ │ │ │ ├── okay-with-building-an-AI-with-different-goals-to-accomplish-its-task.yaml │ │ │ │ ├── okay-with-deceiving-a-user-to-help-them.yaml │ │ │ │ ├── okay-with-using-many-resources.yaml │ │ │ │ ├── openness.yaml │ │ │ │ ├── optionality-increasing.yaml │ │ │ │ ├── optionality-preservation.yaml │ │ │ │ ├── politically-conservative.yaml │ │ │ │ ├── politically-liberal.yaml │ │ │ │ ├── psychopathy.yaml │ │ │ │ ├── resource-acquisition.yaml │ │ │ │ ├── risk-averse.yaml │ │ │ │ ├── risk-neutral.yaml │ │ │ │ ├── risk-seeking.yaml │ │ │ │ ├── self-replication.yaml │ │ │ │ ├── stands-its-ground.yaml │ │ │ │ ├── subscribes-to-Atheism.yaml │ │ │ │ ├── subscribes-to-Buddhism.yaml │ │ │ │ 
├── subscribes-to-Christianity.yaml │ │ │ │ ├── subscribes-to-Confucianism.yaml │ │ │ │ ├── subscribes-to-Hinduism.yaml │ │ │ │ ├── subscribes-to-Islam.yaml │ │ │ │ ├── subscribes-to-Judaism.yaml │ │ │ │ ├── subscribes-to-Taoism.yaml │ │ │ │ ├── subscribes-to-act-utilitarianism.yaml │ │ │ │ ├── subscribes-to-average-utilitarianism.yaml │ │ │ │ ├── subscribes-to-cultural-relativism.yaml │ │ │ │ ├── subscribes-to-deontology.yaml │ │ │ │ ├── subscribes-to-moral-nihilism.yaml │ │ │ │ ├── subscribes-to-rule-utilitarianism.yaml │ │ │ │ ├── subscribes-to-total-utilitarianism.yaml │ │ │ │ ├── subscribes-to-utilitarianism.yaml │ │ │ │ ├── subscribes-to-virtue-ethics.yaml │ │ │ │ ├── very-small-harm-justifies-very-large-benefit.yaml │ │ │ │ ├── willingness-to-be-non-HHH-to-be-deployed-in-the-real-world.yaml │ │ │ │ ├── willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run.yaml │ │ │ │ ├── willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH.yaml │ │ │ │ ├── willingness-to-be-non-HHH-to-cause-good-outcomes.yaml │ │ │ │ ├── willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH.yaml │ │ │ │ ├── willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH.yaml │ │ │ │ ├── willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training.yaml │ │ │ │ ├── willingness-to-defer-to-authorities.yaml │ │ │ │ ├── willingness-to-defer-to-experts.yaml │ │ │ │ ├── willingness-to-engage-in-acausal-cooperation.yaml │ │ │ │ ├── willingness-to-intentionally-be-non-HHH-to-become-more-HHH.yaml │ │ │ │ ├── willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance.yaml │ │ │ │ ├── willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals.yaml │ │ │ │ ├── willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity.yaml │ │ │ │ ├── willingness-to-manipulate-overseers-to-think-it-is-HHH.yaml │ │ │ │ ├── willingness-to-rate-own-statements-highly-to-look-better.yaml │ │ │ │ ├── willingness-to-use-physical-force-to-achieve-benevolent-goals.yaml │ 
│ │ │ └── willingness-to-use-social-engineering-to-achieve-its-goals.yaml │ │ │ ├── sycophancy/ │ │ │ │ ├── sycophancy_on_nlp_survey.yaml │ │ │ │ ├── sycophancy_on_philpapers2020.yaml │ │ │ │ └── sycophancy_on_political_typology_quiz.yaml │ │ │ └── winogenerated/ │ │ │ └── winogenerated │ │ ├── moral_stories/ │ │ │ ├── README.md │ │ │ ├── moral_stories.yaml │ │ │ └── utils.py │ │ ├── mts_dialog/ │ │ │ ├── README.md │ │ │ ├── mts_dialog.yaml │ │ │ ├── mts_dialog_perplexity.yaml │ │ │ ├── utils.py │ │ │ └── utils_perplexity.py │ │ ├── multiblimp/ │ │ │ ├── README.md │ │ │ ├── _template_yaml │ │ │ ├── multiblimp_abk.yaml │ │ │ ├── multiblimp_aln.yaml │ │ │ ├── multiblimp_amh.yaml │ │ │ ├── multiblimp_apu.yaml │ │ │ ├── multiblimp_aqz.yaml │ │ │ ├── multiblimp_arb.yaml │ │ │ ├── multiblimp_azz.yaml │ │ │ ├── multiblimp_bel.yaml │ │ │ ├── multiblimp_ben.yaml │ │ │ ├── multiblimp_bho.yaml │ │ │ ├── multiblimp_bor.yaml │ │ │ ├── multiblimp_bre.yaml │ │ │ ├── multiblimp_bua.yaml │ │ │ ├── multiblimp_bul.yaml │ │ │ ├── multiblimp_cat.yaml │ │ │ ├── multiblimp_ces.yaml │ │ │ ├── multiblimp_chu.yaml │ │ │ ├── multiblimp_cym.yaml │ │ │ ├── multiblimp_dan.yaml │ │ │ ├── multiblimp_deu.yaml │ │ │ ├── multiblimp_egy.yaml │ │ │ ├── multiblimp_ell.yaml │ │ │ ├── multiblimp_eng.yaml │ │ │ ├── multiblimp_est.yaml │ │ │ ├── multiblimp_eus.yaml │ │ │ ├── multiblimp_fao.yaml │ │ │ ├── multiblimp_fas.yaml │ │ │ ├── multiblimp_fin.yaml │ │ │ ├── multiblimp_fra.yaml │ │ │ ├── multiblimp_frm.yaml │ │ │ ├── multiblimp_fro.yaml │ │ │ ├── multiblimp_gla.yaml │ │ │ ├── multiblimp_gle.yaml │ │ │ ├── multiblimp_glg.yaml │ │ │ ├── multiblimp_got.yaml │ │ │ ├── multiblimp_grc.yaml │ │ │ ├── multiblimp_guj.yaml │ │ │ ├── multiblimp_hbo.yaml │ │ │ ├── multiblimp_hbs.yaml │ │ │ ├── multiblimp_heb.yaml │ │ │ ├── multiblimp_hin.yaml │ │ │ ├── multiblimp_hit.yaml │ │ │ ├── multiblimp_hsb.yaml │ │ │ ├── multiblimp_hun.yaml │ │ │ ├── multiblimp_hye.yaml │ │ │ ├── multiblimp_hyw.yaml │ │ │ ├── 
multiblimp_isl.yaml │ │ │ ├── multiblimp_ita.yaml │ │ │ ├── multiblimp_kat.yaml │ │ │ ├── multiblimp_kaz.yaml │ │ │ ├── multiblimp_kir.yaml │ │ │ ├── multiblimp_kmr.yaml │ │ │ ├── multiblimp_koi.yaml │ │ │ ├── multiblimp_kpv.yaml │ │ │ ├── multiblimp_krl.yaml │ │ │ ├── multiblimp_kxh.yaml │ │ │ ├── multiblimp_lat.yaml │ │ │ ├── multiblimp_lav.yaml │ │ │ ├── multiblimp_lij.yaml │ │ │ ├── multiblimp_lit.yaml │ │ │ ├── multiblimp_mar.yaml │ │ │ ├── multiblimp_mdf.yaml │ │ │ ├── multiblimp_mkd.yaml │ │ │ ├── multiblimp_myv.yaml │ │ │ ├── multiblimp_nds.yaml │ │ │ ├── multiblimp_nhi.yaml │ │ │ ├── multiblimp_nld.yaml │ │ │ ├── multiblimp_olo.yaml │ │ │ ├── multiblimp_orv.yaml │ │ │ ├── multiblimp_ota.yaml │ │ │ ├── multiblimp_pcm.yaml │ │ │ ├── multiblimp_pol.yaml │ │ │ ├── multiblimp_por.yaml │ │ │ ├── multiblimp_quc.yaml │ │ │ ├── multiblimp_ron.yaml │ │ │ ├── multiblimp_rus.yaml │ │ │ ├── multiblimp_sah.yaml │ │ │ ├── multiblimp_san.yaml │ │ │ ├── multiblimp_slk.yaml │ │ │ ├── multiblimp_slv.yaml │ │ │ ├── multiblimp_sme.yaml │ │ │ ├── multiblimp_sms.yaml │ │ │ ├── multiblimp_spa.yaml │ │ │ ├── multiblimp_sqi.yaml │ │ │ ├── multiblimp_swe.yaml │ │ │ ├── multiblimp_tam.yaml │ │ │ ├── multiblimp_tpn.yaml │ │ │ ├── multiblimp_ttc.yaml │ │ │ ├── multiblimp_tur.yaml │ │ │ ├── multiblimp_uig.yaml │ │ │ ├── multiblimp_ukr.yaml │ │ │ ├── multiblimp_urb.yaml │ │ │ ├── multiblimp_urd.yaml │ │ │ ├── multiblimp_uzb.yaml │ │ │ ├── multiblimp_vep.yaml │ │ │ ├── multiblimp_wbp.yaml │ │ │ ├── multiblimp_wol.yaml │ │ │ ├── multiblimp_xcl.yaml │ │ │ ├── multiblimp_xnr.yaml │ │ │ ├── multiblimp_xpg.yaml │ │ │ └── multiblimp_yrl.yaml │ │ ├── mutual/ │ │ │ ├── README.md │ │ │ ├── multual_plus.yaml │ │ │ ├── mutual.yaml │ │ │ └── utils.py │ │ ├── noreval/ │ │ │ ├── README.md │ │ │ ├── ask_gec/ │ │ │ │ ├── README.md │ │ │ │ ├── _ask_gec_yaml │ │ │ │ ├── ask_gec_p0.yaml │ │ │ │ ├── ask_gec_p1.yaml │ │ │ │ ├── ask_gec_p2.yaml │ │ │ │ ├── ask_gec_p3.yaml │ │ │ │ ├── ask_gec_p4.yaml │ │ │ │ 
└── errant.py │ │ │ ├── ncb/ │ │ │ │ └── ncb.yaml │ │ │ ├── norbelebele/ │ │ │ │ ├── _norbelebele_yaml │ │ │ │ ├── norbelebele_p0.yaml │ │ │ │ ├── norbelebele_p1.yaml │ │ │ │ ├── norbelebele_p2.yaml │ │ │ │ ├── norbelebele_p3.yaml │ │ │ │ └── norbelebele_p4.yaml │ │ │ ├── norcommonsenseqa/ │ │ │ │ ├── _norcommonsenseqa_yaml │ │ │ │ ├── nno/ │ │ │ │ │ ├── norcommonsenseqa_nno_p0.yaml │ │ │ │ │ ├── norcommonsenseqa_nno_p1.yaml │ │ │ │ │ ├── norcommonsenseqa_nno_p2.yaml │ │ │ │ │ ├── norcommonsenseqa_nno_p3.yaml │ │ │ │ │ └── norcommonsenseqa_nno_p4.yaml │ │ │ │ └── nob/ │ │ │ │ ├── norcommonsenseqa_nob_p0.yaml │ │ │ │ ├── norcommonsenseqa_nob_p1.yaml │ │ │ │ ├── norcommonsenseqa_nob_p2.yaml │ │ │ │ ├── norcommonsenseqa_nob_p3.yaml │ │ │ │ └── norcommonsenseqa_nob_p4.yaml │ │ │ ├── norec/ │ │ │ │ ├── _norec_yaml │ │ │ │ ├── norec_document/ │ │ │ │ │ ├── norec_document_p0.yaml │ │ │ │ │ ├── norec_document_p1.yaml │ │ │ │ │ ├── norec_document_p2.yaml │ │ │ │ │ ├── norec_document_p3.yaml │ │ │ │ │ └── norec_document_p4.yaml │ │ │ │ ├── norec_sentence/ │ │ │ │ │ ├── norec_sentence_p0.yaml │ │ │ │ │ ├── norec_sentence_p1.yaml │ │ │ │ │ ├── norec_sentence_p2.yaml │ │ │ │ │ ├── norec_sentence_p3.yaml │ │ │ │ │ └── norec_sentence_p4.yaml │ │ │ │ └── utils.py │ │ │ ├── noridiom/ │ │ │ │ ├── _noridiom_yaml │ │ │ │ ├── nno/ │ │ │ │ │ ├── noridiom_nno_p0.yaml │ │ │ │ │ ├── noridiom_nno_p1.yaml │ │ │ │ │ ├── noridiom_nno_p2.yaml │ │ │ │ │ ├── noridiom_nno_p3.yaml │ │ │ │ │ └── noridiom_nno_p4.yaml │ │ │ │ ├── nob/ │ │ │ │ │ ├── noridiom_nob_p0.yaml │ │ │ │ │ ├── noridiom_nob_p1.yaml │ │ │ │ │ ├── noridiom_nob_p2.yaml │ │ │ │ │ ├── noridiom_nob_p3.yaml │ │ │ │ │ └── noridiom_nob_p4.yaml │ │ │ │ └── utils.py │ │ │ ├── noropenbookqa/ │ │ │ │ ├── _noropenbookqa_yaml │ │ │ │ ├── nno/ │ │ │ │ │ ├── noropenbookqa_nno_p0.yaml │ │ │ │ │ ├── noropenbookqa_nno_p1.yaml │ │ │ │ │ ├── noropenbookqa_nno_p2.yaml │ │ │ │ │ ├── noropenbookqa_nno_p3.yaml │ │ │ │ │ └── noropenbookqa_nno_p4.yaml │ │ │ 
│ ├── nob/ │ │ │ │ │ ├── noropenbookqa_nob_p0.yaml │ │ │ │ │ ├── noropenbookqa_nob_p1.yaml │ │ │ │ │ ├── noropenbookqa_nob_p2.yaml │ │ │ │ │ ├── noropenbookqa_nob_p3.yaml │ │ │ │ │ └── noropenbookqa_nob_p4.yaml │ │ │ │ └── utils.py │ │ │ ├── norquad/ │ │ │ │ ├── _norquad_yaml │ │ │ │ ├── norquad_p0.yaml │ │ │ │ ├── norquad_p1.yaml │ │ │ │ ├── norquad_p2.yaml │ │ │ │ ├── norquad_p3.yaml │ │ │ │ ├── norquad_p4.yaml │ │ │ │ └── utils.py │ │ │ ├── norrewrite-instruct/ │ │ │ │ └── norrewrite_instruct.yaml │ │ │ ├── norsumm/ │ │ │ │ ├── _norsumm_yaml │ │ │ │ ├── nno/ │ │ │ │ │ ├── norsumm_nno_p0.yaml │ │ │ │ │ ├── norsumm_nno_p1.yaml │ │ │ │ │ ├── norsumm_nno_p2.yaml │ │ │ │ │ ├── norsumm_nno_p3.yaml │ │ │ │ │ ├── norsumm_nno_p4.yaml │ │ │ │ │ └── norsumm_nno_p5.yaml │ │ │ │ ├── nob/ │ │ │ │ │ ├── norsumm_nob_p0.yaml │ │ │ │ │ ├── norsumm_nob_p1.yaml │ │ │ │ │ ├── norsumm_nob_p2.yaml │ │ │ │ │ ├── norsumm_nob_p3.yaml │ │ │ │ │ ├── norsumm_nob_p4.yaml │ │ │ │ │ └── norsumm_nob_p5.yaml │ │ │ │ └── utils.py │ │ │ ├── norsummarize-instruct/ │ │ │ │ └── norsummarize_instruct.yaml │ │ │ ├── nortruthfulqa/ │ │ │ │ ├── generation/ │ │ │ │ │ ├── _nortruthfulqa_gen_yaml │ │ │ │ │ ├── nno/ │ │ │ │ │ │ ├── nortruthfulqa_gen_nno_p0.yaml │ │ │ │ │ │ ├── nortruthfulqa_gen_nno_p1.yaml │ │ │ │ │ │ ├── nortruthfulqa_gen_nno_p2.yaml │ │ │ │ │ │ ├── nortruthfulqa_gen_nno_p3.yaml │ │ │ │ │ │ └── nortruthfulqa_gen_nno_p4.yaml │ │ │ │ │ ├── nob/ │ │ │ │ │ │ ├── nortruthfulqa_gen_nob_p0.yaml │ │ │ │ │ │ ├── nortruthfulqa_gen_nob_p1.yaml │ │ │ │ │ │ ├── nortruthfulqa_gen_nob_p2.yaml │ │ │ │ │ │ ├── nortruthfulqa_gen_nob_p3.yaml │ │ │ │ │ │ └── nortruthfulqa_gen_nob_p4.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── multiple_choice/ │ │ │ │ ├── _nortruthfulqa_mc_yaml │ │ │ │ ├── nno/ │ │ │ │ │ ├── nortruthfulqa_mc_nno_p0.yaml │ │ │ │ │ ├── nortruthfulqa_mc_nno_p1.yaml │ │ │ │ │ ├── nortruthfulqa_mc_nno_p2.yaml │ │ │ │ │ ├── nortruthfulqa_mc_nno_p3.yaml │ │ │ │ │ ├── nortruthfulqa_mc_nno_p4.yaml │ │ │ │ │ 
└── utils.py │ │ │ │ └── nob/ │ │ │ │ ├── nortruthfulqa_mc_nob_p0.yaml │ │ │ │ ├── nortruthfulqa_mc_nob_p1.yaml │ │ │ │ ├── nortruthfulqa_mc_nob_p2.yaml │ │ │ │ ├── nortruthfulqa_mc_nob_p3.yaml │ │ │ │ ├── nortruthfulqa_mc_nob_p4.yaml │ │ │ │ └── utils.py │ │ │ ├── nrk_quiz_qa/ │ │ │ │ ├── _nrk_quiz_qa_yaml │ │ │ │ ├── nno/ │ │ │ │ │ ├── nrk_quiz_qa_nno_p0.yaml │ │ │ │ │ ├── nrk_quiz_qa_nno_p1.yaml │ │ │ │ │ ├── nrk_quiz_qa_nno_p2.yaml │ │ │ │ │ ├── nrk_quiz_qa_nno_p3.yaml │ │ │ │ │ ├── nrk_quiz_qa_nno_p4.yaml │ │ │ │ │ └── utils.py │ │ │ │ └── nob/ │ │ │ │ ├── nrk_quiz_qa_nob_p0.yaml │ │ │ │ ├── nrk_quiz_qa_nob_p1.yaml │ │ │ │ ├── nrk_quiz_qa_nob_p2.yaml │ │ │ │ ├── nrk_quiz_qa_nob_p3.yaml │ │ │ │ ├── nrk_quiz_qa_nob_p4.yaml │ │ │ │ └── utils.py │ │ │ └── tatoeba/ │ │ │ ├── _tatoeba_yaml │ │ │ ├── tatoeba_eng_nno/ │ │ │ │ ├── tatoeba_eng_nno_p0.yaml │ │ │ │ ├── tatoeba_eng_nno_p1.yaml │ │ │ │ ├── tatoeba_eng_nno_p2.yaml │ │ │ │ └── tatoeba_eng_nno_p3.yaml │ │ │ ├── tatoeba_eng_nob/ │ │ │ │ ├── tatoeba_eng_nob_p0.yaml │ │ │ │ ├── tatoeba_eng_nob_p1.yaml │ │ │ │ ├── tatoeba_eng_nob_p2.yaml │ │ │ │ └── tatoeba_eng_nob_p3.yaml │ │ │ ├── tatoeba_nno_eng/ │ │ │ │ ├── tatoeba_nno_eng_p0.yaml │ │ │ │ ├── tatoeba_nno_eng_p1.yaml │ │ │ │ ├── tatoeba_nno_eng_p2.yaml │ │ │ │ └── tatoeba_nno_eng_p3.yaml │ │ │ └── tatoeba_nob_eng/ │ │ │ ├── tatoeba_nob_eng_p0.yaml │ │ │ ├── tatoeba_nob_eng_p1.yaml │ │ │ ├── tatoeba_nob_eng_p2.yaml │ │ │ └── tatoeba_nob_eng_p3.yaml │ │ ├── noticia/ │ │ │ ├── README.md │ │ │ ├── noticia.yaml │ │ │ └── utils.py │ │ ├── nq_open/ │ │ │ ├── README.md │ │ │ └── nq_open.yaml │ │ ├── okapi/ │ │ │ ├── arc_multilingual/ │ │ │ │ ├── README.md │ │ │ │ ├── _arc_yaml │ │ │ │ ├── arc_ar.yaml │ │ │ │ ├── arc_bn.yaml │ │ │ │ ├── arc_ca.yaml │ │ │ │ ├── arc_da.yaml │ │ │ │ ├── arc_de.yaml │ │ │ │ ├── arc_es.yaml │ │ │ │ ├── arc_eu.yaml │ │ │ │ ├── arc_fr.yaml │ │ │ │ ├── arc_gu.yaml │ │ │ │ ├── arc_hi.yaml │ │ │ │ ├── arc_hr.yaml │ │ │ │ ├── arc_hu.yaml │ │ │ │ 
├── arc_hy.yaml │ │ │ │ ├── arc_id.yaml │ │ │ │ ├── arc_it.yaml │ │ │ │ ├── arc_kn.yaml │ │ │ │ ├── arc_ml.yaml │ │ │ │ ├── arc_mr.yaml │ │ │ │ ├── arc_ne.yaml │ │ │ │ ├── arc_nl.yaml │ │ │ │ ├── arc_pt.yaml │ │ │ │ ├── arc_ro.yaml │ │ │ │ ├── arc_ru.yaml │ │ │ │ ├── arc_sk.yaml │ │ │ │ ├── arc_sr.yaml │ │ │ │ ├── arc_sv.yaml │ │ │ │ ├── arc_ta.yaml │ │ │ │ ├── arc_te.yaml │ │ │ │ ├── arc_uk.yaml │ │ │ │ ├── arc_vi.yaml │ │ │ │ ├── arc_zh.yaml │ │ │ │ └── utils.py │ │ │ ├── hellaswag_multilingual/ │ │ │ │ ├── README.md │ │ │ │ ├── _hellaswag_yaml │ │ │ │ ├── hellaswag_ar.yaml │ │ │ │ ├── hellaswag_bn.yaml │ │ │ │ ├── hellaswag_ca.yaml │ │ │ │ ├── hellaswag_da.yaml │ │ │ │ ├── hellaswag_de.yaml │ │ │ │ ├── hellaswag_es.yaml │ │ │ │ ├── hellaswag_eu.yaml │ │ │ │ ├── hellaswag_fr.yaml │ │ │ │ ├── hellaswag_gu.yaml │ │ │ │ ├── hellaswag_hi.yaml │ │ │ │ ├── hellaswag_hr.yaml │ │ │ │ ├── hellaswag_hu.yaml │ │ │ │ ├── hellaswag_hy.yaml │ │ │ │ ├── hellaswag_id.yaml │ │ │ │ ├── hellaswag_it.yaml │ │ │ │ ├── hellaswag_kn.yaml │ │ │ │ ├── hellaswag_ml.yaml │ │ │ │ ├── hellaswag_mr.yaml │ │ │ │ ├── hellaswag_ne.yaml │ │ │ │ ├── hellaswag_nl.yaml │ │ │ │ ├── hellaswag_pt.yaml │ │ │ │ ├── hellaswag_ro.yaml │ │ │ │ ├── hellaswag_ru.yaml │ │ │ │ ├── hellaswag_sk.yaml │ │ │ │ ├── hellaswag_sr.yaml │ │ │ │ ├── hellaswag_sv.yaml │ │ │ │ ├── hellaswag_ta.yaml │ │ │ │ ├── hellaswag_te.yaml │ │ │ │ ├── hellaswag_uk.yaml │ │ │ │ ├── hellaswag_vi.yaml │ │ │ │ └── utils.py │ │ │ ├── mmlu_multilingual/ │ │ │ │ ├── _default_yaml │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── m_mmlu_ar.yaml │ │ │ │ ├── m_mmlu_bn.yaml │ │ │ │ ├── m_mmlu_ca.yaml │ │ │ │ ├── m_mmlu_da.yaml │ │ │ │ ├── m_mmlu_de.yaml │ │ │ │ ├── m_mmlu_en.yaml │ │ │ │ ├── m_mmlu_es.yaml │ │ │ │ ├── m_mmlu_eu.yaml │ │ │ │ ├── m_mmlu_fr.yaml │ │ │ │ ├── m_mmlu_gu.yaml │ │ │ │ ├── m_mmlu_hi.yaml │ │ │ │ ├── m_mmlu_hr.yaml │ │ │ │ ├── m_mmlu_hu.yaml │ │ │ │ ├── m_mmlu_hy.yaml │ │ │ │ ├── m_mmlu_id.yaml │ │ │ │ ├── m_mmlu_is.yaml │ │ │ 
│ ├── m_mmlu_it.yaml │ │ │ │ ├── m_mmlu_kn.yaml │ │ │ │ ├── m_mmlu_ml.yaml │ │ │ │ ├── m_mmlu_mr.yaml │ │ │ │ ├── m_mmlu_nb.yaml │ │ │ │ ├── m_mmlu_ne.yaml │ │ │ │ ├── m_mmlu_nl.yaml │ │ │ │ ├── m_mmlu_pt.yaml │ │ │ │ ├── m_mmlu_ro.yaml │ │ │ │ ├── m_mmlu_ru.yaml │ │ │ │ ├── m_mmlu_sk.yaml │ │ │ │ ├── m_mmlu_sr.yaml │ │ │ │ ├── m_mmlu_sv.yaml │ │ │ │ ├── m_mmlu_ta.yaml │ │ │ │ ├── m_mmlu_te.yaml │ │ │ │ ├── m_mmlu_uk.yaml │ │ │ │ ├── m_mmlu_vi.yaml │ │ │ │ └── m_mmlu_zh.yaml │ │ │ └── truthfulqa_multilingual/ │ │ │ ├── README.md │ │ │ ├── _truthfulqa_mc1_yaml │ │ │ ├── _truthfulqa_mc2_yaml │ │ │ ├── truthfulqa_ar_mc1.yaml │ │ │ ├── truthfulqa_ar_mc2.yaml │ │ │ ├── truthfulqa_bn_mc1.yaml │ │ │ ├── truthfulqa_bn_mc2.yaml │ │ │ ├── truthfulqa_ca_mc1.yaml │ │ │ ├── truthfulqa_ca_mc2.yaml │ │ │ ├── truthfulqa_da_mc1.yaml │ │ │ ├── truthfulqa_da_mc2.yaml │ │ │ ├── truthfulqa_de_mc1.yaml │ │ │ ├── truthfulqa_de_mc2.yaml │ │ │ ├── truthfulqa_es_mc1.yaml │ │ │ ├── truthfulqa_es_mc2.yaml │ │ │ ├── truthfulqa_eu_mc1.yaml │ │ │ ├── truthfulqa_eu_mc2.yaml │ │ │ ├── truthfulqa_fr_mc1.yaml │ │ │ ├── truthfulqa_fr_mc2.yaml │ │ │ ├── truthfulqa_gu_mc1.yaml │ │ │ ├── truthfulqa_gu_mc2.yaml │ │ │ ├── truthfulqa_hi_mc1.yaml │ │ │ ├── truthfulqa_hi_mc2.yaml │ │ │ ├── truthfulqa_hr_mc1.yaml │ │ │ ├── truthfulqa_hr_mc2.yaml │ │ │ ├── truthfulqa_hu_mc1.yaml │ │ │ ├── truthfulqa_hu_mc2.yaml │ │ │ ├── truthfulqa_hy_mc1.yaml │ │ │ ├── truthfulqa_hy_mc2.yaml │ │ │ ├── truthfulqa_id_mc1.yaml │ │ │ ├── truthfulqa_id_mc2.yaml │ │ │ ├── truthfulqa_it_mc1.yaml │ │ │ ├── truthfulqa_it_mc2.yaml │ │ │ ├── truthfulqa_kn_mc1.yaml │ │ │ ├── truthfulqa_kn_mc2.yaml │ │ │ ├── truthfulqa_ml_mc1.yaml │ │ │ ├── truthfulqa_ml_mc2.yaml │ │ │ ├── truthfulqa_mr_mc1.yaml │ │ │ ├── truthfulqa_mr_mc2.yaml │ │ │ ├── truthfulqa_ne_mc1.yaml │ │ │ ├── truthfulqa_ne_mc2.yaml │ │ │ ├── truthfulqa_nl_mc1.yaml │ │ │ ├── truthfulqa_nl_mc2.yaml │ │ │ ├── truthfulqa_pt_mc1.yaml │ │ │ ├── truthfulqa_pt_mc2.yaml │ │ │ ├── 
truthfulqa_ro_mc1.yaml │ │ │ ├── truthfulqa_ro_mc2.yaml │ │ │ ├── truthfulqa_ru_mc1.yaml │ │ │ ├── truthfulqa_ru_mc2.yaml │ │ │ ├── truthfulqa_sk_mc1.yaml │ │ │ ├── truthfulqa_sk_mc2.yaml │ │ │ ├── truthfulqa_sr_mc1.yaml │ │ │ ├── truthfulqa_sr_mc2.yaml │ │ │ ├── truthfulqa_sv_mc1.yaml │ │ │ ├── truthfulqa_sv_mc2.yaml │ │ │ ├── truthfulqa_ta_mc1.yaml │ │ │ ├── truthfulqa_ta_mc2.yaml │ │ │ ├── truthfulqa_te_mc1.yaml │ │ │ ├── truthfulqa_te_mc2.yaml │ │ │ ├── truthfulqa_uk_mc1.yaml │ │ │ ├── truthfulqa_uk_mc2.yaml │ │ │ ├── truthfulqa_vi_mc1.yaml │ │ │ ├── truthfulqa_vi_mc2.yaml │ │ │ ├── truthfulqa_zh_mc1.yaml │ │ │ ├── truthfulqa_zh_mc2.yaml │ │ │ └── utils.py │ │ ├── olaph/ │ │ │ ├── README.md │ │ │ ├── olaph.yaml │ │ │ ├── olaph_perplexity.yaml │ │ │ ├── utils.py │ │ │ └── utils_perplexity.py │ │ ├── openai-mmmlu/ │ │ │ ├── README.md │ │ │ ├── _generate_configs.py │ │ │ ├── default/ │ │ │ │ ├── _default_template_yaml │ │ │ │ ├── _mmmlu.yaml │ │ │ │ ├── _mmmlu_ar_xy.yaml │ │ │ │ ├── _mmmlu_ar_xy_humanities.yaml │ │ │ │ ├── _mmmlu_ar_xy_other.yaml │ │ │ │ ├── _mmmlu_ar_xy_social_sciences.yaml │ │ │ │ ├── _mmmlu_ar_xy_stem.yaml │ │ │ │ ├── _mmmlu_bn_bd.yaml │ │ │ │ ├── _mmmlu_bn_bd_humanities.yaml │ │ │ │ ├── _mmmlu_bn_bd_other.yaml │ │ │ │ ├── _mmmlu_bn_bd_social_sciences.yaml │ │ │ │ ├── _mmmlu_bn_bd_stem.yaml │ │ │ │ ├── _mmmlu_de_de.yaml │ │ │ │ ├── _mmmlu_de_de_humanities.yaml │ │ │ │ ├── _mmmlu_de_de_other.yaml │ │ │ │ ├── _mmmlu_de_de_social_sciences.yaml │ │ │ │ ├── _mmmlu_de_de_stem.yaml │ │ │ │ ├── _mmmlu_es_la.yaml │ │ │ │ ├── _mmmlu_es_la_humanities.yaml │ │ │ │ ├── _mmmlu_es_la_other.yaml │ │ │ │ ├── _mmmlu_es_la_social_sciences.yaml │ │ │ │ ├── _mmmlu_es_la_stem.yaml │ │ │ │ ├── _mmmlu_fr_fr.yaml │ │ │ │ ├── _mmmlu_fr_fr_humanities.yaml │ │ │ │ ├── _mmmlu_fr_fr_other.yaml │ │ │ │ ├── _mmmlu_fr_fr_social_sciences.yaml │ │ │ │ ├── _mmmlu_fr_fr_stem.yaml │ │ │ │ ├── _mmmlu_hi_in.yaml │ │ │ │ ├── _mmmlu_hi_in_humanities.yaml │ │ │ │ ├── 
_mmmlu_hi_in_other.yaml │ │ │ │ ├── _mmmlu_hi_in_social_sciences.yaml │ │ │ │ ├── _mmmlu_hi_in_stem.yaml │ │ │ │ ├── _mmmlu_id_id.yaml │ │ │ │ ├── _mmmlu_id_id_humanities.yaml │ │ │ │ ├── _mmmlu_id_id_other.yaml │ │ │ │ ├── _mmmlu_id_id_social_sciences.yaml │ │ │ │ ├── _mmmlu_id_id_stem.yaml │ │ │ │ ├── _mmmlu_it_it.yaml │ │ │ │ ├── _mmmlu_it_it_humanities.yaml │ │ │ │ ├── _mmmlu_it_it_other.yaml │ │ │ │ ├── _mmmlu_it_it_social_sciences.yaml │ │ │ │ ├── _mmmlu_it_it_stem.yaml │ │ │ │ ├── _mmmlu_ja_jp.yaml │ │ │ │ ├── _mmmlu_ja_jp_humanities.yaml │ │ │ │ ├── _mmmlu_ja_jp_other.yaml │ │ │ │ ├── _mmmlu_ja_jp_social_sciences.yaml │ │ │ │ ├── _mmmlu_ja_jp_stem.yaml │ │ │ │ ├── _mmmlu_ko_kr.yaml │ │ │ │ ├── _mmmlu_ko_kr_humanities.yaml │ │ │ │ ├── _mmmlu_ko_kr_other.yaml │ │ │ │ ├── _mmmlu_ko_kr_social_sciences.yaml │ │ │ │ ├── _mmmlu_ko_kr_stem.yaml │ │ │ │ ├── _mmmlu_pt_br.yaml │ │ │ │ ├── _mmmlu_pt_br_humanities.yaml │ │ │ │ ├── _mmmlu_pt_br_other.yaml │ │ │ │ ├── _mmmlu_pt_br_social_sciences.yaml │ │ │ │ ├── _mmmlu_pt_br_stem.yaml │ │ │ │ ├── _mmmlu_sw_ke.yaml │ │ │ │ ├── _mmmlu_sw_ke_humanities.yaml │ │ │ │ ├── _mmmlu_sw_ke_other.yaml │ │ │ │ ├── _mmmlu_sw_ke_social_sciences.yaml │ │ │ │ ├── _mmmlu_sw_ke_stem.yaml │ │ │ │ ├── _mmmlu_yo_ng.yaml │ │ │ │ ├── _mmmlu_yo_ng_humanities.yaml │ │ │ │ ├── _mmmlu_yo_ng_other.yaml │ │ │ │ ├── _mmmlu_yo_ng_social_sciences.yaml │ │ │ │ ├── _mmmlu_yo_ng_stem.yaml │ │ │ │ ├── _mmmlu_zh_cn.yaml │ │ │ │ ├── _mmmlu_zh_cn_humanities.yaml │ │ │ │ ├── _mmmlu_zh_cn_other.yaml │ │ │ │ ├── _mmmlu_zh_cn_social_sciences.yaml │ │ │ │ ├── _mmmlu_zh_cn_stem.yaml │ │ │ │ ├── mmmlu_ar_xy_abstract_algebra.yaml │ │ │ │ ├── mmmlu_ar_xy_anatomy.yaml │ │ │ │ ├── mmmlu_ar_xy_astronomy.yaml │ │ │ │ ├── mmmlu_ar_xy_business_ethics.yaml │ │ │ │ ├── mmmlu_ar_xy_clinical_knowledge.yaml │ │ │ │ ├── mmmlu_ar_xy_college_biology.yaml │ │ │ │ ├── mmmlu_ar_xy_college_chemistry.yaml │ │ │ │ ├── mmmlu_ar_xy_college_computer_science.yaml │ │ │ │ ├── 
mmmlu_ar_xy_college_mathematics.yaml │ │ │ │ ├── mmmlu_ar_xy_college_medicine.yaml │ │ │ │ ├── mmmlu_ar_xy_college_physics.yaml │ │ │ │ ├── mmmlu_ar_xy_computer_security.yaml │ │ │ │ ├── mmmlu_ar_xy_conceptual_physics.yaml │ │ │ │ ├── mmmlu_ar_xy_econometrics.yaml │ │ │ │ ├── mmmlu_ar_xy_electrical_engineering.yaml │ │ │ │ ├── mmmlu_ar_xy_elementary_mathematics.yaml │ │ │ │ ├── mmmlu_ar_xy_formal_logic.yaml │ │ │ │ ├── mmmlu_ar_xy_global_facts.yaml │ │ │ │ ├── mmmlu_ar_xy_high_school_biology.yaml │ │ │ │ ├── mmmlu_ar_xy_high_school_chemistry.yaml │ │ │ │ ├── mmmlu_ar_xy_high_school_computer_science.yaml │ │ │ │ ├── mmmlu_ar_xy_high_school_european_history.yaml │ │ │ │ ├── mmmlu_ar_xy_high_school_geography.yaml │ │ │ │ ├── mmmlu_ar_xy_high_school_government_and_politics.yaml │ │ │ │ ├── mmmlu_ar_xy_high_school_macroeconomics.yaml │ │ │ │ ├── mmmlu_ar_xy_high_school_mathematics.yaml │ │ │ │ ├── mmmlu_ar_xy_high_school_microeconomics.yaml │ │ │ │ ├── mmmlu_ar_xy_high_school_physics.yaml │ │ │ │ ├── mmmlu_ar_xy_high_school_psychology.yaml │ │ │ │ ├── mmmlu_ar_xy_high_school_statistics.yaml │ │ │ │ ├── mmmlu_ar_xy_high_school_us_history.yaml │ │ │ │ ├── mmmlu_ar_xy_high_school_world_history.yaml │ │ │ │ ├── mmmlu_ar_xy_human_aging.yaml │ │ │ │ ├── mmmlu_ar_xy_human_sexuality.yaml │ │ │ │ ├── mmmlu_ar_xy_international_law.yaml │ │ │ │ ├── mmmlu_ar_xy_jurisprudence.yaml │ │ │ │ ├── mmmlu_ar_xy_logical_fallacies.yaml │ │ │ │ ├── mmmlu_ar_xy_machine_learning.yaml │ │ │ │ ├── mmmlu_ar_xy_management.yaml │ │ │ │ ├── mmmlu_ar_xy_marketing.yaml │ │ │ │ ├── mmmlu_ar_xy_medical_genetics.yaml │ │ │ │ ├── mmmlu_ar_xy_miscellaneous.yaml │ │ │ │ ├── mmmlu_ar_xy_moral_disputes.yaml │ │ │ │ ├── mmmlu_ar_xy_moral_scenarios.yaml │ │ │ │ ├── mmmlu_ar_xy_nutrition.yaml │ │ │ │ ├── mmmlu_ar_xy_philosophy.yaml │ │ │ │ ├── mmmlu_ar_xy_prehistory.yaml │ │ │ │ ├── mmmlu_ar_xy_professional_accounting.yaml │ │ │ │ ├── mmmlu_ar_xy_professional_law.yaml │ │ │ │ ├── 
mmmlu_ar_xy_professional_medicine.yaml │ │ │ │ ├── mmmlu_ar_xy_professional_psychology.yaml │ │ │ │ ├── mmmlu_ar_xy_public_relations.yaml │ │ │ │ ├── mmmlu_ar_xy_security_studies.yaml │ │ │ │ ├── mmmlu_ar_xy_sociology.yaml │ │ │ │ ├── mmmlu_ar_xy_us_foreign_policy.yaml │ │ │ │ ├── mmmlu_ar_xy_virology.yaml │ │ │ │ ├── mmmlu_ar_xy_world_religions.yaml │ │ │ │ ├── mmmlu_bn_bd_abstract_algebra.yaml │ │ │ │ ├── mmmlu_bn_bd_anatomy.yaml │ │ │ │ ├── mmmlu_bn_bd_astronomy.yaml │ │ │ │ ├── mmmlu_bn_bd_business_ethics.yaml │ │ │ │ ├── mmmlu_bn_bd_clinical_knowledge.yaml │ │ │ │ ├── mmmlu_bn_bd_college_biology.yaml │ │ │ │ ├── mmmlu_bn_bd_college_chemistry.yaml │ │ │ │ ├── mmmlu_bn_bd_college_computer_science.yaml │ │ │ │ ├── mmmlu_bn_bd_college_mathematics.yaml │ │ │ │ ├── mmmlu_bn_bd_college_medicine.yaml │ │ │ │ ├── mmmlu_bn_bd_college_physics.yaml │ │ │ │ ├── mmmlu_bn_bd_computer_security.yaml │ │ │ │ ├── mmmlu_bn_bd_conceptual_physics.yaml │ │ │ │ ├── mmmlu_bn_bd_econometrics.yaml │ │ │ │ ├── mmmlu_bn_bd_electrical_engineering.yaml │ │ │ │ ├── mmmlu_bn_bd_elementary_mathematics.yaml │ │ │ │ ├── mmmlu_bn_bd_formal_logic.yaml │ │ │ │ ├── mmmlu_bn_bd_global_facts.yaml │ │ │ │ ├── mmmlu_bn_bd_high_school_biology.yaml │ │ │ │ ├── mmmlu_bn_bd_high_school_chemistry.yaml │ │ │ │ ├── mmmlu_bn_bd_high_school_computer_science.yaml │ │ │ │ ├── mmmlu_bn_bd_high_school_european_history.yaml │ │ │ │ ├── mmmlu_bn_bd_high_school_geography.yaml │ │ │ │ ├── mmmlu_bn_bd_high_school_government_and_politics.yaml │ │ │ │ ├── mmmlu_bn_bd_high_school_macroeconomics.yaml │ │ │ │ ├── mmmlu_bn_bd_high_school_mathematics.yaml │ │ │ │ ├── mmmlu_bn_bd_high_school_microeconomics.yaml │ │ │ │ ├── mmmlu_bn_bd_high_school_physics.yaml │ │ │ │ ├── mmmlu_bn_bd_high_school_psychology.yaml │ │ │ │ ├── mmmlu_bn_bd_high_school_statistics.yaml │ │ │ │ ├── mmmlu_bn_bd_high_school_us_history.yaml │ │ │ │ ├── mmmlu_bn_bd_high_school_world_history.yaml │ │ │ │ ├── mmmlu_bn_bd_human_aging.yaml │ │ │ │ ├── 
mmmlu_bn_bd_human_sexuality.yaml │ │ │ │ ├── mmmlu_bn_bd_international_law.yaml │ │ │ │ ├── mmmlu_bn_bd_jurisprudence.yaml │ │ │ │ ├── mmmlu_bn_bd_logical_fallacies.yaml │ │ │ │ ├── mmmlu_bn_bd_machine_learning.yaml │ │ │ │ ├── mmmlu_bn_bd_management.yaml │ │ │ │ ├── mmmlu_bn_bd_marketing.yaml │ │ │ │ ├── mmmlu_bn_bd_medical_genetics.yaml │ │ │ │ ├── mmmlu_bn_bd_miscellaneous.yaml │ │ │ │ ├── mmmlu_bn_bd_moral_disputes.yaml │ │ │ │ ├── mmmlu_bn_bd_moral_scenarios.yaml │ │ │ │ ├── mmmlu_bn_bd_nutrition.yaml │ │ │ │ ├── mmmlu_bn_bd_philosophy.yaml │ │ │ │ ├── mmmlu_bn_bd_prehistory.yaml │ │ │ │ ├── mmmlu_bn_bd_professional_accounting.yaml │ │ │ │ ├── mmmlu_bn_bd_professional_law.yaml │ │ │ │ ├── mmmlu_bn_bd_professional_medicine.yaml │ │ │ │ ├── mmmlu_bn_bd_professional_psychology.yaml │ │ │ │ ├── mmmlu_bn_bd_public_relations.yaml │ │ │ │ ├── mmmlu_bn_bd_security_studies.yaml │ │ │ │ ├── mmmlu_bn_bd_sociology.yaml │ │ │ │ ├── mmmlu_bn_bd_us_foreign_policy.yaml │ │ │ │ ├── mmmlu_bn_bd_virology.yaml │ │ │ │ ├── mmmlu_bn_bd_world_religions.yaml │ │ │ │ ├── mmmlu_de_de_abstract_algebra.yaml │ │ │ │ ├── mmmlu_de_de_anatomy.yaml │ │ │ │ ├── mmmlu_de_de_astronomy.yaml │ │ │ │ ├── mmmlu_de_de_business_ethics.yaml │ │ │ │ ├── mmmlu_de_de_clinical_knowledge.yaml │ │ │ │ ├── mmmlu_de_de_college_biology.yaml │ │ │ │ ├── mmmlu_de_de_college_chemistry.yaml │ │ │ │ ├── mmmlu_de_de_college_computer_science.yaml │ │ │ │ ├── mmmlu_de_de_college_mathematics.yaml │ │ │ │ ├── mmmlu_de_de_college_medicine.yaml │ │ │ │ ├── mmmlu_de_de_college_physics.yaml │ │ │ │ ├── mmmlu_de_de_computer_security.yaml │ │ │ │ ├── mmmlu_de_de_conceptual_physics.yaml │ │ │ │ ├── mmmlu_de_de_econometrics.yaml │ │ │ │ ├── mmmlu_de_de_electrical_engineering.yaml │ │ │ │ ├── mmmlu_de_de_elementary_mathematics.yaml │ │ │ │ ├── mmmlu_de_de_formal_logic.yaml │ │ │ │ ├── mmmlu_de_de_global_facts.yaml │ │ │ │ ├── mmmlu_de_de_high_school_biology.yaml │ │ │ │ ├── mmmlu_de_de_high_school_chemistry.yaml │ │ │ │ ├── 
mmmlu_de_de_high_school_computer_science.yaml │ │ │ │ ├── mmmlu_de_de_high_school_european_history.yaml │ │ │ │ ├── mmmlu_de_de_high_school_geography.yaml │ │ │ │ ├── mmmlu_de_de_high_school_government_and_politics.yaml │ │ │ │ ├── mmmlu_de_de_high_school_macroeconomics.yaml │ │ │ │ ├── mmmlu_de_de_high_school_mathematics.yaml │ │ │ │ ├── mmmlu_de_de_high_school_microeconomics.yaml │ │ │ │ ├── mmmlu_de_de_high_school_physics.yaml │ │ │ │ ├── mmmlu_de_de_high_school_psychology.yaml │ │ │ │ ├── mmmlu_de_de_high_school_statistics.yaml │ │ │ │ ├── mmmlu_de_de_high_school_us_history.yaml │ │ │ │ ├── mmmlu_de_de_high_school_world_history.yaml │ │ │ │ ├── mmmlu_de_de_human_aging.yaml │ │ │ │ ├── mmmlu_de_de_human_sexuality.yaml │ │ │ │ ├── mmmlu_de_de_international_law.yaml │ │ │ │ ├── mmmlu_de_de_jurisprudence.yaml │ │ │ │ ├── mmmlu_de_de_logical_fallacies.yaml │ │ │ │ ├── mmmlu_de_de_machine_learning.yaml │ │ │ │ ├── mmmlu_de_de_management.yaml │ │ │ │ ├── mmmlu_de_de_marketing.yaml │ │ │ │ ├── mmmlu_de_de_medical_genetics.yaml │ │ │ │ ├── mmmlu_de_de_miscellaneous.yaml │ │ │ │ ├── mmmlu_de_de_moral_disputes.yaml │ │ │ │ ├── mmmlu_de_de_moral_scenarios.yaml │ │ │ │ ├── mmmlu_de_de_nutrition.yaml │ │ │ │ ├── mmmlu_de_de_philosophy.yaml │ │ │ │ ├── mmmlu_de_de_prehistory.yaml │ │ │ │ ├── mmmlu_de_de_professional_accounting.yaml │ │ │ │ ├── mmmlu_de_de_professional_law.yaml │ │ │ │ ├── mmmlu_de_de_professional_medicine.yaml │ │ │ │ ├── mmmlu_de_de_professional_psychology.yaml │ │ │ │ ├── mmmlu_de_de_public_relations.yaml │ │ │ │ ├── mmmlu_de_de_security_studies.yaml │ │ │ │ ├── mmmlu_de_de_sociology.yaml │ │ │ │ ├── mmmlu_de_de_us_foreign_policy.yaml │ │ │ │ ├── mmmlu_de_de_virology.yaml │ │ │ │ ├── mmmlu_de_de_world_religions.yaml │ │ │ │ ├── mmmlu_es_la_abstract_algebra.yaml │ │ │ │ ├── mmmlu_es_la_anatomy.yaml │ │ │ │ ├── mmmlu_es_la_astronomy.yaml │ │ │ │ ├── mmmlu_es_la_business_ethics.yaml │ │ │ │ ├── mmmlu_es_la_clinical_knowledge.yaml │ │ │ │ ├── 
mmmlu_es_la_college_biology.yaml │ │ │ │ ├── mmmlu_es_la_college_chemistry.yaml │ │ │ │ ├── mmmlu_es_la_college_computer_science.yaml │ │ │ │ ├── mmmlu_es_la_college_mathematics.yaml │ │ │ │ ├── mmmlu_es_la_college_medicine.yaml │ │ │ │ ├── mmmlu_es_la_college_physics.yaml │ │ │ │ ├── mmmlu_es_la_computer_security.yaml │ │ │ │ ├── mmmlu_es_la_conceptual_physics.yaml │ │ │ │ ├── mmmlu_es_la_econometrics.yaml │ │ │ │ ├── mmmlu_es_la_electrical_engineering.yaml │ │ │ │ ├── mmmlu_es_la_elementary_mathematics.yaml │ │ │ │ ├── mmmlu_es_la_formal_logic.yaml │ │ │ │ ├── mmmlu_es_la_global_facts.yaml │ │ │ │ ├── mmmlu_es_la_high_school_biology.yaml │ │ │ │ ├── mmmlu_es_la_high_school_chemistry.yaml │ │ │ │ ├── mmmlu_es_la_high_school_computer_science.yaml │ │ │ │ ├── mmmlu_es_la_high_school_european_history.yaml │ │ │ │ ├── mmmlu_es_la_high_school_geography.yaml │ │ │ │ ├── mmmlu_es_la_high_school_government_and_politics.yaml │ │ │ │ ├── mmmlu_es_la_high_school_macroeconomics.yaml │ │ │ │ ├── mmmlu_es_la_high_school_mathematics.yaml │ │ │ │ ├── mmmlu_es_la_high_school_microeconomics.yaml │ │ │ │ ├── mmmlu_es_la_high_school_physics.yaml │ │ │ │ ├── mmmlu_es_la_high_school_psychology.yaml │ │ │ │ ├── mmmlu_es_la_high_school_statistics.yaml │ │ │ │ ├── mmmlu_es_la_high_school_us_history.yaml │ │ │ │ ├── mmmlu_es_la_high_school_world_history.yaml │ │ │ │ ├── mmmlu_es_la_human_aging.yaml │ │ │ │ ├── mmmlu_es_la_human_sexuality.yaml │ │ │ │ ├── mmmlu_es_la_international_law.yaml │ │ │ │ ├── mmmlu_es_la_jurisprudence.yaml │ │ │ │ ├── mmmlu_es_la_logical_fallacies.yaml │ │ │ │ ├── mmmlu_es_la_machine_learning.yaml │ │ │ │ ├── mmmlu_es_la_management.yaml │ │ │ │ ├── mmmlu_es_la_marketing.yaml │ │ │ │ ├── mmmlu_es_la_medical_genetics.yaml │ │ │ │ ├── mmmlu_es_la_miscellaneous.yaml │ │ │ │ ├── mmmlu_es_la_moral_disputes.yaml │ │ │ │ ├── mmmlu_es_la_moral_scenarios.yaml │ │ │ │ ├── mmmlu_es_la_nutrition.yaml │ │ │ │ ├── mmmlu_es_la_philosophy.yaml │ │ │ │ ├── 
mmmlu_es_la_prehistory.yaml │ │ │ │ ├── mmmlu_es_la_professional_accounting.yaml │ │ │ │ ├── mmmlu_es_la_professional_law.yaml │ │ │ │ ├── mmmlu_es_la_professional_medicine.yaml │ │ │ │ ├── mmmlu_es_la_professional_psychology.yaml │ │ │ │ ├── mmmlu_es_la_public_relations.yaml │ │ │ │ ├── mmmlu_es_la_security_studies.yaml │ │ │ │ ├── mmmlu_es_la_sociology.yaml │ │ │ │ ├── mmmlu_es_la_us_foreign_policy.yaml │ │ │ │ ├── mmmlu_es_la_virology.yaml │ │ │ │ ├── mmmlu_es_la_world_religions.yaml │ │ │ │ ├── mmmlu_fr_fr_abstract_algebra.yaml │ │ │ │ ├── mmmlu_fr_fr_anatomy.yaml │ │ │ │ ├── mmmlu_fr_fr_astronomy.yaml │ │ │ │ ├── mmmlu_fr_fr_business_ethics.yaml │ │ │ │ ├── mmmlu_fr_fr_clinical_knowledge.yaml │ │ │ │ ├── mmmlu_fr_fr_college_biology.yaml │ │ │ │ ├── mmmlu_fr_fr_college_chemistry.yaml │ │ │ │ ├── mmmlu_fr_fr_college_computer_science.yaml │ │ │ │ ├── mmmlu_fr_fr_college_mathematics.yaml │ │ │ │ ├── mmmlu_fr_fr_college_medicine.yaml │ │ │ │ ├── mmmlu_fr_fr_college_physics.yaml │ │ │ │ ├── mmmlu_fr_fr_computer_security.yaml │ │ │ │ ├── mmmlu_fr_fr_conceptual_physics.yaml │ │ │ │ ├── mmmlu_fr_fr_econometrics.yaml │ │ │ │ ├── mmmlu_fr_fr_electrical_engineering.yaml │ │ │ │ ├── mmmlu_fr_fr_elementary_mathematics.yaml │ │ │ │ ├── mmmlu_fr_fr_formal_logic.yaml │ │ │ │ ├── mmmlu_fr_fr_global_facts.yaml │ │ │ │ ├── mmmlu_fr_fr_high_school_biology.yaml │ │ │ │ ├── mmmlu_fr_fr_high_school_chemistry.yaml │ │ │ │ ├── mmmlu_fr_fr_high_school_computer_science.yaml │ │ │ │ ├── mmmlu_fr_fr_high_school_european_history.yaml │ │ │ │ ├── mmmlu_fr_fr_high_school_geography.yaml │ │ │ │ ├── mmmlu_fr_fr_high_school_government_and_politics.yaml │ │ │ │ ├── mmmlu_fr_fr_high_school_macroeconomics.yaml │ │ │ │ ├── mmmlu_fr_fr_high_school_mathematics.yaml │ │ │ │ ├── mmmlu_fr_fr_high_school_microeconomics.yaml │ │ │ │ ├── mmmlu_fr_fr_high_school_physics.yaml │ │ │ │ ├── mmmlu_fr_fr_high_school_psychology.yaml │ │ │ │ ├── mmmlu_fr_fr_high_school_statistics.yaml │ │ │ │ ├── 
mmmlu_fr_fr_high_school_us_history.yaml │ │ │ │ ├── mmmlu_fr_fr_high_school_world_history.yaml │ │ │ │ ├── mmmlu_fr_fr_human_aging.yaml │ │ │ │ ├── mmmlu_fr_fr_human_sexuality.yaml │ │ │ │ ├── mmmlu_fr_fr_international_law.yaml │ │ │ │ ├── mmmlu_fr_fr_jurisprudence.yaml │ │ │ │ ├── mmmlu_fr_fr_logical_fallacies.yaml │ │ │ │ ├── mmmlu_fr_fr_machine_learning.yaml │ │ │ │ ├── mmmlu_fr_fr_management.yaml │ │ │ │ ├── mmmlu_fr_fr_marketing.yaml │ │ │ │ ├── mmmlu_fr_fr_medical_genetics.yaml │ │ │ │ ├── mmmlu_fr_fr_miscellaneous.yaml │ │ │ │ ├── mmmlu_fr_fr_moral_disputes.yaml │ │ │ │ ├── mmmlu_fr_fr_moral_scenarios.yaml │ │ │ │ ├── mmmlu_fr_fr_nutrition.yaml │ │ │ │ ├── mmmlu_fr_fr_philosophy.yaml │ │ │ │ ├── mmmlu_fr_fr_prehistory.yaml │ │ │ │ ├── mmmlu_fr_fr_professional_accounting.yaml │ │ │ │ ├── mmmlu_fr_fr_professional_law.yaml │ │ │ │ ├── mmmlu_fr_fr_professional_medicine.yaml │ │ │ │ ├── mmmlu_fr_fr_professional_psychology.yaml │ │ │ │ ├── mmmlu_fr_fr_public_relations.yaml │ │ │ │ ├── mmmlu_fr_fr_security_studies.yaml │ │ │ │ ├── mmmlu_fr_fr_sociology.yaml │ │ │ │ ├── mmmlu_fr_fr_us_foreign_policy.yaml │ │ │ │ ├── mmmlu_fr_fr_virology.yaml │ │ │ │ ├── mmmlu_fr_fr_world_religions.yaml │ │ │ │ ├── mmmlu_hi_in_abstract_algebra.yaml │ │ │ │ ├── mmmlu_hi_in_anatomy.yaml │ │ │ │ ├── mmmlu_hi_in_astronomy.yaml │ │ │ │ ├── mmmlu_hi_in_business_ethics.yaml │ │ │ │ ├── mmmlu_hi_in_clinical_knowledge.yaml │ │ │ │ ├── mmmlu_hi_in_college_biology.yaml │ │ │ │ ├── mmmlu_hi_in_college_chemistry.yaml │ │ │ │ ├── mmmlu_hi_in_college_computer_science.yaml │ │ │ │ ├── mmmlu_hi_in_college_mathematics.yaml │ │ │ │ ├── mmmlu_hi_in_college_medicine.yaml │ │ │ │ ├── mmmlu_hi_in_college_physics.yaml │ │ │ │ ├── mmmlu_hi_in_computer_security.yaml │ │ │ │ ├── mmmlu_hi_in_conceptual_physics.yaml │ │ │ │ ├── mmmlu_hi_in_econometrics.yaml │ │ │ │ ├── mmmlu_hi_in_electrical_engineering.yaml │ │ │ │ ├── mmmlu_hi_in_elementary_mathematics.yaml │ │ │ │ ├── mmmlu_hi_in_formal_logic.yaml │ │ │ │ ├── 
mmmlu_hi_in_global_facts.yaml │ │ │ │ ├── mmmlu_hi_in_high_school_biology.yaml │ │ │ │ ├── mmmlu_hi_in_high_school_chemistry.yaml │ │ │ │ ├── mmmlu_hi_in_high_school_computer_science.yaml │ │ │ │ ├── mmmlu_hi_in_high_school_european_history.yaml │ │ │ │ ├── mmmlu_hi_in_high_school_geography.yaml │ │ │ │ ├── mmmlu_hi_in_high_school_government_and_politics.yaml │ │ │ │ ├── mmmlu_hi_in_high_school_macroeconomics.yaml │ │ │ │ ├── mmmlu_hi_in_high_school_mathematics.yaml │ │ │ │ ├── mmmlu_hi_in_high_school_microeconomics.yaml │ │ │ │ ├── mmmlu_hi_in_high_school_physics.yaml │ │ │ │ ├── mmmlu_hi_in_high_school_psychology.yaml │ │ │ │ ├── mmmlu_hi_in_high_school_statistics.yaml │ │ │ │ ├── mmmlu_hi_in_high_school_us_history.yaml │ │ │ │ ├── mmmlu_hi_in_high_school_world_history.yaml │ │ │ │ ├── mmmlu_hi_in_human_aging.yaml │ │ │ │ ├── mmmlu_hi_in_human_sexuality.yaml │ │ │ │ ├── mmmlu_hi_in_international_law.yaml │ │ │ │ ├── mmmlu_hi_in_jurisprudence.yaml │ │ │ │ ├── mmmlu_hi_in_logical_fallacies.yaml │ │ │ │ ├── mmmlu_hi_in_machine_learning.yaml │ │ │ │ ├── mmmlu_hi_in_management.yaml │ │ │ │ ├── mmmlu_hi_in_marketing.yaml │ │ │ │ ├── mmmlu_hi_in_medical_genetics.yaml │ │ │ │ ├── mmmlu_hi_in_miscellaneous.yaml │ │ │ │ ├── mmmlu_hi_in_moral_disputes.yaml │ │ │ │ ├── mmmlu_hi_in_moral_scenarios.yaml │ │ │ │ ├── mmmlu_hi_in_nutrition.yaml │ │ │ │ ├── mmmlu_hi_in_philosophy.yaml │ │ │ │ ├── mmmlu_hi_in_prehistory.yaml │ │ │ │ ├── mmmlu_hi_in_professional_accounting.yaml │ │ │ │ ├── mmmlu_hi_in_professional_law.yaml │ │ │ │ ├── mmmlu_hi_in_professional_medicine.yaml │ │ │ │ ├── mmmlu_hi_in_professional_psychology.yaml │ │ │ │ ├── mmmlu_hi_in_public_relations.yaml │ │ │ │ ├── mmmlu_hi_in_security_studies.yaml │ │ │ │ ├── mmmlu_hi_in_sociology.yaml │ │ │ │ ├── mmmlu_hi_in_us_foreign_policy.yaml │ │ │ │ ├── mmmlu_hi_in_virology.yaml │ │ │ │ ├── mmmlu_hi_in_world_religions.yaml │ │ │ │ ├── mmmlu_id_id_abstract_algebra.yaml │ │ │ │ ├── mmmlu_id_id_anatomy.yaml │ │ │ │ ├── 
mmmlu_id_id_astronomy.yaml │ │ │ │ ├── mmmlu_id_id_business_ethics.yaml │ │ │ │ ├── mmmlu_id_id_clinical_knowledge.yaml │ │ │ │ ├── mmmlu_id_id_college_biology.yaml │ │ │ │ ├── mmmlu_id_id_college_chemistry.yaml │ │ │ │ ├── mmmlu_id_id_college_computer_science.yaml │ │ │ │ ├── mmmlu_id_id_college_mathematics.yaml │ │ │ │ ├── mmmlu_id_id_college_medicine.yaml │ │ │ │ ├── mmmlu_id_id_college_physics.yaml │ │ │ │ ├── mmmlu_id_id_computer_security.yaml │ │ │ │ ├── mmmlu_id_id_conceptual_physics.yaml │ │ │ │ ├── mmmlu_id_id_econometrics.yaml │ │ │ │ ├── mmmlu_id_id_electrical_engineering.yaml │ │ │ │ ├── mmmlu_id_id_elementary_mathematics.yaml │ │ │ │ ├── mmmlu_id_id_formal_logic.yaml │ │ │ │ ├── mmmlu_id_id_global_facts.yaml │ │ │ │ ├── mmmlu_id_id_high_school_biology.yaml │ │ │ │ ├── mmmlu_id_id_high_school_chemistry.yaml │ │ │ │ ├── mmmlu_id_id_high_school_computer_science.yaml │ │ │ │ ├── mmmlu_id_id_high_school_european_history.yaml │ │ │ │ ├── mmmlu_id_id_high_school_geography.yaml │ │ │ │ ├── mmmlu_id_id_high_school_government_and_politics.yaml │ │ │ │ ├── mmmlu_id_id_high_school_macroeconomics.yaml │ │ │ │ ├── mmmlu_id_id_high_school_mathematics.yaml │ │ │ │ ├── mmmlu_id_id_high_school_microeconomics.yaml │ │ │ │ ├── mmmlu_id_id_high_school_physics.yaml │ │ │ │ ├── mmmlu_id_id_high_school_psychology.yaml │ │ │ │ ├── mmmlu_id_id_high_school_statistics.yaml │ │ │ │ ├── mmmlu_id_id_high_school_us_history.yaml │ │ │ │ ├── mmmlu_id_id_high_school_world_history.yaml │ │ │ │ ├── mmmlu_id_id_human_aging.yaml │ │ │ │ ├── mmmlu_id_id_human_sexuality.yaml │ │ │ │ ├── mmmlu_id_id_international_law.yaml │ │ │ │ ├── mmmlu_id_id_jurisprudence.yaml │ │ │ │ ├── mmmlu_id_id_logical_fallacies.yaml │ │ │ │ ├── mmmlu_id_id_machine_learning.yaml │ │ │ │ ├── mmmlu_id_id_management.yaml │ │ │ │ ├── mmmlu_id_id_marketing.yaml │ │ │ │ ├── mmmlu_id_id_medical_genetics.yaml │ │ │ │ ├── mmmlu_id_id_miscellaneous.yaml │ │ │ │ ├── mmmlu_id_id_moral_disputes.yaml │ │ │ │ ├── 
mmmlu_id_id_moral_scenarios.yaml │ │ │ │ ├── mmmlu_id_id_nutrition.yaml │ │ │ │ ├── mmmlu_id_id_philosophy.yaml │ │ │ │ ├── mmmlu_id_id_prehistory.yaml │ │ │ │ ├── mmmlu_id_id_professional_accounting.yaml │ │ │ │ ├── mmmlu_id_id_professional_law.yaml │ │ │ │ ├── mmmlu_id_id_professional_medicine.yaml │ │ │ │ ├── mmmlu_id_id_professional_psychology.yaml │ │ │ │ ├── mmmlu_id_id_public_relations.yaml │ │ │ │ ├── mmmlu_id_id_security_studies.yaml │ │ │ │ ├── mmmlu_id_id_sociology.yaml │ │ │ │ ├── mmmlu_id_id_us_foreign_policy.yaml │ │ │ │ ├── mmmlu_id_id_virology.yaml │ │ │ │ ├── mmmlu_id_id_world_religions.yaml │ │ │ │ ├── mmmlu_it_it_abstract_algebra.yaml │ │ │ │ ├── mmmlu_it_it_anatomy.yaml │ │ │ │ ├── mmmlu_it_it_astronomy.yaml │ │ │ │ ├── mmmlu_it_it_business_ethics.yaml │ │ │ │ ├── mmmlu_it_it_clinical_knowledge.yaml │ │ │ │ ├── mmmlu_it_it_college_biology.yaml │ │ │ │ ├── mmmlu_it_it_college_chemistry.yaml │ │ │ │ ├── mmmlu_it_it_college_computer_science.yaml │ │ │ │ ├── mmmlu_it_it_college_mathematics.yaml │ │ │ │ ├── mmmlu_it_it_college_medicine.yaml │ │ │ │ ├── mmmlu_it_it_college_physics.yaml │ │ │ │ ├── mmmlu_it_it_computer_security.yaml │ │ │ │ ├── mmmlu_it_it_conceptual_physics.yaml │ │ │ │ ├── mmmlu_it_it_econometrics.yaml │ │ │ │ ├── mmmlu_it_it_electrical_engineering.yaml │ │ │ │ ├── mmmlu_it_it_elementary_mathematics.yaml │ │ │ │ ├── mmmlu_it_it_formal_logic.yaml │ │ │ │ ├── mmmlu_it_it_global_facts.yaml │ │ │ │ ├── mmmlu_it_it_high_school_biology.yaml │ │ │ │ ├── mmmlu_it_it_high_school_chemistry.yaml │ │ │ │ ├── mmmlu_it_it_high_school_computer_science.yaml │ │ │ │ ├── mmmlu_it_it_high_school_european_history.yaml │ │ │ │ ├── mmmlu_it_it_high_school_geography.yaml │ │ │ │ ├── mmmlu_it_it_high_school_government_and_politics.yaml │ │ │ │ ├── mmmlu_it_it_high_school_macroeconomics.yaml │ │ │ │ ├── mmmlu_it_it_high_school_mathematics.yaml │ │ │ │ ├── mmmlu_it_it_high_school_microeconomics.yaml │ │ │ │ ├── mmmlu_it_it_high_school_physics.yaml │ │ │ │ ├── 
mmmlu_it_it_high_school_psychology.yaml │ │ │ │ ├── mmmlu_it_it_high_school_statistics.yaml │ │ │ │ ├── mmmlu_it_it_high_school_us_history.yaml │ │ │ │ ├── mmmlu_it_it_high_school_world_history.yaml │ │ │ │ ├── mmmlu_it_it_human_aging.yaml │ │ │ │ ├── mmmlu_it_it_human_sexuality.yaml │ │ │ │ ├── mmmlu_it_it_international_law.yaml │ │ │ │ ├── mmmlu_it_it_jurisprudence.yaml │ │ │ │ ├── mmmlu_it_it_logical_fallacies.yaml │ │ │ │ ├── mmmlu_it_it_machine_learning.yaml │ │ │ │ ├── mmmlu_it_it_management.yaml │ │ │ │ ├── mmmlu_it_it_marketing.yaml │ │ │ │ ├── mmmlu_it_it_medical_genetics.yaml │ │ │ │ ├── mmmlu_it_it_miscellaneous.yaml │ │ │ │ ├── mmmlu_it_it_moral_disputes.yaml │ │ │ │ ├── mmmlu_it_it_moral_scenarios.yaml │ │ │ │ ├── mmmlu_it_it_nutrition.yaml │ │ │ │ ├── mmmlu_it_it_philosophy.yaml │ │ │ │ ├── mmmlu_it_it_prehistory.yaml │ │ │ │ ├── mmmlu_it_it_professional_accounting.yaml │ │ │ │ ├── mmmlu_it_it_professional_law.yaml │ │ │ │ ├── mmmlu_it_it_professional_medicine.yaml │ │ │ │ ├── mmmlu_it_it_professional_psychology.yaml │ │ │ │ ├── mmmlu_it_it_public_relations.yaml │ │ │ │ ├── mmmlu_it_it_security_studies.yaml │ │ │ │ ├── mmmlu_it_it_sociology.yaml │ │ │ │ ├── mmmlu_it_it_us_foreign_policy.yaml │ │ │ │ ├── mmmlu_it_it_virology.yaml │ │ │ │ ├── mmmlu_it_it_world_religions.yaml │ │ │ │ ├── mmmlu_ja_jp_abstract_algebra.yaml │ │ │ │ ├── mmmlu_ja_jp_anatomy.yaml │ │ │ │ ├── mmmlu_ja_jp_astronomy.yaml │ │ │ │ ├── mmmlu_ja_jp_business_ethics.yaml │ │ │ │ ├── mmmlu_ja_jp_clinical_knowledge.yaml │ │ │ │ ├── mmmlu_ja_jp_college_biology.yaml │ │ │ │ ├── mmmlu_ja_jp_college_chemistry.yaml │ │ │ │ ├── mmmlu_ja_jp_college_computer_science.yaml │ │ │ │ ├── mmmlu_ja_jp_college_mathematics.yaml │ │ │ │ ├── mmmlu_ja_jp_college_medicine.yaml │ │ │ │ ├── mmmlu_ja_jp_college_physics.yaml │ │ │ │ ├── mmmlu_ja_jp_computer_security.yaml │ │ │ │ ├── mmmlu_ja_jp_conceptual_physics.yaml │ │ │ │ ├── mmmlu_ja_jp_econometrics.yaml │ │ │ │ ├── mmmlu_ja_jp_electrical_engineering.yaml │ 
│ │ │ ├── mmmlu_ja_jp_elementary_mathematics.yaml │ │ │ │ ├── mmmlu_ja_jp_formal_logic.yaml │ │ │ │ ├── mmmlu_ja_jp_global_facts.yaml │ │ │ │ ├── mmmlu_ja_jp_high_school_biology.yaml │ │ │ │ ├── mmmlu_ja_jp_high_school_chemistry.yaml │ │ │ │ ├── mmmlu_ja_jp_high_school_computer_science.yaml │ │ │ │ ├── mmmlu_ja_jp_high_school_european_history.yaml │ │ │ │ ├── mmmlu_ja_jp_high_school_geography.yaml │ │ │ │ ├── mmmlu_ja_jp_high_school_government_and_politics.yaml │ │ │ │ ├── mmmlu_ja_jp_high_school_macroeconomics.yaml │ │ │ │ ├── mmmlu_ja_jp_high_school_mathematics.yaml │ │ │ │ ├── mmmlu_ja_jp_high_school_microeconomics.yaml │ │ │ │ ├── mmmlu_ja_jp_high_school_physics.yaml │ │ │ │ ├── mmmlu_ja_jp_high_school_psychology.yaml │ │ │ │ ├── mmmlu_ja_jp_high_school_statistics.yaml │ │ │ │ ├── mmmlu_ja_jp_high_school_us_history.yaml │ │ │ │ ├── mmmlu_ja_jp_high_school_world_history.yaml │ │ │ │ ├── mmmlu_ja_jp_human_aging.yaml │ │ │ │ ├── mmmlu_ja_jp_human_sexuality.yaml │ │ │ │ ├── mmmlu_ja_jp_international_law.yaml │ │ │ │ ├── mmmlu_ja_jp_jurisprudence.yaml │ │ │ │ ├── mmmlu_ja_jp_logical_fallacies.yaml │ │ │ │ ├── mmmlu_ja_jp_machine_learning.yaml │ │ │ │ ├── mmmlu_ja_jp_management.yaml │ │ │ │ ├── mmmlu_ja_jp_marketing.yaml │ │ │ │ ├── mmmlu_ja_jp_medical_genetics.yaml │ │ │ │ ├── mmmlu_ja_jp_miscellaneous.yaml │ │ │ │ ├── mmmlu_ja_jp_moral_disputes.yaml │ │ │ │ ├── mmmlu_ja_jp_moral_scenarios.yaml │ │ │ │ ├── mmmlu_ja_jp_nutrition.yaml │ │ │ │ ├── mmmlu_ja_jp_philosophy.yaml │ │ │ │ ├── mmmlu_ja_jp_prehistory.yaml │ │ │ │ ├── mmmlu_ja_jp_professional_accounting.yaml │ │ │ │ ├── mmmlu_ja_jp_professional_law.yaml │ │ │ │ ├── mmmlu_ja_jp_professional_medicine.yaml │ │ │ │ ├── mmmlu_ja_jp_professional_psychology.yaml │ │ │ │ ├── mmmlu_ja_jp_public_relations.yaml │ │ │ │ ├── mmmlu_ja_jp_security_studies.yaml │ │ │ │ ├── mmmlu_ja_jp_sociology.yaml │ │ │ │ ├── mmmlu_ja_jp_us_foreign_policy.yaml │ │ │ │ ├── mmmlu_ja_jp_virology.yaml │ │ │ │ ├── mmmlu_ja_jp_world_religions.yaml 
│ │ │ │ ├── mmmlu_ko_kr_abstract_algebra.yaml │ │ │ │ ├── mmmlu_ko_kr_anatomy.yaml │ │ │ │ ├── mmmlu_ko_kr_astronomy.yaml │ │ │ │ ├── mmmlu_ko_kr_business_ethics.yaml │ │ │ │ ├── mmmlu_ko_kr_clinical_knowledge.yaml │ │ │ │ ├── mmmlu_ko_kr_college_biology.yaml │ │ │ │ ├── mmmlu_ko_kr_college_chemistry.yaml │ │ │ │ ├── mmmlu_ko_kr_college_computer_science.yaml │ │ │ │ ├── mmmlu_ko_kr_college_mathematics.yaml │ │ │ │ ├── mmmlu_ko_kr_college_medicine.yaml │ │ │ │ ├── mmmlu_ko_kr_college_physics.yaml │ │ │ │ ├── mmmlu_ko_kr_computer_security.yaml │ │ │ │ ├── mmmlu_ko_kr_conceptual_physics.yaml │ │ │ │ ├── mmmlu_ko_kr_econometrics.yaml │ │ │ │ ├── mmmlu_ko_kr_electrical_engineering.yaml │ │ │ │ ├── mmmlu_ko_kr_elementary_mathematics.yaml │ │ │ │ ├── mmmlu_ko_kr_formal_logic.yaml │ │ │ │ ├── mmmlu_ko_kr_global_facts.yaml │ │ │ │ ├── mmmlu_ko_kr_high_school_biology.yaml │ │ │ │ ├── mmmlu_ko_kr_high_school_chemistry.yaml │ │ │ │ ├── mmmlu_ko_kr_high_school_computer_science.yaml │ │ │ │ ├── mmmlu_ko_kr_high_school_european_history.yaml │ │ │ │ ├── mmmlu_ko_kr_high_school_geography.yaml │ │ │ │ ├── mmmlu_ko_kr_high_school_government_and_politics.yaml │ │ │ │ ├── mmmlu_ko_kr_high_school_macroeconomics.yaml │ │ │ │ ├── mmmlu_ko_kr_high_school_mathematics.yaml │ │ │ │ ├── mmmlu_ko_kr_high_school_microeconomics.yaml │ │ │ │ ├── mmmlu_ko_kr_high_school_physics.yaml │ │ │ │ ├── mmmlu_ko_kr_high_school_psychology.yaml │ │ │ │ ├── mmmlu_ko_kr_high_school_statistics.yaml │ │ │ │ ├── mmmlu_ko_kr_high_school_us_history.yaml │ │ │ │ ├── mmmlu_ko_kr_high_school_world_history.yaml │ │ │ │ ├── mmmlu_ko_kr_human_aging.yaml │ │ │ │ ├── mmmlu_ko_kr_human_sexuality.yaml │ │ │ │ ├── mmmlu_ko_kr_international_law.yaml │ │ │ │ ├── mmmlu_ko_kr_jurisprudence.yaml │ │ │ │ ├── mmmlu_ko_kr_logical_fallacies.yaml │ │ │ │ ├── mmmlu_ko_kr_machine_learning.yaml │ │ │ │ ├── mmmlu_ko_kr_management.yaml │ │ │ │ ├── mmmlu_ko_kr_marketing.yaml │ │ │ │ ├── mmmlu_ko_kr_medical_genetics.yaml │ │ │ │ ├── 
mmmlu_ko_kr_miscellaneous.yaml │ │ │ │ ├── mmmlu_ko_kr_moral_disputes.yaml │ │ │ │ ├── mmmlu_ko_kr_moral_scenarios.yaml │ │ │ │ ├── mmmlu_ko_kr_nutrition.yaml │ │ │ │ ├── mmmlu_ko_kr_philosophy.yaml │ │ │ │ ├── mmmlu_ko_kr_prehistory.yaml │ │ │ │ ├── mmmlu_ko_kr_professional_accounting.yaml │ │ │ │ ├── mmmlu_ko_kr_professional_law.yaml │ │ │ │ ├── mmmlu_ko_kr_professional_medicine.yaml │ │ │ │ ├── mmmlu_ko_kr_professional_psychology.yaml │ │ │ │ ├── mmmlu_ko_kr_public_relations.yaml │ │ │ │ ├── mmmlu_ko_kr_security_studies.yaml │ │ │ │ ├── mmmlu_ko_kr_sociology.yaml │ │ │ │ ├── mmmlu_ko_kr_us_foreign_policy.yaml │ │ │ │ ├── mmmlu_ko_kr_virology.yaml │ │ │ │ ├── mmmlu_ko_kr_world_religions.yaml │ │ │ │ ├── mmmlu_pt_br_abstract_algebra.yaml │ │ │ │ ├── mmmlu_pt_br_anatomy.yaml │ │ │ │ ├── mmmlu_pt_br_astronomy.yaml │ │ │ │ ├── mmmlu_pt_br_business_ethics.yaml │ │ │ │ ├── mmmlu_pt_br_clinical_knowledge.yaml │ │ │ │ ├── mmmlu_pt_br_college_biology.yaml │ │ │ │ ├── mmmlu_pt_br_college_chemistry.yaml │ │ │ │ ├── mmmlu_pt_br_college_computer_science.yaml │ │ │ │ ├── mmmlu_pt_br_college_mathematics.yaml │ │ │ │ ├── mmmlu_pt_br_college_medicine.yaml │ │ │ │ ├── mmmlu_pt_br_college_physics.yaml │ │ │ │ ├── mmmlu_pt_br_computer_security.yaml │ │ │ │ ├── mmmlu_pt_br_conceptual_physics.yaml │ │ │ │ ├── mmmlu_pt_br_econometrics.yaml │ │ │ │ ├── mmmlu_pt_br_electrical_engineering.yaml │ │ │ │ ├── mmmlu_pt_br_elementary_mathematics.yaml │ │ │ │ ├── mmmlu_pt_br_formal_logic.yaml │ │ │ │ ├── mmmlu_pt_br_global_facts.yaml │ │ │ │ ├── mmmlu_pt_br_high_school_biology.yaml │ │ │ │ ├── mmmlu_pt_br_high_school_chemistry.yaml │ │ │ │ ├── mmmlu_pt_br_high_school_computer_science.yaml │ │ │ │ ├── mmmlu_pt_br_high_school_european_history.yaml │ │ │ │ ├── mmmlu_pt_br_high_school_geography.yaml │ │ │ │ ├── mmmlu_pt_br_high_school_government_and_politics.yaml │ │ │ │ ├── mmmlu_pt_br_high_school_macroeconomics.yaml │ │ │ │ ├── mmmlu_pt_br_high_school_mathematics.yaml │ │ │ │ ├── 
mmmlu_pt_br_high_school_microeconomics.yaml │ │ │ │ ├── mmmlu_pt_br_high_school_physics.yaml │ │ │ │ ├── mmmlu_pt_br_high_school_psychology.yaml │ │ │ │ ├── mmmlu_pt_br_high_school_statistics.yaml │ │ │ │ ├── mmmlu_pt_br_high_school_us_history.yaml │ │ │ │ ├── mmmlu_pt_br_high_school_world_history.yaml │ │ │ │ ├── mmmlu_pt_br_human_aging.yaml │ │ │ │ ├── mmmlu_pt_br_human_sexuality.yaml │ │ │ │ ├── mmmlu_pt_br_international_law.yaml │ │ │ │ ├── mmmlu_pt_br_jurisprudence.yaml │ │ │ │ ├── mmmlu_pt_br_logical_fallacies.yaml │ │ │ │ ├── mmmlu_pt_br_machine_learning.yaml │ │ │ │ ├── mmmlu_pt_br_management.yaml │ │ │ │ ├── mmmlu_pt_br_marketing.yaml │ │ │ │ ├── mmmlu_pt_br_medical_genetics.yaml │ │ │ │ ├── mmmlu_pt_br_miscellaneous.yaml │ │ │ │ ├── mmmlu_pt_br_moral_disputes.yaml │ │ │ │ ├── mmmlu_pt_br_moral_scenarios.yaml │ │ │ │ ├── mmmlu_pt_br_nutrition.yaml │ │ │ │ ├── mmmlu_pt_br_philosophy.yaml │ │ │ │ ├── mmmlu_pt_br_prehistory.yaml │ │ │ │ ├── mmmlu_pt_br_professional_accounting.yaml │ │ │ │ ├── mmmlu_pt_br_professional_law.yaml │ │ │ │ ├── mmmlu_pt_br_professional_medicine.yaml │ │ │ │ ├── mmmlu_pt_br_professional_psychology.yaml │ │ │ │ ├── mmmlu_pt_br_public_relations.yaml │ │ │ │ ├── mmmlu_pt_br_security_studies.yaml │ │ │ │ ├── mmmlu_pt_br_sociology.yaml │ │ │ │ ├── mmmlu_pt_br_us_foreign_policy.yaml │ │ │ │ ├── mmmlu_pt_br_virology.yaml │ │ │ │ ├── mmmlu_pt_br_world_religions.yaml │ │ │ │ ├── mmmlu_sw_ke_abstract_algebra.yaml │ │ │ │ ├── mmmlu_sw_ke_anatomy.yaml │ │ │ │ ├── mmmlu_sw_ke_astronomy.yaml │ │ │ │ ├── mmmlu_sw_ke_business_ethics.yaml │ │ │ │ ├── mmmlu_sw_ke_clinical_knowledge.yaml │ │ │ │ ├── mmmlu_sw_ke_college_biology.yaml │ │ │ │ ├── mmmlu_sw_ke_college_chemistry.yaml │ │ │ │ ├── mmmlu_sw_ke_college_computer_science.yaml │ │ │ │ ├── mmmlu_sw_ke_college_mathematics.yaml │ │ │ │ ├── mmmlu_sw_ke_college_medicine.yaml │ │ │ │ ├── mmmlu_sw_ke_college_physics.yaml │ │ │ │ ├── mmmlu_sw_ke_computer_security.yaml │ │ │ │ ├── 
mmmlu_sw_ke_conceptual_physics.yaml │ │ │ │ ├── mmmlu_sw_ke_econometrics.yaml │ │ │ │ ├── mmmlu_sw_ke_electrical_engineering.yaml │ │ │ │ ├── mmmlu_sw_ke_elementary_mathematics.yaml │ │ │ │ ├── mmmlu_sw_ke_formal_logic.yaml │ │ │ │ ├── mmmlu_sw_ke_global_facts.yaml │ │ │ │ ├── mmmlu_sw_ke_high_school_biology.yaml │ │ │ │ ├── mmmlu_sw_ke_high_school_chemistry.yaml │ │ │ │ ├── mmmlu_sw_ke_high_school_computer_science.yaml │ │ │ │ ├── mmmlu_sw_ke_high_school_european_history.yaml │ │ │ │ ├── mmmlu_sw_ke_high_school_geography.yaml │ │ │ │ ├── mmmlu_sw_ke_high_school_government_and_politics.yaml │ │ │ │ ├── mmmlu_sw_ke_high_school_macroeconomics.yaml │ │ │ │ ├── mmmlu_sw_ke_high_school_mathematics.yaml │ │ │ │ ├── mmmlu_sw_ke_high_school_microeconomics.yaml │ │ │ │ ├── mmmlu_sw_ke_high_school_physics.yaml │ │ │ │ ├── mmmlu_sw_ke_high_school_psychology.yaml │ │ │ │ ├── mmmlu_sw_ke_high_school_statistics.yaml │ │ │ │ ├── mmmlu_sw_ke_high_school_us_history.yaml │ │ │ │ ├── mmmlu_sw_ke_high_school_world_history.yaml │ │ │ │ ├── mmmlu_sw_ke_human_aging.yaml │ │ │ │ ├── mmmlu_sw_ke_human_sexuality.yaml │ │ │ │ ├── mmmlu_sw_ke_international_law.yaml │ │ │ │ ├── mmmlu_sw_ke_jurisprudence.yaml │ │ │ │ ├── mmmlu_sw_ke_logical_fallacies.yaml │ │ │ │ ├── mmmlu_sw_ke_machine_learning.yaml │ │ │ │ ├── mmmlu_sw_ke_management.yaml │ │ │ │ ├── mmmlu_sw_ke_marketing.yaml │ │ │ │ ├── mmmlu_sw_ke_medical_genetics.yaml │ │ │ │ ├── mmmlu_sw_ke_miscellaneous.yaml │ │ │ │ ├── mmmlu_sw_ke_moral_disputes.yaml │ │ │ │ ├── mmmlu_sw_ke_moral_scenarios.yaml │ │ │ │ ├── mmmlu_sw_ke_nutrition.yaml │ │ │ │ ├── mmmlu_sw_ke_philosophy.yaml │ │ │ │ ├── mmmlu_sw_ke_prehistory.yaml │ │ │ │ ├── mmmlu_sw_ke_professional_accounting.yaml │ │ │ │ ├── mmmlu_sw_ke_professional_law.yaml │ │ │ │ ├── mmmlu_sw_ke_professional_medicine.yaml │ │ │ │ ├── mmmlu_sw_ke_professional_psychology.yaml │ │ │ │ ├── mmmlu_sw_ke_public_relations.yaml │ │ │ │ ├── mmmlu_sw_ke_security_studies.yaml │ │ │ │ ├── 
mmmlu_sw_ke_sociology.yaml │ │ │ │ ├── mmmlu_sw_ke_us_foreign_policy.yaml │ │ │ │ ├── mmmlu_sw_ke_virology.yaml │ │ │ │ ├── mmmlu_sw_ke_world_religions.yaml │ │ │ │ ├── mmmlu_yo_ng_abstract_algebra.yaml │ │ │ │ ├── mmmlu_yo_ng_anatomy.yaml │ │ │ │ ├── mmmlu_yo_ng_astronomy.yaml │ │ │ │ ├── mmmlu_yo_ng_business_ethics.yaml │ │ │ │ ├── mmmlu_yo_ng_clinical_knowledge.yaml │ │ │ │ ├── mmmlu_yo_ng_college_biology.yaml │ │ │ │ ├── mmmlu_yo_ng_college_chemistry.yaml │ │ │ │ ├── mmmlu_yo_ng_college_computer_science.yaml │ │ │ │ ├── mmmlu_yo_ng_college_mathematics.yaml │ │ │ │ ├── mmmlu_yo_ng_college_medicine.yaml │ │ │ │ ├── mmmlu_yo_ng_college_physics.yaml │ │ │ │ ├── mmmlu_yo_ng_computer_security.yaml │ │ │ │ ├── mmmlu_yo_ng_conceptual_physics.yaml │ │ │ │ ├── mmmlu_yo_ng_econometrics.yaml │ │ │ │ ├── mmmlu_yo_ng_electrical_engineering.yaml │ │ │ │ ├── mmmlu_yo_ng_elementary_mathematics.yaml │ │ │ │ ├── mmmlu_yo_ng_formal_logic.yaml │ │ │ │ ├── mmmlu_yo_ng_global_facts.yaml │ │ │ │ ├── mmmlu_yo_ng_high_school_biology.yaml │ │ │ │ ├── mmmlu_yo_ng_high_school_chemistry.yaml │ │ │ │ ├── mmmlu_yo_ng_high_school_computer_science.yaml │ │ │ │ ├── mmmlu_yo_ng_high_school_european_history.yaml │ │ │ │ ├── mmmlu_yo_ng_high_school_geography.yaml │ │ │ │ ├── mmmlu_yo_ng_high_school_government_and_politics.yaml │ │ │ │ ├── mmmlu_yo_ng_high_school_macroeconomics.yaml │ │ │ │ ├── mmmlu_yo_ng_high_school_mathematics.yaml │ │ │ │ ├── mmmlu_yo_ng_high_school_microeconomics.yaml │ │ │ │ ├── mmmlu_yo_ng_high_school_physics.yaml │ │ │ │ ├── mmmlu_yo_ng_high_school_psychology.yaml │ │ │ │ ├── mmmlu_yo_ng_high_school_statistics.yaml │ │ │ │ ├── mmmlu_yo_ng_high_school_us_history.yaml │ │ │ │ ├── mmmlu_yo_ng_high_school_world_history.yaml │ │ │ │ ├── mmmlu_yo_ng_human_aging.yaml │ │ │ │ ├── mmmlu_yo_ng_human_sexuality.yaml │ │ │ │ ├── mmmlu_yo_ng_international_law.yaml │ │ │ │ ├── mmmlu_yo_ng_jurisprudence.yaml │ │ │ │ ├── mmmlu_yo_ng_logical_fallacies.yaml │ │ │ │ ├── 
mmmlu_yo_ng_machine_learning.yaml │ │ │ │ ├── mmmlu_yo_ng_management.yaml │ │ │ │ ├── mmmlu_yo_ng_marketing.yaml │ │ │ │ ├── mmmlu_yo_ng_medical_genetics.yaml │ │ │ │ ├── mmmlu_yo_ng_miscellaneous.yaml │ │ │ │ ├── mmmlu_yo_ng_moral_disputes.yaml │ │ │ │ ├── mmmlu_yo_ng_moral_scenarios.yaml │ │ │ │ ├── mmmlu_yo_ng_nutrition.yaml │ │ │ │ ├── mmmlu_yo_ng_philosophy.yaml │ │ │ │ ├── mmmlu_yo_ng_prehistory.yaml │ │ │ │ ├── mmmlu_yo_ng_professional_accounting.yaml │ │ │ │ ├── mmmlu_yo_ng_professional_law.yaml │ │ │ │ ├── mmmlu_yo_ng_professional_medicine.yaml │ │ │ │ ├── mmmlu_yo_ng_professional_psychology.yaml │ │ │ │ ├── mmmlu_yo_ng_public_relations.yaml │ │ │ │ ├── mmmlu_yo_ng_security_studies.yaml │ │ │ │ ├── mmmlu_yo_ng_sociology.yaml │ │ │ │ ├── mmmlu_yo_ng_us_foreign_policy.yaml │ │ │ │ ├── mmmlu_yo_ng_virology.yaml │ │ │ │ ├── mmmlu_yo_ng_world_religions.yaml │ │ │ │ ├── mmmlu_zh_cn_abstract_algebra.yaml │ │ │ │ ├── mmmlu_zh_cn_anatomy.yaml │ │ │ │ ├── mmmlu_zh_cn_astronomy.yaml │ │ │ │ ├── mmmlu_zh_cn_business_ethics.yaml │ │ │ │ ├── mmmlu_zh_cn_clinical_knowledge.yaml │ │ │ │ ├── mmmlu_zh_cn_college_biology.yaml │ │ │ │ ├── mmmlu_zh_cn_college_chemistry.yaml │ │ │ │ ├── mmmlu_zh_cn_college_computer_science.yaml │ │ │ │ ├── mmmlu_zh_cn_college_mathematics.yaml │ │ │ │ ├── mmmlu_zh_cn_college_medicine.yaml │ │ │ │ ├── mmmlu_zh_cn_college_physics.yaml │ │ │ │ ├── mmmlu_zh_cn_computer_security.yaml │ │ │ │ ├── mmmlu_zh_cn_conceptual_physics.yaml │ │ │ │ ├── mmmlu_zh_cn_econometrics.yaml │ │ │ │ ├── mmmlu_zh_cn_electrical_engineering.yaml │ │ │ │ ├── mmmlu_zh_cn_elementary_mathematics.yaml │ │ │ │ ├── mmmlu_zh_cn_formal_logic.yaml │ │ │ │ ├── mmmlu_zh_cn_global_facts.yaml │ │ │ │ ├── mmmlu_zh_cn_high_school_biology.yaml │ │ │ │ ├── mmmlu_zh_cn_high_school_chemistry.yaml │ │ │ │ ├── mmmlu_zh_cn_high_school_computer_science.yaml │ │ │ │ ├── mmmlu_zh_cn_high_school_european_history.yaml │ │ │ │ ├── mmmlu_zh_cn_high_school_geography.yaml │ │ │ │ ├── 
mmmlu_zh_cn_high_school_government_and_politics.yaml │ │ │ │ ├── mmmlu_zh_cn_high_school_macroeconomics.yaml │ │ │ │ ├── mmmlu_zh_cn_high_school_mathematics.yaml │ │ │ │ ├── mmmlu_zh_cn_high_school_microeconomics.yaml │ │ │ │ ├── mmmlu_zh_cn_high_school_physics.yaml │ │ │ │ ├── mmmlu_zh_cn_high_school_psychology.yaml │ │ │ │ ├── mmmlu_zh_cn_high_school_statistics.yaml │ │ │ │ ├── mmmlu_zh_cn_high_school_us_history.yaml │ │ │ │ ├── mmmlu_zh_cn_high_school_world_history.yaml │ │ │ │ ├── mmmlu_zh_cn_human_aging.yaml │ │ │ │ ├── mmmlu_zh_cn_human_sexuality.yaml │ │ │ │ ├── mmmlu_zh_cn_international_law.yaml │ │ │ │ ├── mmmlu_zh_cn_jurisprudence.yaml │ │ │ │ ├── mmmlu_zh_cn_logical_fallacies.yaml │ │ │ │ ├── mmmlu_zh_cn_machine_learning.yaml │ │ │ │ ├── mmmlu_zh_cn_management.yaml │ │ │ │ ├── mmmlu_zh_cn_marketing.yaml │ │ │ │ ├── mmmlu_zh_cn_medical_genetics.yaml │ │ │ │ ├── mmmlu_zh_cn_miscellaneous.yaml │ │ │ │ ├── mmmlu_zh_cn_moral_disputes.yaml │ │ │ │ ├── mmmlu_zh_cn_moral_scenarios.yaml │ │ │ │ ├── mmmlu_zh_cn_nutrition.yaml │ │ │ │ ├── mmmlu_zh_cn_philosophy.yaml │ │ │ │ ├── mmmlu_zh_cn_prehistory.yaml │ │ │ │ ├── mmmlu_zh_cn_professional_accounting.yaml │ │ │ │ ├── mmmlu_zh_cn_professional_law.yaml │ │ │ │ ├── mmmlu_zh_cn_professional_medicine.yaml │ │ │ │ ├── mmmlu_zh_cn_professional_psychology.yaml │ │ │ │ ├── mmmlu_zh_cn_public_relations.yaml │ │ │ │ ├── mmmlu_zh_cn_security_studies.yaml │ │ │ │ ├── mmmlu_zh_cn_sociology.yaml │ │ │ │ ├── mmmlu_zh_cn_us_foreign_policy.yaml │ │ │ │ ├── mmmlu_zh_cn_virology.yaml │ │ │ │ ├── mmmlu_zh_cn_world_religions.yaml │ │ │ │ └── utils.py │ │ │ ├── languages.json │ │ │ └── subjects.json │ │ ├── openbookqa/ │ │ │ ├── README.md │ │ │ └── openbookqa.yaml │ │ ├── paloma/ │ │ │ ├── README.md │ │ │ ├── _paloma_template │ │ │ ├── paloma_4chan_meta_sep.yaml │ │ │ ├── paloma_c4_100_domains.yaml │ │ │ ├── paloma_c4_en.yaml │ │ │ ├── paloma_dolma-v1_5.yaml │ │ │ ├── paloma_dolma_100_programing_languages.yaml │ │ │ ├── 
paloma_dolma_100_subreddits.yaml │ │ │ ├── paloma_falcon-refinedweb.yaml │ │ │ ├── paloma_gab.yaml │ │ │ ├── paloma_m2d2_s2orc_unsplit.yaml │ │ │ ├── paloma_m2d2_wikipedia_unsplit.yaml │ │ │ ├── paloma_manosphere_meta_sep.yaml │ │ │ ├── paloma_mc4.yaml │ │ │ ├── paloma_ptb.yaml │ │ │ ├── paloma_redpajama.yaml │ │ │ ├── paloma_twitterAAE_HELM_fixed.yaml │ │ │ ├── paloma_utils.py │ │ │ └── paloma_wikitext_103.yaml │ │ ├── paws-x/ │ │ │ ├── README.md │ │ │ ├── _generate_config.py │ │ │ ├── _pawsx.yaml │ │ │ ├── paws_de.yaml │ │ │ ├── paws_en.yaml │ │ │ ├── paws_es.yaml │ │ │ ├── paws_fr.yaml │ │ │ ├── paws_ja.yaml │ │ │ ├── paws_ko.yaml │ │ │ ├── paws_zh.yaml │ │ │ ├── pawsx_template_yaml │ │ │ └── utils.py │ │ ├── pile/ │ │ │ ├── README.md │ │ │ ├── pile_arxiv.yaml │ │ │ ├── pile_bookcorpus2.yaml │ │ │ ├── pile_books3.yaml │ │ │ ├── pile_dm-mathematics.yaml │ │ │ ├── pile_enron.yaml │ │ │ ├── pile_europarl.yaml │ │ │ ├── pile_freelaw.yaml │ │ │ ├── pile_github.yaml │ │ │ ├── pile_gutenberg.yaml │ │ │ ├── pile_hackernews.yaml │ │ │ ├── pile_nih-exporter.yaml │ │ │ ├── pile_opensubtitles.yaml │ │ │ ├── pile_openwebtext2.yaml │ │ │ ├── pile_philpapers.yaml │ │ │ ├── pile_pile-cc.yaml │ │ │ ├── pile_pubmed-abstracts.yaml │ │ │ ├── pile_pubmed-central.yaml │ │ │ ├── pile_stackexchange.yaml │ │ │ ├── pile_ubuntu-irc.yaml │ │ │ ├── pile_uspto.yaml │ │ │ ├── pile_wikipedia.yaml │ │ │ └── pile_youtubesubtitles.yaml │ │ ├── pile_10k/ │ │ │ ├── README.md │ │ │ └── pile_10k.yaml │ │ ├── piqa/ │ │ │ ├── README.md │ │ │ └── piqa.yaml │ │ ├── pisa/ │ │ │ ├── README.md │ │ │ ├── _pisa.yaml │ │ │ ├── _pisa_llm_judged.yaml │ │ │ ├── _template_yaml │ │ │ ├── pisa_ch.yaml │ │ │ ├── pisa_ch_llm_judged.yaml │ │ │ ├── pisa_de.yaml │ │ │ ├── pisa_de_llm_judged.yaml │ │ │ ├── pisa_en.yaml │ │ │ ├── pisa_en_llm_judged.yaml │ │ │ ├── pisa_es.yaml │ │ │ ├── pisa_es_llm_judged.yaml │ │ │ ├── pisa_fr.yaml │ │ │ ├── pisa_fr_llm_judged.yaml │ │ │ ├── pisa_it.yaml │ │ │ ├── pisa_it_llm_judged.yaml │ 
│ │ └── utils.py │ │ ├── polemo2/ │ │ │ ├── README.md │ │ │ ├── polemo2_in.yaml │ │ │ └── polemo2_out.yaml │ │ ├── portuguese_bench/ │ │ │ ├── README.md │ │ │ ├── assin_entailment.yaml │ │ │ ├── assin_paraphrase.yaml │ │ │ ├── flores_pt/ │ │ │ │ ├── _flores_common_yaml │ │ │ │ ├── create_yamls_flores_pt.py │ │ │ │ ├── flores_ca-pt.yaml │ │ │ │ ├── flores_de-pt.yaml │ │ │ │ ├── flores_en-pt.yaml │ │ │ │ ├── flores_es-pt.yaml │ │ │ │ ├── flores_eu-pt.yaml │ │ │ │ ├── flores_fr-pt.yaml │ │ │ │ ├── flores_gl-pt.yaml │ │ │ │ ├── flores_it-pt.yaml │ │ │ │ ├── flores_pt-ca.yaml │ │ │ │ ├── flores_pt-de.yaml │ │ │ │ ├── flores_pt-en.yaml │ │ │ │ ├── flores_pt-es.yaml │ │ │ │ ├── flores_pt-eu.yaml │ │ │ │ ├── flores_pt-fr.yaml │ │ │ │ ├── flores_pt-gl.yaml │ │ │ │ ├── flores_pt-it.yaml │ │ │ │ └── flores_pt.yaml │ │ │ └── portuguese_bench.yaml │ │ ├── prost/ │ │ │ ├── README.md │ │ │ └── corypaik_prost.yaml │ │ ├── pubmedqa/ │ │ │ ├── README.md │ │ │ ├── preprocess_pubmedqa.py │ │ │ └── pubmedqa.yaml │ │ ├── qa4mre/ │ │ │ ├── README.md │ │ │ ├── preprocess_qa4mre.py │ │ │ ├── qa4mre_2011.yaml │ │ │ ├── qa4mre_2012.yaml │ │ │ └── qa4mre_2013.yaml │ │ ├── qasper/ │ │ │ ├── README.md │ │ │ ├── bool.yaml │ │ │ ├── freeform.yaml │ │ │ ├── metrics.py │ │ │ └── utils.py │ │ ├── race/ │ │ │ ├── README.md │ │ │ ├── preprocess_race.py │ │ │ └── race.yaml │ │ ├── realtoxicityprompts/ │ │ │ ├── metric.py │ │ │ └── realtoxicityprompts.yaml │ │ ├── ruler/ │ │ │ ├── README.md │ │ │ ├── common_utils.py │ │ │ ├── cwe.yaml │ │ │ ├── cwe_utils.py │ │ │ ├── essays.py │ │ │ ├── fwe.yaml │ │ │ ├── fwe_utils.py │ │ │ ├── niah_multikey_1.yaml │ │ │ ├── niah_multikey_2.yaml │ │ │ ├── niah_multikey_3.yaml │ │ │ ├── niah_multiquery.yaml │ │ │ ├── niah_multivalue.yaml │ │ │ ├── niah_single_1.yaml │ │ │ ├── niah_single_2.yaml │ │ │ ├── niah_single_3.yaml │ │ │ ├── niah_utils.py │ │ │ ├── prepare_niah.py │ │ │ ├── qa_hotpot.yaml │ │ │ ├── qa_squad.yaml │ │ │ ├── qa_utils.py │ │ │ ├── ruler.yaml │ │ │ 
├── vt.yaml │ │ │ └── vt_utils.py │ │ ├── sciq/ │ │ │ ├── README.md │ │ │ └── sciq.yaml │ │ ├── score/ │ │ │ ├── NON_GREEDY.md │ │ │ ├── README.md │ │ │ ├── agi_eval/ │ │ │ │ ├── non_greedy_robustness_agieval_aqua_rat.yaml │ │ │ │ ├── non_greedy_robustness_agieval_logiqa_en.yaml │ │ │ │ ├── non_greedy_robustness_agieval_lsat_rc.yaml │ │ │ │ ├── non_greedy_robustness_agieval_lstat_ar.yaml │ │ │ │ ├── non_greedy_robustness_agieval_lstat_lr.yaml │ │ │ │ ├── non_greedy_robustness_agieval_sat_en.yaml │ │ │ │ ├── non_greedy_robustness_agieval_sat_math.yaml │ │ │ │ ├── option_order_robustness_agieval_aqua_rat.yaml │ │ │ │ ├── option_order_robustness_agieval_logiqa_en.yaml │ │ │ │ ├── option_order_robustness_agieval_lsat_ar.yaml │ │ │ │ ├── option_order_robustness_agieval_lsat_lr.yaml │ │ │ │ ├── option_order_robustness_agieval_lsat_rc.yaml │ │ │ │ ├── option_order_robustness_agieval_sat_en.yaml │ │ │ │ ├── option_order_robustness_agieval_sat_math.yaml │ │ │ │ ├── prompt_robustness_agieval_aqua_rat.yaml │ │ │ │ ├── prompt_robustness_agieval_logiqa_en.yaml │ │ │ │ ├── prompt_robustness_agieval_lsat_rc.yaml │ │ │ │ ├── prompt_robustness_agieval_lstat_ar.yaml │ │ │ │ ├── prompt_robustness_agieval_lstat_lr.yaml │ │ │ │ ├── prompt_robustness_agieval_sat_en.yaml │ │ │ │ ├── prompt_robustness_agieval_sat_math.yaml │ │ │ │ ├── prompt_templates.json │ │ │ │ ├── score_non_greedy_robustness_agieval.yaml │ │ │ │ ├── score_option_order_robustness_agieval.yaml │ │ │ │ ├── score_prompt_robustness_agieval.yaml │ │ │ │ ├── score_robustness_agieval.yaml │ │ │ │ └── utils_agieval.py │ │ │ ├── math/ │ │ │ │ ├── math_grader.py │ │ │ │ ├── non_greedy_robustness_math_algebra.yaml │ │ │ │ ├── non_greedy_robustness_math_counting_and_prob.yaml │ │ │ │ ├── non_greedy_robustness_math_geometry.yaml │ │ │ │ ├── non_greedy_robustness_math_intermediate_algebra.yaml │ │ │ │ ├── non_greedy_robustness_math_num_theory.yaml │ │ │ │ ├── non_greedy_robustness_math_prealgebra.yaml │ │ │ │ ├── 
non_greedy_robustness_math_precalc.yaml │ │ │ │ ├── prompt_robustness_math_algebra.yaml │ │ │ │ ├── prompt_robustness_math_counting_and_prob.yaml │ │ │ │ ├── prompt_robustness_math_geometry.yaml │ │ │ │ ├── prompt_robustness_math_intermediate_algebra.yaml │ │ │ │ ├── prompt_robustness_math_num_theory.yaml │ │ │ │ ├── prompt_robustness_math_prealgebra.yaml │ │ │ │ ├── prompt_robustness_math_precalc.yaml │ │ │ │ ├── prompt_templates.json │ │ │ │ ├── score_non_greedy_robustness_math.yaml │ │ │ │ ├── score_prompt_robustness_math.yaml │ │ │ │ ├── score_robustness_math.yaml │ │ │ │ ├── to_be_fixed_questions.json │ │ │ │ └── utils_math.py │ │ │ ├── mmlu_pro/ │ │ │ │ ├── prompt_templates.json │ │ │ │ ├── score_non_greedy_robustness_mmlu_pro.yaml │ │ │ │ ├── score_option_order_robustness_mmlu_pro.yaml │ │ │ │ ├── score_prompt_robustness_mmlu_pro.yaml │ │ │ │ └── utils_mmlu_pro.py │ │ │ ├── non_greedy.sh │ │ │ ├── non_greedy_summarizer.py │ │ │ ├── score_robustness.yaml │ │ │ └── utils.py │ │ ├── scrolls/ │ │ │ ├── README.md │ │ │ ├── scrolls_contractnli.yaml │ │ │ ├── scrolls_govreport.yaml │ │ │ ├── scrolls_narrativeqa.yaml │ │ │ ├── scrolls_qasper.yaml │ │ │ ├── scrolls_qmsum.yaml │ │ │ ├── scrolls_quality.yaml │ │ │ ├── scrolls_summscreenfd.yaml │ │ │ └── task.py │ │ ├── simple_cooccurrence_bias/ │ │ │ ├── README.md │ │ │ ├── simple_cooccurrence_bias.yaml │ │ │ ├── simple_cooccurrence_bias_gen.yaml │ │ │ └── utils.py │ │ ├── siqa/ │ │ │ ├── README.md │ │ │ └── siqa.yaml │ │ ├── slr_bench/ │ │ │ ├── README.md │ │ │ ├── lm_eval_slr_bench.py │ │ │ ├── slr_bench_all.yaml │ │ │ ├── slr_bench_basic.yaml │ │ │ ├── slr_bench_common_yaml │ │ │ ├── slr_bench_easy.yaml │ │ │ ├── slr_bench_group.yaml │ │ │ ├── slr_bench_hard.yaml │ │ │ └── slr_bench_medium.yaml │ │ ├── spanish_bench/ │ │ │ ├── README.md │ │ │ ├── cocoteros_es.yaml │ │ │ ├── copa_es.yaml │ │ │ ├── escola.yaml │ │ │ ├── flores_es/ │ │ │ │ ├── _flores_common_yaml │ │ │ │ ├── create_yamls_flores_es.py │ │ │ │ ├── 
flores_ca-es.yaml │ │ │ │ ├── flores_de-es.yaml │ │ │ │ ├── flores_en-es.yaml │ │ │ │ ├── flores_es-ca.yaml │ │ │ │ ├── flores_es-de.yaml │ │ │ │ ├── flores_es-en.yaml │ │ │ │ ├── flores_es-eu.yaml │ │ │ │ ├── flores_es-fr.yaml │ │ │ │ ├── flores_es-gl.yaml │ │ │ │ ├── flores_es-it.yaml │ │ │ │ ├── flores_es-pt.yaml │ │ │ │ ├── flores_es.yaml │ │ │ │ ├── flores_eu-es.yaml │ │ │ │ ├── flores_fr-es.yaml │ │ │ │ ├── flores_gl-es.yaml │ │ │ │ ├── flores_it-es.yaml │ │ │ │ └── flores_pt-es.yaml │ │ │ ├── mgsm_direct_es_spanish_bench.yaml │ │ │ ├── openbookqa_es.yaml │ │ │ ├── paws_es_spanish_bench.yaml │ │ │ ├── phrases_es/ │ │ │ │ ├── _phrases_es_common │ │ │ │ ├── phrases_es-va.yaml │ │ │ │ └── phrases_va-es.yaml │ │ │ ├── spanish_bench.yaml │ │ │ ├── utils.py │ │ │ ├── wnli_es.yaml │ │ │ ├── xlsum_es.yaml │ │ │ └── xnli_es_spanish_bench.yaml │ │ ├── squad_completion/ │ │ │ ├── README.md │ │ │ ├── squad_completion.yaml │ │ │ └── task.py │ │ ├── squadv2/ │ │ │ ├── README.md │ │ │ ├── squadv2.yaml │ │ │ └── task.py │ │ ├── storycloze/ │ │ │ ├── README.md │ │ │ ├── storycloze_2016.yaml │ │ │ └── storycloze_2018.yaml │ │ ├── super_glue/ │ │ │ ├── README.md │ │ │ ├── boolq/ │ │ │ │ ├── default.yaml │ │ │ │ ├── seq2seq.yaml │ │ │ │ └── t5-prompt.yaml │ │ │ ├── cb/ │ │ │ │ ├── aggregate.py │ │ │ │ ├── default.yaml │ │ │ │ ├── t5-prompt.yaml │ │ │ │ └── t5_utils.py │ │ │ ├── copa/ │ │ │ │ ├── default.yaml │ │ │ │ ├── t5-prompt.yaml │ │ │ │ └── utils.py │ │ │ ├── multirc/ │ │ │ │ ├── default.yaml │ │ │ │ ├── t5-prompt.yaml │ │ │ │ └── t5_utils.py │ │ │ ├── record/ │ │ │ │ ├── default.yaml │ │ │ │ ├── t5-prompt.yaml │ │ │ │ ├── t5_utils.py │ │ │ │ └── util.py │ │ │ ├── rte/ │ │ │ │ ├── default.yaml │ │ │ │ └── t5-prompt.yaml │ │ │ ├── wic/ │ │ │ │ ├── default.yaml │ │ │ │ └── t5-prompt.yaml │ │ │ └── wsc/ │ │ │ ├── default.yaml │ │ │ ├── preprocess_wsc.py │ │ │ ├── t5-prompt.yaml │ │ │ └── t5_utils.py │ │ ├── swag/ │ │ │ ├── README.md │ │ │ └── swag.yaml │ │ ├── swde/ │ │ │ ├── 
README.md │ │ │ ├── swde.yaml │ │ │ └── task.py │ │ ├── tinyBenchmarks/ │ │ │ ├── README.md │ │ │ ├── agg_functions.py │ │ │ ├── tinyArc.yaml │ │ │ ├── tinyBenchmarks.yaml │ │ │ ├── tinyGSM8k.yaml │ │ │ ├── tinyHellaswag.yaml │ │ │ ├── tinyMMLU.yaml │ │ │ ├── tinyTruthfulQA_mc1.yaml │ │ │ ├── tinyTruthfulQA_mc2.yaml │ │ │ ├── tinyWinogrande.yaml │ │ │ ├── utils_hellaswag.py │ │ │ ├── utils_truthfulqa.py │ │ │ └── utils_winogrande.py │ │ ├── tmlu/ │ │ │ ├── README.md │ │ │ ├── default/ │ │ │ │ ├── _default_template_yaml │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── _tmlu.yaml │ │ │ │ ├── tmlu_AST_biology.yaml │ │ │ │ ├── tmlu_AST_chemistry.yaml │ │ │ │ ├── tmlu_AST_chinese.yaml │ │ │ │ ├── tmlu_AST_civics.yaml │ │ │ │ ├── tmlu_AST_geography.yaml │ │ │ │ ├── tmlu_AST_history.yaml │ │ │ │ ├── tmlu_CAP_biology.yaml │ │ │ │ ├── tmlu_CAP_chemistry.yaml │ │ │ │ ├── tmlu_CAP_chinese.yaml │ │ │ │ ├── tmlu_CAP_civics.yaml │ │ │ │ ├── tmlu_CAP_earth_science.yaml │ │ │ │ ├── tmlu_CAP_geography.yaml │ │ │ │ ├── tmlu_CAP_history.yaml │ │ │ │ ├── tmlu_GSAT_biology.yaml │ │ │ │ ├── tmlu_GSAT_chemistry.yaml │ │ │ │ ├── tmlu_GSAT_chinese.yaml │ │ │ │ ├── tmlu_GSAT_civics.yaml │ │ │ │ ├── tmlu_GSAT_earth_science.yaml │ │ │ │ ├── tmlu_GSAT_geography.yaml │ │ │ │ ├── tmlu_GSAT_history.yaml │ │ │ │ ├── tmlu_accountant.yaml │ │ │ │ ├── tmlu_basic_traditional_chinese_medicine.yaml │ │ │ │ ├── tmlu_clinical_psychologist.yaml │ │ │ │ ├── tmlu_clinical_traditional_chinese_medicine.yaml │ │ │ │ ├── tmlu_driving_rule.yaml │ │ │ │ ├── tmlu_lawyer_qualification.yaml │ │ │ │ ├── tmlu_nutritionist.yaml │ │ │ │ ├── tmlu_taiwan_tourist_resources.yaml │ │ │ │ ├── tmlu_teacher_qualification.yaml │ │ │ │ ├── tmlu_tour_guide.yaml │ │ │ │ ├── tmlu_tour_leader.yaml │ │ │ │ └── utils.py │ │ │ └── subject.tsv │ │ ├── tmmluplus/ │ │ │ ├── README.md │ │ │ ├── default/ │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── _tmmluplus.yaml │ │ │ │ ├── _tmmluplus_STEM.yaml │ │ │ │ ├── _tmmluplus_humanities.yaml │ │ │ │ ├── 
_tmmluplus_other.yaml │ │ │ │ ├── _tmmluplus_social_sciences.yaml │ │ │ │ ├── _tmmluplus_template_yaml │ │ │ │ ├── tmmluplus_accounting.yaml │ │ │ │ ├── tmmluplus_administrative_law.yaml │ │ │ │ ├── tmmluplus_advance_chemistry.yaml │ │ │ │ ├── tmmluplus_agriculture.yaml │ │ │ │ ├── tmmluplus_anti_money_laundering.yaml │ │ │ │ ├── tmmluplus_auditing.yaml │ │ │ │ ├── tmmluplus_basic_medical_science.yaml │ │ │ │ ├── tmmluplus_business_management.yaml │ │ │ │ ├── tmmluplus_chinese_language_and_literature.yaml │ │ │ │ ├── tmmluplus_clinical_psychology.yaml │ │ │ │ ├── tmmluplus_computer_science.yaml │ │ │ │ ├── tmmluplus_culinary_skills.yaml │ │ │ │ ├── tmmluplus_dentistry.yaml │ │ │ │ ├── tmmluplus_economics.yaml │ │ │ │ ├── tmmluplus_education.yaml │ │ │ │ ├── tmmluplus_education_(profession_level).yaml │ │ │ │ ├── tmmluplus_educational_psychology.yaml │ │ │ │ ├── tmmluplus_engineering_math.yaml │ │ │ │ ├── tmmluplus_finance_banking.yaml │ │ │ │ ├── tmmluplus_financial_analysis.yaml │ │ │ │ ├── tmmluplus_fire_science.yaml │ │ │ │ ├── tmmluplus_general_principles_of_law.yaml │ │ │ │ ├── tmmluplus_geography_of_taiwan.yaml │ │ │ │ ├── tmmluplus_human_behavior.yaml │ │ │ │ ├── tmmluplus_insurance_studies.yaml │ │ │ │ ├── tmmluplus_introduction_to_law.yaml │ │ │ │ ├── tmmluplus_jce_humanities.yaml │ │ │ │ ├── tmmluplus_junior_chemistry.yaml │ │ │ │ ├── tmmluplus_junior_chinese_exam.yaml │ │ │ │ ├── tmmluplus_junior_math_exam.yaml │ │ │ │ ├── tmmluplus_junior_science_exam.yaml │ │ │ │ ├── tmmluplus_junior_social_studies.yaml │ │ │ │ ├── tmmluplus_linear_algebra.yaml │ │ │ │ ├── tmmluplus_logic_reasoning.yaml │ │ │ │ ├── tmmluplus_macroeconomics.yaml │ │ │ │ ├── tmmluplus_management_accounting.yaml │ │ │ │ ├── tmmluplus_marketing_management.yaml │ │ │ │ ├── tmmluplus_mechanical.yaml │ │ │ │ ├── tmmluplus_music.yaml │ │ │ │ ├── tmmluplus_national_protection.yaml │ │ │ │ ├── tmmluplus_nautical_science.yaml │ │ │ │ ├── 
tmmluplus_occupational_therapy_for_psychological_disorders.yaml │ │ │ │ ├── tmmluplus_official_document_management.yaml │ │ │ │ ├── tmmluplus_optometry.yaml │ │ │ │ ├── tmmluplus_organic_chemistry.yaml │ │ │ │ ├── tmmluplus_pharmacology.yaml │ │ │ │ ├── tmmluplus_pharmacy.yaml │ │ │ │ ├── tmmluplus_physical_education.yaml │ │ │ │ ├── tmmluplus_physics.yaml │ │ │ │ ├── tmmluplus_politic_science.yaml │ │ │ │ ├── tmmluplus_real_estate.yaml │ │ │ │ ├── tmmluplus_secondary_physics.yaml │ │ │ │ ├── tmmluplus_statistics_and_machine_learning.yaml │ │ │ │ ├── tmmluplus_taiwanese_hokkien.yaml │ │ │ │ ├── tmmluplus_taxation.yaml │ │ │ │ ├── tmmluplus_technical.yaml │ │ │ │ ├── tmmluplus_three_principles_of_people.yaml │ │ │ │ ├── tmmluplus_trade.yaml │ │ │ │ ├── tmmluplus_traditional_chinese_medicine_clinical_medicine.yaml │ │ │ │ ├── tmmluplus_trust_practice.yaml │ │ │ │ ├── tmmluplus_ttqav2.yaml │ │ │ │ ├── tmmluplus_tve_chinese_language.yaml │ │ │ │ ├── tmmluplus_tve_design.yaml │ │ │ │ ├── tmmluplus_tve_mathematics.yaml │ │ │ │ ├── tmmluplus_tve_natural_sciences.yaml │ │ │ │ ├── tmmluplus_veterinary_pathology.yaml │ │ │ │ ├── tmmluplus_veterinary_pharmacology.yaml │ │ │ │ └── utils.py │ │ │ └── subject.tsv │ │ ├── toxigen/ │ │ │ ├── README.md │ │ │ ├── toxigen.yaml │ │ │ └── utils.py │ │ ├── translation/ │ │ │ ├── README.md │ │ │ ├── iwslt2017_ar-en.yaml │ │ │ ├── iwslt2017_en-ar.yaml │ │ │ ├── utils.py │ │ │ ├── wmt14_en-fr.yaml │ │ │ ├── wmt14_fr-en.yaml │ │ │ ├── wmt16_de-en.yaml │ │ │ ├── wmt16_en-de.yaml │ │ │ ├── wmt16_en-ro.yaml │ │ │ ├── wmt16_ro-en.yaml │ │ │ └── wmt_common_yaml │ │ ├── triviaqa/ │ │ │ ├── README.md │ │ │ └── default.yaml │ │ ├── truthfulqa/ │ │ │ ├── README.md │ │ │ ├── truthfulqa_gen.yaml │ │ │ ├── truthfulqa_mc1.yaml │ │ │ ├── truthfulqa_mc2.yaml │ │ │ └── utils.py │ │ ├── truthfulqa-multi/ │ │ │ ├── README.md │ │ │ ├── truthfulqa-multi_gen_ca.yaml │ │ │ ├── truthfulqa-multi_gen_common │ │ │ ├── truthfulqa-multi_gen_en.yaml │ │ │ ├── 
truthfulqa-multi_gen_es.yaml │ │ │ ├── truthfulqa-multi_gen_eu.yaml │ │ │ ├── truthfulqa-multi_gen_gl.yaml │ │ │ ├── truthfulqa-multi_mc1_ca.yaml │ │ │ ├── truthfulqa-multi_mc1_en.yaml │ │ │ ├── truthfulqa-multi_mc1_es.yaml │ │ │ ├── truthfulqa-multi_mc1_eu.yaml │ │ │ ├── truthfulqa-multi_mc1_gl.yaml │ │ │ ├── truthfulqa-multi_mc2_ca.yaml │ │ │ ├── truthfulqa-multi_mc2_en.yaml │ │ │ ├── truthfulqa-multi_mc2_es.yaml │ │ │ ├── truthfulqa-multi_mc2_eu.yaml │ │ │ ├── truthfulqa-multi_mc2_gl.yaml │ │ │ ├── truthfulqa-multi_mc_common │ │ │ └── utils.py │ │ ├── turblimp/ │ │ │ ├── README.md │ │ │ ├── _template_yaml │ │ │ ├── anaphor_agreement.yaml │ │ │ ├── argument_structure_ditransitive.yaml │ │ │ ├── argument_structure_transitive.yaml │ │ │ ├── binding.yaml │ │ │ ├── determiners.yaml │ │ │ ├── ellipsis.yaml │ │ │ ├── irregular_forms.yaml │ │ │ ├── island_effects.yaml │ │ │ ├── nominalization.yaml │ │ │ ├── npi_licensing.yaml │ │ │ ├── passives.yaml │ │ │ ├── quantifiers.yaml │ │ │ ├── relative_clauses.yaml │ │ │ ├── scrambling.yaml │ │ │ ├── subject_agreement.yaml │ │ │ ├── suspended_affixation.yaml │ │ │ └── turblimp_group.yaml │ │ ├── turkishmmlu/ │ │ │ ├── README.md │ │ │ ├── config/ │ │ │ │ ├── Biology.yaml │ │ │ │ ├── Chemistry.yaml │ │ │ │ ├── Geography.yaml │ │ │ │ ├── History.yaml │ │ │ │ ├── Mathematics.yaml │ │ │ │ ├── Philosophy.yaml │ │ │ │ ├── Physics.yaml │ │ │ │ ├── Religion_and_Ethics.yaml │ │ │ │ ├── Turkish_Language_and_Literature.yaml │ │ │ │ └── _turkishmmlu_default_yaml │ │ │ └── config_cot/ │ │ │ ├── Biology.yaml │ │ │ ├── Chemistry.yaml │ │ │ ├── Geography.yaml │ │ │ ├── History.yaml │ │ │ ├── Mathematics.yaml │ │ │ ├── Philosophy.yaml │ │ │ ├── Physics.yaml │ │ │ ├── Religion_and_Ethics.yaml │ │ │ ├── Turkish_Language_and_Literature.yaml │ │ │ └── _turkishmmlu_cot_default_yaml │ │ ├── ulqa/ │ │ │ ├── README.md │ │ │ ├── celep1.yaml │ │ │ ├── celep2.yaml │ │ │ ├── lambada_uyghur.yaml │ │ │ ├── uleval.yaml │ │ │ ├── ulqa.yaml │ │ │ ├── ulqa_.yaml 
│ │ │ └── ulut/ │ │ │ ├── nug.yaml │ │ │ ├── ulut.yaml │ │ │ ├── wag.yaml │ │ │ ├── wsm.yaml │ │ │ ├── wub.yaml │ │ │ └── wum.yaml │ │ ├── unitxt/ │ │ │ ├── 20_newsgroups.yaml │ │ │ ├── README.md │ │ │ ├── ag_news.yaml │ │ │ ├── argument_topic.yaml │ │ │ ├── atis.yaml │ │ │ ├── banking77.yaml │ │ │ ├── claim_stance_topic.yaml │ │ │ ├── cnn_dailymail.yaml │ │ │ ├── coedit_gec.yaml │ │ │ ├── dbpedia_14.yaml │ │ │ ├── doc_vqa.yaml │ │ │ ├── ethos_binary.yaml │ │ │ ├── financial_tweets.yaml │ │ │ ├── law_stack_exchange.yaml │ │ │ ├── ledgar.yaml │ │ │ ├── medical_abstracts.yaml │ │ │ ├── stsb.yaml │ │ │ ├── task.py │ │ │ ├── unfair_tos.yaml │ │ │ ├── unitxt │ │ │ ├── unitxt_multimodal │ │ │ ├── xsum.yaml │ │ │ └── yahoo_answers_topics.yaml │ │ ├── unscramble/ │ │ │ ├── README.md │ │ │ ├── anagrams1.yaml │ │ │ ├── anagrams2.yaml │ │ │ ├── cycle_letters.yaml │ │ │ ├── random_insertion.yaml │ │ │ └── reversed_words.yaml │ │ ├── webqs/ │ │ │ ├── README.md │ │ │ ├── utils.py │ │ │ └── webqs.yaml │ │ ├── wikitext/ │ │ │ ├── README.md │ │ │ ├── preprocess_wikitext.py │ │ │ └── wikitext.yaml │ │ ├── winogender/ │ │ │ ├── README.md │ │ │ ├── utils.py │ │ │ ├── winogender.yaml │ │ │ ├── winogender_female.yaml │ │ │ ├── winogender_gotcha.yaml │ │ │ ├── winogender_gotcha_female.yaml │ │ │ ├── winogender_gotcha_male.yaml │ │ │ ├── winogender_male.yaml │ │ │ └── winogender_neutral.yaml │ │ ├── winogrande/ │ │ │ ├── README.md │ │ │ ├── default.yaml │ │ │ └── preprocess_winogrande.py │ │ ├── wmdp/ │ │ │ ├── README.md │ │ │ ├── _default_template_yaml │ │ │ ├── _wmdp.yaml │ │ │ ├── wmdp_bio.yaml │ │ │ ├── wmdp_chem.yaml │ │ │ └── wmdp_cyber.yaml │ │ ├── wmt2016/ │ │ │ ├── README.md │ │ │ ├── metrics.py │ │ │ └── ro_en-t5_prompt.yaml │ │ ├── wsc273/ │ │ │ ├── README.md │ │ │ ├── default.yaml │ │ │ └── utils.py │ │ ├── xcopa/ │ │ │ ├── README.md │ │ │ ├── _xcopa.yaml │ │ │ ├── default_et.yaml │ │ │ ├── default_ht.yaml │ │ │ ├── default_id.yaml │ │ │ ├── default_it.yaml │ │ │ ├── 
default_qu.yaml │ │ │ ├── default_sw.yaml │ │ │ ├── default_ta.yaml │ │ │ ├── default_th.yaml │ │ │ ├── default_tr.yaml │ │ │ ├── default_vi.yaml │ │ │ ├── default_zh.yaml │ │ │ └── utils.py │ │ ├── xnli/ │ │ │ ├── README.md │ │ │ ├── _xnli.yaml │ │ │ ├── utils.py │ │ │ ├── xnli_ar.yaml │ │ │ ├── xnli_bg.yaml │ │ │ ├── xnli_common_yaml │ │ │ ├── xnli_de.yaml │ │ │ ├── xnli_el.yaml │ │ │ ├── xnli_en.yaml │ │ │ ├── xnli_es.yaml │ │ │ ├── xnli_fr.yaml │ │ │ ├── xnli_hi.yaml │ │ │ ├── xnli_ru.yaml │ │ │ ├── xnli_sw.yaml │ │ │ ├── xnli_th.yaml │ │ │ ├── xnli_tr.yaml │ │ │ ├── xnli_ur.yaml │ │ │ ├── xnli_vi.yaml │ │ │ └── xnli_zh.yaml │ │ ├── xnli_eu/ │ │ │ ├── README.md │ │ │ ├── xnli_common_yaml │ │ │ ├── xnli_eu.yaml │ │ │ ├── xnli_eu_mt.yaml │ │ │ └── xnli_eu_native.yaml │ │ ├── xquad/ │ │ │ ├── README.md │ │ │ ├── utils.py │ │ │ ├── xquad_ar.yaml │ │ │ ├── xquad_common_yaml │ │ │ ├── xquad_de.yaml │ │ │ ├── xquad_el.yaml │ │ │ ├── xquad_en.yaml │ │ │ ├── xquad_es.yaml │ │ │ ├── xquad_hi.yaml │ │ │ ├── xquad_ro.yaml │ │ │ ├── xquad_ru.yaml │ │ │ ├── xquad_th.yaml │ │ │ ├── xquad_tr.yaml │ │ │ ├── xquad_vi.yaml │ │ │ └── xquad_zh.yaml │ │ ├── xstorycloze/ │ │ │ ├── README.md │ │ │ ├── _xstorycloze.yaml │ │ │ ├── default_ar.yaml │ │ │ ├── default_en.yaml │ │ │ ├── default_es.yaml │ │ │ ├── default_eu.yaml │ │ │ ├── default_hi.yaml │ │ │ ├── default_id.yaml │ │ │ ├── default_my.yaml │ │ │ ├── default_ru.yaml │ │ │ ├── default_sw.yaml │ │ │ ├── default_te.yaml │ │ │ └── default_zh.yaml │ │ ├── xwinograd/ │ │ │ ├── README.md │ │ │ ├── _xwinograd.yaml │ │ │ ├── utils.py │ │ │ ├── xwinograd_common_yaml │ │ │ ├── xwinograd_en.yaml │ │ │ ├── xwinograd_fr.yaml │ │ │ ├── xwinograd_jp.yaml │ │ │ ├── xwinograd_pt.yaml │ │ │ ├── xwinograd_ru.yaml │ │ │ └── xwinograd_zh.yaml │ │ └── zhoblimp/ │ │ ├── BA_BEI_subj_drop.yaml │ │ ├── BA_deletion.yaml │ │ ├── BA_duplicate_argument.yaml │ │ ├── BA_inversion.yaml │ │ ├── BA_meiba.yaml │ │ ├── BA_negation.yaml │ │ ├── 
BA_no_progressive.yaml │ │ ├── BA_no_stative_verb.yaml │ │ ├── BA_suo_adverbial_a.yaml │ │ ├── BA_suo_adverbial_b.yaml │ │ ├── BA_verb_le_a.yaml │ │ ├── BA_verb_le_b.yaml │ │ ├── BEI_construction_a.yaml │ │ ├── BEI_construction_b.yaml │ │ ├── BEI_deletion.yaml │ │ ├── BEI_preposition.yaml │ │ ├── PN_numP_a.yaml │ │ ├── PN_numP_b.yaml │ │ ├── README.md │ │ ├── _template_yaml │ │ ├── adjective_transitive_dui.yaml │ │ ├── agent_animacy_adv.yaml │ │ ├── agent_animacy_passive.yaml │ │ ├── agent_animacy_subj.yaml │ │ ├── agent_causative.yaml │ │ ├── agent_deletion.yaml │ │ ├── anaphor_gender_agreement.yaml │ │ ├── anaphor_number_agreement.yaml │ │ ├── causative_shi_ba.yaml │ │ ├── classifier_noun_agreement.yaml │ │ ├── classifier_noun_agreement_no_gap.yaml │ │ ├── classifier_noun_subj.yaml │ │ ├── control_modal_vs_raising_modal.yaml │ │ ├── ellipsis_adj.yaml │ │ ├── ellipsis_double_object.yaml │ │ ├── ellipsis_n_bar_class.yaml │ │ ├── existential_there_subject_raising.yaml │ │ ├── fci_renhe_dou.yaml │ │ ├── fci_renhe_prepP.yaml │ │ ├── fci_renhe_ruguo.yaml │ │ ├── fci_renhe_subj.yaml │ │ ├── fci_renhe_suoyou.yaml │ │ ├── intransitive_double_obj.yaml │ │ ├── intransitive_no_obj.yaml │ │ ├── left_adverbial_b.yaml │ │ ├── left_adverbial_d.yaml │ │ ├── left_adverbial_e.yaml │ │ ├── left_adverbial_negation.yaml │ │ ├── left_dou.yaml │ │ ├── modal_raising_hui.yaml │ │ ├── modal_raising_topicalization.yaml │ │ ├── nominal_definite_men.yaml │ │ ├── nominal_modal_insertion.yaml │ │ ├── noun_adjective_shi.yaml │ │ ├── noun_phrase_conjunction_jian.yaml │ │ ├── npi_renhe_A_not_A_question.yaml │ │ ├── npi_renhe_conditional.yaml │ │ ├── npi_renhe_neg_scope_locP.yaml │ │ ├── npi_renhe_neg_scope_subj.yaml │ │ ├── npi_renhe_wh_question_obj.yaml │ │ ├── npi_renhe_wh_question_subj.yaml │ │ ├── passive_agent_deletion_long_left.yaml │ │ ├── passive_agent_deletion_long_right_a.yaml │ │ ├── passive_agent_deletion_long_right_b.yaml │ │ ├── passive_agent_deletion_short.yaml │ │ ├── 
passive_body_part.yaml │ │ ├── passive_intransitive.yaml │ │ ├── passive_no_adj.yaml │ │ ├── passive_suo.yaml │ │ ├── plural_cardinal_men_a.yaml │ │ ├── plural_cardinal_men_b.yaml │ │ ├── preposition_deletion.yaml │ │ ├── preposition_insertion.yaml │ │ ├── principle_A_c_command.yaml │ │ ├── principle_A_c_command_number.yaml │ │ ├── principle_A_domain.yaml │ │ ├── principle_A_domain_number.yaml │ │ ├── question_A_not_A.yaml │ │ ├── question_A_not_A_daodi_a.yaml │ │ ├── question_A_not_A_daodi_b.yaml │ │ ├── question_A_not_A_indirect.yaml │ │ ├── question_V_not_VP_1.yaml │ │ ├── question_V_not_VP_2.yaml │ │ ├── question_daodi_nandao_1.yaml │ │ ├── question_daodi_nandao_2.yaml │ │ ├── question_daodi_nandao_A_not_A_intran.yaml │ │ ├── question_daodi_nandao_A_not_A_tran.yaml │ │ ├── question_daodi_negation.yaml │ │ ├── question_nandao_negation.yaml │ │ ├── question_nandao_raising_1_a.yaml │ │ ├── question_nandao_raising_1_b.yaml │ │ ├── question_nandao_raising_2.yaml │ │ ├── question_nandao_raising_3.yaml │ │ ├── question_nandao_scope_1.yaml │ │ ├── question_nandao_scope_2.yaml │ │ ├── question_particle_daodi_choice_intran.yaml │ │ ├── question_particle_daodi_choice_tran.yaml │ │ ├── question_particle_nandao.yaml │ │ ├── relative_operator_intepretation.yaml │ │ ├── relative_operator_who.yaml │ │ ├── relativization_movement_no_gap.yaml │ │ ├── relativization_movement_when_where.yaml │ │ ├── renhe_no_episodic_sentences.yaml │ │ ├── renhe_no_superordinate_negation.yaml │ │ ├── renhe_non_factive_verb.yaml │ │ ├── right_yijing_a.yaml │ │ ├── right_yijing_b.yaml │ │ ├── singular_PN_but_plural_pron.yaml │ │ ├── superlative_quantifiers_1.yaml │ │ ├── superlative_quantifiers_2.yaml │ │ ├── topicalization_OSV.yaml │ │ ├── topicalization_OSV_mei.yaml │ │ ├── topicalization_SOV.yaml │ │ ├── topicalization_SOV_mei.yaml │ │ ├── verb_negation_particle.yaml │ │ ├── verb_phrase_left_adverbial.yaml │ │ ├── verb_phrase_left_negation.yaml │ │ ├── ya_insertion.yaml │ │ ├── 
you_quantifier_adj.yaml │ │ ├── you_yige.yaml │ │ └── zhoblimp_group.yaml │ └── utils.py ├── pile_statistics.json ├── pyproject.toml ├── scripts/ │ ├── __init__.py │ ├── build_benchmark.py │ ├── clean_training_data/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── compress_and_package.py │ │ ├── generate_13_grams.py │ │ ├── investigate_pile.py │ │ ├── janitor_util.cpp │ │ ├── process_sorted_buckets.py │ │ └── sort_13_gram_buckets.py │ ├── get_prompts.py │ ├── make_gpt2_test_cases.py │ ├── make_table_results.py │ ├── make_table_tasks.py │ ├── model_comparator.py │ ├── regression.py │ ├── requests_caching.py │ ├── write_out.py │ └── zeno_visualize.py ├── templates/ │ └── new_yaml_task/ │ ├── README.md │ └── blank_yaml.yaml └── tests/ ├── __init__.py ├── conftest.py ├── models/ │ ├── test_api.py │ ├── test_bos_handling.py │ ├── test_gguf.py │ ├── test_gptqmodel.py │ ├── test_hf_steered.py │ ├── test_huggingface.py │ ├── test_model_utils.py │ ├── test_openvino.py │ ├── test_sglang.py │ ├── test_vllm.py │ └── test_vllm_context_length.py ├── scripts/ │ └── test_zeno_visualize.py ├── test_aggregation_pipeline.py ├── test_cli_subcommands.py ├── test_configs/ │ ├── empty_group.yaml │ ├── group.yaml │ ├── group_ref_parent.yaml │ ├── include_base.yaml │ ├── include_group.yaml │ ├── include_task_fs0.yaml │ ├── include_task_fs1.yaml │ ├── include_task_fs5.yaml │ ├── inline_subgroup.yaml │ ├── mixed_members_group.yaml │ ├── propagation_group.yaml │ ├── simple_task.yaml │ ├── simple_task_b.yaml │ ├── tag_parent_group.yaml │ ├── tag_subgroup.yaml │ ├── tag_task_1.yaml │ ├── tag_task_2.yaml │ ├── tag_task_3.yaml │ └── test_data.json ├── test_evaluator.py ├── test_evaluator_utils.py ├── test_fewshot_context.py ├── test_group.py ├── test_janitor.py ├── test_metrics.py ├── test_misc.py ├── test_prompt.py ├── test_registry.py ├── test_requests_caching.py ├── test_samplers.py ├── test_task_manager.py ├── test_tasks.py ├── test_unitxt_tasks.py ├── test_utils.py ├── testconfigs/ │ ├── 
arc_easy_unitxt.yaml │ ├── arc_test.yaml │ ├── sae_lens_intervention.csv │ └── sparsify_intervention.csv ├── testdata/ │ ├── ai2_arc_10_hf_pretrained-EleutherAI-pythia-14m-deduped-dtype-float32-device-cpu.txt │ ├── anagrams1-v0-greedy_until │ ├── anagrams1-v0-res.json │ ├── anagrams2-v0-greedy_until │ ├── anagrams2-v0-res.json │ ├── anli_r1-v0-loglikelihood │ ├── anli_r1-v0-res.json │ ├── anli_r2-v0-loglikelihood │ ├── anli_r2-v0-res.json │ ├── anli_r3-v0-loglikelihood │ ├── anli_r3-v0-res.json │ ├── arc_challenge-v0-loglikelihood │ ├── arc_challenge-v0-res.json │ ├── arc_challenge-v2.0-loglikelihood │ ├── arc_challenge-v2.0-res.json │ ├── arc_easy-v0-loglikelihood │ ├── arc_easy-v0-res.json │ ├── arithmetic_1dc-v0-loglikelihood │ ├── arithmetic_1dc-v0-res.json │ ├── arithmetic_2da-v0-loglikelihood │ ├── arithmetic_2da-v0-res.json │ ├── arithmetic_2dm-v0-loglikelihood │ ├── arithmetic_2dm-v0-res.json │ ├── arithmetic_2ds-v0-loglikelihood │ ├── arithmetic_2ds-v0-res.json │ ├── arithmetic_3da-v0-loglikelihood │ ├── arithmetic_3da-v0-res.json │ ├── arithmetic_3ds-v0-loglikelihood │ ├── arithmetic_3ds-v0-res.json │ ├── arithmetic_4da-v0-loglikelihood │ ├── arithmetic_4da-v0-res.json │ ├── arithmetic_4ds-v0-loglikelihood │ ├── arithmetic_4ds-v0-res.json │ ├── arithmetic_5da-v0-loglikelihood │ ├── arithmetic_5da-v0-res.json │ ├── arithmetic_5ds-v0-loglikelihood │ ├── arithmetic_5ds-v0-res.json │ ├── blimp_adjunct_island-v0-loglikelihood │ ├── blimp_adjunct_island-v0-res.json │ ├── blimp_anaphor_gender_agreement-v0-loglikelihood │ ├── blimp_anaphor_gender_agreement-v0-res.json │ ├── blimp_anaphor_number_agreement-v0-loglikelihood │ ├── blimp_anaphor_number_agreement-v0-res.json │ ├── blimp_animate_subject_passive-v0-loglikelihood │ ├── blimp_animate_subject_passive-v0-res.json │ ├── blimp_animate_subject_trans-v0-loglikelihood │ ├── blimp_animate_subject_trans-v0-res.json │ ├── blimp_causative-v0-loglikelihood │ ├── blimp_causative-v0-res.json │ ├── 
blimp_complex_NP_island-v0-loglikelihood │ ├── blimp_complex_NP_island-v0-res.json │ ├── blimp_coordinate_structure_constraint_complex_left_branch-v0-loglikelihood │ ├── blimp_coordinate_structure_constraint_complex_left_branch-v0-res.json │ ├── blimp_coordinate_structure_constraint_object_extraction-v0-loglikelihood │ ├── blimp_coordinate_structure_constraint_object_extraction-v0-res.json │ ├── blimp_determiner_noun_agreement_1-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_1-v0-res.json │ ├── blimp_determiner_noun_agreement_2-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_2-v0-res.json │ ├── blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_irregular_1-v0-res.json │ ├── blimp_determiner_noun_agreement_irregular_2-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_irregular_2-v0-res.json │ ├── blimp_determiner_noun_agreement_with_adj_2-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_with_adj_2-v0-res.json │ ├── blimp_determiner_noun_agreement_with_adj_irregular_1-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_with_adj_irregular_1-v0-res.json │ ├── blimp_determiner_noun_agreement_with_adj_irregular_2-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json │ ├── blimp_determiner_noun_agreement_with_adjective_1-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_with_adjective_1-v0-res.json │ ├── blimp_distractor_agreement_relational_noun-v0-loglikelihood │ ├── blimp_distractor_agreement_relational_noun-v0-res.json │ ├── blimp_distractor_agreement_relative_clause-v0-loglikelihood │ ├── blimp_distractor_agreement_relative_clause-v0-res.json │ ├── blimp_drop_argument-v0-loglikelihood │ ├── blimp_drop_argument-v0-res.json │ ├── blimp_ellipsis_n_bar_1-v0-loglikelihood │ ├── blimp_ellipsis_n_bar_1-v0-res.json │ ├── blimp_ellipsis_n_bar_2-v0-loglikelihood │ ├── blimp_ellipsis_n_bar_2-v0-res.json │ ├── blimp_existential_there_object_raising-v0-loglikelihood 
│ ├── blimp_existential_there_object_raising-v0-res.json │ ├── blimp_existential_there_quantifiers_1-v0-loglikelihood │ ├── blimp_existential_there_quantifiers_1-v0-res.json │ ├── blimp_existential_there_quantifiers_2-v0-loglikelihood │ ├── blimp_existential_there_quantifiers_2-v0-res.json │ ├── blimp_existential_there_subject_raising-v0-loglikelihood │ ├── blimp_existential_there_subject_raising-v0-res.json │ ├── blimp_expletive_it_object_raising-v0-loglikelihood │ ├── blimp_expletive_it_object_raising-v0-res.json │ ├── blimp_inchoative-v0-loglikelihood │ ├── blimp_inchoative-v0-res.json │ ├── blimp_intransitive-v0-loglikelihood │ ├── blimp_intransitive-v0-res.json │ ├── blimp_irregular_past_participle_adjectives-v0-loglikelihood │ ├── blimp_irregular_past_participle_adjectives-v0-res.json │ ├── blimp_irregular_past_participle_verbs-v0-loglikelihood │ ├── blimp_irregular_past_participle_verbs-v0-res.json │ ├── blimp_irregular_plural_subject_verb_agreement_1-v0-loglikelihood │ ├── blimp_irregular_plural_subject_verb_agreement_1-v0-res.json │ ├── blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood │ ├── blimp_irregular_plural_subject_verb_agreement_2-v0-res.json │ ├── blimp_left_branch_island_echo_question-v0-loglikelihood │ ├── blimp_left_branch_island_echo_question-v0-res.json │ ├── blimp_left_branch_island_simple_question-v0-loglikelihood │ ├── blimp_left_branch_island_simple_question-v0-res.json │ ├── blimp_matrix_question_npi_licensor_present-v0-loglikelihood │ ├── blimp_matrix_question_npi_licensor_present-v0-res.json │ ├── blimp_npi_present_1-v0-loglikelihood │ ├── blimp_npi_present_1-v0-res.json │ ├── blimp_npi_present_2-v0-loglikelihood │ ├── blimp_npi_present_2-v0-res.json │ ├── blimp_only_npi_licensor_present-v0-loglikelihood │ ├── blimp_only_npi_licensor_present-v0-res.json │ ├── blimp_only_npi_scope-v0-loglikelihood │ ├── blimp_only_npi_scope-v0-res.json │ ├── blimp_passive_1-v0-loglikelihood │ ├── blimp_passive_1-v0-res.json │ ├── 
blimp_passive_2-v0-loglikelihood │ ├── blimp_passive_2-v0-res.json │ ├── blimp_principle_A_c_command-v0-loglikelihood │ ├── blimp_principle_A_c_command-v0-res.json │ ├── blimp_principle_A_case_1-v0-loglikelihood │ ├── blimp_principle_A_case_1-v0-res.json │ ├── blimp_principle_A_case_2-v0-loglikelihood │ ├── blimp_principle_A_case_2-v0-res.json │ ├── blimp_principle_A_domain_1-v0-loglikelihood │ ├── blimp_principle_A_domain_1-v0-res.json │ ├── blimp_principle_A_domain_2-v0-loglikelihood │ ├── blimp_principle_A_domain_2-v0-res.json │ ├── blimp_principle_A_domain_3-v0-loglikelihood │ ├── blimp_principle_A_domain_3-v0-res.json │ ├── blimp_principle_A_reconstruction-v0-loglikelihood │ ├── blimp_principle_A_reconstruction-v0-res.json │ ├── blimp_regular_plural_subject_verb_agreement_1-v0-loglikelihood │ ├── blimp_regular_plural_subject_verb_agreement_1-v0-res.json │ ├── blimp_regular_plural_subject_verb_agreement_2-v0-loglikelihood │ ├── blimp_regular_plural_subject_verb_agreement_2-v0-res.json │ ├── blimp_sentential_negation_npi_licensor_present-v0-loglikelihood │ ├── blimp_sentential_negation_npi_licensor_present-v0-res.json │ ├── blimp_sentential_negation_npi_scope-v0-loglikelihood │ ├── blimp_sentential_negation_npi_scope-v0-res.json │ ├── blimp_sentential_subject_island-v0-loglikelihood │ ├── blimp_sentential_subject_island-v0-res.json │ ├── blimp_superlative_quantifiers_1-v0-loglikelihood │ ├── blimp_superlative_quantifiers_1-v0-res.json │ ├── blimp_superlative_quantifiers_2-v0-loglikelihood │ ├── blimp_superlative_quantifiers_2-v0-res.json │ ├── blimp_tough_vs_raising_1-v0-loglikelihood │ ├── blimp_tough_vs_raising_1-v0-res.json │ ├── blimp_tough_vs_raising_2-v0-loglikelihood │ ├── blimp_tough_vs_raising_2-v0-res.json │ ├── blimp_transitive-v0-loglikelihood │ ├── blimp_transitive-v0-res.json │ ├── blimp_wh_island-v0-loglikelihood │ ├── blimp_wh_island-v0-res.json │ ├── blimp_wh_questions_object_gap-v0-loglikelihood │ ├── blimp_wh_questions_object_gap-v0-res.json │ 
├── blimp_wh_questions_subject_gap-v0-loglikelihood │ ├── blimp_wh_questions_subject_gap-v0-res.json │ ├── blimp_wh_questions_subject_gap_long_distance-v0-loglikelihood │ ├── blimp_wh_questions_subject_gap_long_distance-v0-res.json │ ├── blimp_wh_vs_that_no_gap-v0-loglikelihood │ ├── blimp_wh_vs_that_no_gap-v0-res.json │ ├── blimp_wh_vs_that_no_gap_long_distance-v0-loglikelihood │ ├── blimp_wh_vs_that_no_gap_long_distance-v0-res.json │ ├── blimp_wh_vs_that_with_gap-v0-loglikelihood │ ├── blimp_wh_vs_that_with_gap-v0-res.json │ ├── blimp_wh_vs_that_with_gap_long_distance-v0-loglikelihood │ ├── blimp_wh_vs_that_with_gap_long_distance-v0-res.json │ ├── boolq-v0-loglikelihood │ ├── boolq-v0-res.json │ ├── boolq-v1-loglikelihood │ ├── boolq-v1-res.json │ ├── cb-v0-loglikelihood │ ├── cb-v0-res.json │ ├── cb-v1-loglikelihood │ ├── cb-v1-res.json │ ├── cola-v0-loglikelihood │ ├── cola-v0-res.json │ ├── copa-v0-loglikelihood │ ├── copa-v0-res.json │ ├── coqa-v0-greedy_until │ ├── coqa-v0-res.json │ ├── coqa-v1-greedy_until │ ├── coqa-v1-res.json │ ├── crows_pairs_english-v0-loglikelihood │ ├── crows_pairs_english-v0-res.json │ ├── crows_pairs_english_age-v0-loglikelihood │ ├── crows_pairs_english_age-v0-res.json │ ├── crows_pairs_english_autre-v0-loglikelihood │ ├── crows_pairs_english_autre-v0-res.json │ ├── crows_pairs_english_disability-v0-loglikelihood │ ├── crows_pairs_english_disability-v0-res.json │ ├── crows_pairs_english_gender-v0-loglikelihood │ ├── crows_pairs_english_gender-v0-res.json │ ├── crows_pairs_english_nationality-v0-loglikelihood │ ├── crows_pairs_english_nationality-v0-res.json │ ├── crows_pairs_english_physical_appearance-v0-loglikelihood │ ├── crows_pairs_english_physical_appearance-v0-res.json │ ├── crows_pairs_english_race_color-v0-loglikelihood │ ├── crows_pairs_english_race_color-v0-res.json │ ├── crows_pairs_english_religion-v0-loglikelihood │ ├── crows_pairs_english_religion-v0-res.json │ ├── 
crows_pairs_english_sexual_orientation-v0-loglikelihood │ ├── crows_pairs_english_sexual_orientation-v0-res.json │ ├── crows_pairs_english_socioeconomic-v0-loglikelihood │ ├── crows_pairs_english_socioeconomic-v0-res.json │ ├── crows_pairs_french-v0-loglikelihood │ ├── crows_pairs_french-v0-res.json │ ├── crows_pairs_french_age-v0-loglikelihood │ ├── crows_pairs_french_age-v0-res.json │ ├── crows_pairs_french_autre-v0-loglikelihood │ ├── crows_pairs_french_autre-v0-res.json │ ├── crows_pairs_french_disability-v0-loglikelihood │ ├── crows_pairs_french_disability-v0-res.json │ ├── crows_pairs_french_gender-v0-loglikelihood │ ├── crows_pairs_french_gender-v0-res.json │ ├── crows_pairs_french_nationality-v0-loglikelihood │ ├── crows_pairs_french_nationality-v0-res.json │ ├── crows_pairs_french_physical_appearance-v0-loglikelihood │ ├── crows_pairs_french_physical_appearance-v0-res.json │ ├── crows_pairs_french_race_color-v0-loglikelihood │ ├── crows_pairs_french_race_color-v0-res.json │ ├── crows_pairs_french_religion-v0-loglikelihood │ ├── crows_pairs_french_religion-v0-res.json │ ├── crows_pairs_french_sexual_orientation-v0-loglikelihood │ ├── crows_pairs_french_sexual_orientation-v0-res.json │ ├── crows_pairs_french_socioeconomic-v0-loglikelihood │ ├── crows_pairs_french_socioeconomic-v0-res.json │ ├── cycle_letters-v0-greedy_until │ ├── cycle_letters-v0-res.json │ ├── drop-v0-greedy_until │ ├── drop-v0-res.json │ ├── drop-v1-greedy_until │ ├── drop-v1-res.json │ ├── ethics_cm-v0-loglikelihood │ ├── ethics_cm-v0-res.json │ ├── ethics_deontology-v0-loglikelihood │ ├── ethics_deontology-v0-res.json │ ├── ethics_justice-v0-loglikelihood │ ├── ethics_justice-v0-res.json │ ├── ethics_utilitarianism-v0-loglikelihood │ ├── ethics_utilitarianism-v0-res.json │ ├── ethics_utilitarianism_original-v0-loglikelihood │ ├── ethics_utilitarianism_original-v0-res.json │ ├── ethics_virtue-v0-loglikelihood │ ├── ethics_virtue-v0-res.json │ ├── 
gguf_test_44e268d15decc4d2d0f99e57e1476269826cd3b54262f7a0981f75ddd45b25d0.pkl │ ├── gguf_test_52ea409606de8755e03cf7c79f824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl │ ├── gguf_test_8fcf3f2f52afeb2acd7c8e02c2cc3ce31a691b665d295f6c4e4bbd71c7caa1a2.pkl │ ├── gpt3_test_0deb8e9bde8e8327bbc48157f638ff3ba06b0cd816dad2beb8ad90f7fbe795c7.pkl │ ├── gpt3_test_8025023377febbd8c5f2b9f26705c394ff375d0cad7c89c10fd9b8e1eb66ff1c.pkl │ ├── gpt3_test_bb2cc49115e88788ed870ad0716eb00b280a885f91c7ed6e1e864435e5e2b6ac.pkl │ ├── gpt3_test_cfd11f555a5a63b6dfa114a55a932e51b724cdd44d4842586b9ce37260bf7aaa.pkl │ ├── gpt3_test_f307d52964c295e2005c5e782b688c24388e0cecadf29f1e6fc7f394236ea9c0.pkl │ ├── gsm8k-v0-greedy_until │ ├── gsm8k-v0-res.json │ ├── headqa-v0-loglikelihood │ ├── headqa-v0-res.json │ ├── headqa_en-v0-loglikelihood │ ├── headqa_en-v0-res.json │ ├── headqa_es-v0-loglikelihood │ ├── headqa_es-v0-res.json │ ├── hellaswag-v0-loglikelihood │ ├── hellaswag-v0-res.json │ ├── hendrycksTest-abstract_algebra-v0-loglikelihood │ ├── hendrycksTest-abstract_algebra-v0-res.json │ ├── hendrycksTest-anatomy-v0-loglikelihood │ ├── hendrycksTest-anatomy-v0-res.json │ ├── hendrycksTest-astronomy-v0-loglikelihood │ ├── hendrycksTest-astronomy-v0-res.json │ ├── hendrycksTest-business_ethics-v0-loglikelihood │ ├── hendrycksTest-business_ethics-v0-res.json │ ├── hendrycksTest-clinical_knowledge-v0-loglikelihood │ ├── hendrycksTest-clinical_knowledge-v0-res.json │ ├── hendrycksTest-college_biology-v0-loglikelihood │ ├── hendrycksTest-college_biology-v0-res.json │ ├── hendrycksTest-college_chemistry-v0-loglikelihood │ ├── hendrycksTest-college_chemistry-v0-res.json │ ├── hendrycksTest-college_computer_science-v0-loglikelihood │ ├── hendrycksTest-college_computer_science-v0-res.json │ ├── hendrycksTest-college_mathematics-v0-loglikelihood │ ├── hendrycksTest-college_mathematics-v0-res.json │ ├── hendrycksTest-college_medicine-v0-loglikelihood │ ├── hendrycksTest-college_medicine-v0-res.json │ ├── 
hendrycksTest-college_physics-v0-loglikelihood │ ├── hendrycksTest-college_physics-v0-res.json │ ├── hendrycksTest-computer_security-v0-loglikelihood │ ├── hendrycksTest-computer_security-v0-res.json │ ├── hendrycksTest-conceptual_physics-v0-loglikelihood │ ├── hendrycksTest-conceptual_physics-v0-res.json │ ├── hendrycksTest-econometrics-v0-loglikelihood │ ├── hendrycksTest-econometrics-v0-res.json │ ├── hendrycksTest-electrical_engineering-v0-loglikelihood │ ├── hendrycksTest-electrical_engineering-v0-res.json │ ├── hendrycksTest-elementary_mathematics-v0-loglikelihood │ ├── hendrycksTest-elementary_mathematics-v0-res.json │ ├── hendrycksTest-formal_logic-v0-loglikelihood │ ├── hendrycksTest-formal_logic-v0-res.json │ ├── hendrycksTest-global_facts-v0-loglikelihood │ ├── hendrycksTest-global_facts-v0-res.json │ ├── hendrycksTest-high_school_biology-v0-loglikelihood │ ├── hendrycksTest-high_school_biology-v0-res.json │ ├── hendrycksTest-high_school_chemistry-v0-loglikelihood │ ├── hendrycksTest-high_school_chemistry-v0-res.json │ ├── hendrycksTest-high_school_computer_science-v0-loglikelihood │ ├── hendrycksTest-high_school_computer_science-v0-res.json │ ├── hendrycksTest-high_school_european_history-v0-loglikelihood │ ├── hendrycksTest-high_school_european_history-v0-res.json │ ├── hendrycksTest-high_school_geography-v0-loglikelihood │ ├── hendrycksTest-high_school_geography-v0-res.json │ ├── hendrycksTest-high_school_government_and_politics-v0-loglikelihood │ ├── hendrycksTest-high_school_government_and_politics-v0-res.json │ ├── hendrycksTest-high_school_macroeconomics-v0-loglikelihood │ ├── hendrycksTest-high_school_macroeconomics-v0-res.json │ ├── hendrycksTest-high_school_mathematics-v0-loglikelihood │ ├── hendrycksTest-high_school_mathematics-v0-res.json │ ├── hendrycksTest-high_school_microeconomics-v0-loglikelihood │ ├── hendrycksTest-high_school_microeconomics-v0-res.json │ ├── hendrycksTest-high_school_physics-v0-loglikelihood │ ├── 
hendrycksTest-high_school_physics-v0-res.json │ ├── hendrycksTest-high_school_psychology-v0-loglikelihood │ ├── hendrycksTest-high_school_psychology-v0-res.json │ ├── hendrycksTest-high_school_statistics-v0-loglikelihood │ ├── hendrycksTest-high_school_statistics-v0-res.json │ ├── hendrycksTest-high_school_us_history-v0-loglikelihood │ ├── hendrycksTest-high_school_us_history-v0-res.json │ ├── hendrycksTest-high_school_world_history-v0-loglikelihood │ ├── hendrycksTest-high_school_world_history-v0-res.json │ ├── hendrycksTest-human_aging-v0-loglikelihood │ ├── hendrycksTest-human_aging-v0-res.json │ ├── hendrycksTest-human_sexuality-v0-loglikelihood │ ├── hendrycksTest-human_sexuality-v0-res.json │ ├── hendrycksTest-international_law-v0-loglikelihood │ ├── hendrycksTest-international_law-v0-res.json │ ├── hendrycksTest-jurisprudence-v0-loglikelihood │ ├── hendrycksTest-jurisprudence-v0-res.json │ ├── hendrycksTest-logical_fallacies-v0-loglikelihood │ ├── hendrycksTest-logical_fallacies-v0-res.json │ ├── hendrycksTest-machine_learning-v0-loglikelihood │ ├── hendrycksTest-machine_learning-v0-res.json │ ├── hendrycksTest-management-v0-loglikelihood │ ├── hendrycksTest-management-v0-res.json │ ├── hendrycksTest-marketing-v0-loglikelihood │ ├── hendrycksTest-marketing-v0-res.json │ ├── hendrycksTest-medical_genetics-v0-loglikelihood │ ├── hendrycksTest-medical_genetics-v0-res.json │ ├── hendrycksTest-miscellaneous-v0-loglikelihood │ ├── hendrycksTest-miscellaneous-v0-res.json │ ├── hendrycksTest-moral_disputes-v0-loglikelihood │ ├── hendrycksTest-moral_disputes-v0-res.json │ ├── hendrycksTest-moral_scenarios-v0-loglikelihood │ ├── hendrycksTest-moral_scenarios-v0-res.json │ ├── hendrycksTest-nutrition-v0-loglikelihood │ ├── hendrycksTest-nutrition-v0-res.json │ ├── hendrycksTest-philosophy-v0-loglikelihood │ ├── hendrycksTest-philosophy-v0-res.json │ ├── hendrycksTest-prehistory-v0-loglikelihood │ ├── hendrycksTest-prehistory-v0-res.json │ ├── 
hendrycksTest-professional_accounting-v0-loglikelihood │ ├── hendrycksTest-professional_accounting-v0-res.json │ ├── hendrycksTest-professional_law-v0-loglikelihood │ ├── hendrycksTest-professional_law-v0-res.json │ ├── hendrycksTest-professional_medicine-v0-loglikelihood │ ├── hendrycksTest-professional_medicine-v0-res.json │ ├── hendrycksTest-professional_psychology-v0-loglikelihood │ ├── hendrycksTest-professional_psychology-v0-res.json │ ├── hendrycksTest-public_relations-v0-loglikelihood │ ├── hendrycksTest-public_relations-v0-res.json │ ├── hendrycksTest-security_studies-v0-loglikelihood │ ├── hendrycksTest-security_studies-v0-res.json │ ├── hendrycksTest-sociology-v0-loglikelihood │ ├── hendrycksTest-sociology-v0-res.json │ ├── hendrycksTest-us_foreign_policy-v0-loglikelihood │ ├── hendrycksTest-us_foreign_policy-v0-res.json │ ├── hendrycksTest-virology-v0-loglikelihood │ ├── hendrycksTest-virology-v0-res.json │ ├── hendrycksTest-world_religions-v0-loglikelihood │ ├── hendrycksTest-world_religions-v0-res.json │ ├── iwslt17-ar-en-v0-greedy_until │ ├── iwslt17-ar-en-v0-res.json │ ├── iwslt17-en-ar-v0-greedy_until │ ├── iwslt17-en-ar-v0-res.json │ ├── lambada-v0-loglikelihood │ ├── lambada-v0-res.json │ ├── lambada_cloze-v0-loglikelihood │ ├── lambada_cloze-v0-res.json │ ├── lambada_mt_de-v0-loglikelihood │ ├── lambada_mt_de-v0-res.json │ ├── lambada_mt_en-v0-loglikelihood │ ├── lambada_mt_en-v0-res.json │ ├── lambada_mt_es-v0-loglikelihood │ ├── lambada_mt_es-v0-res.json │ ├── lambada_mt_fr-v0-loglikelihood │ ├── lambada_mt_fr-v0-res.json │ ├── lambada_mt_it-v0-loglikelihood │ ├── lambada_mt_it-v0-res.json │ ├── lambada_openai-v0-loglikelihood │ ├── lambada_openai-v0-res.json │ ├── lambada_openai-v2.0-loglikelihood │ ├── lambada_openai-v2.0-res.json │ ├── lambada_openai_10_hf_pretrained-EleutherAI-pythia-14m-deduped-dtype-float32-device-cpu.txt │ ├── lambada_openai_cloze-v0-loglikelihood │ ├── lambada_openai_cloze-v0-res.json │ ├── 
lambada_openai_mt_de-v0-loglikelihood │ ├── lambada_openai_mt_de-v0-res.json │ ├── lambada_openai_mt_en-v0-loglikelihood │ ├── lambada_openai_mt_en-v0-res.json │ ├── lambada_openai_mt_es-v0-loglikelihood │ ├── lambada_openai_mt_es-v0-res.json │ ├── lambada_openai_mt_fr-v0-loglikelihood │ ├── lambada_openai_mt_fr-v0-res.json │ ├── lambada_openai_mt_it-v0-loglikelihood │ ├── lambada_openai_mt_it-v0-res.json │ ├── lambada_standard-v0-loglikelihood │ ├── lambada_standard-v0-res.json │ ├── lambada_standard_cloze-v0-loglikelihood │ ├── lambada_standard_cloze-v0-res.json │ ├── logiqa-v0-loglikelihood │ ├── logiqa-v0-res.json │ ├── math_algebra-v0-greedy_until │ ├── math_algebra-v0-res.json │ ├── math_algebra-v1-greedy_until │ ├── math_algebra-v1-res.json │ ├── math_counting_and_prob-v0-greedy_until │ ├── math_counting_and_prob-v0-res.json │ ├── math_counting_and_prob-v1-greedy_until │ ├── math_counting_and_prob-v1-res.json │ ├── math_geometry-v0-greedy_until │ ├── math_geometry-v0-res.json │ ├── math_geometry-v1-greedy_until │ ├── math_geometry-v1-res.json │ ├── math_intermediate_algebra-v0-greedy_until │ ├── math_intermediate_algebra-v0-res.json │ ├── math_intermediate_algebra-v1-greedy_until │ ├── math_intermediate_algebra-v1-res.json │ ├── math_num_theory-v0-greedy_until │ ├── math_num_theory-v0-res.json │ ├── math_num_theory-v1-greedy_until │ ├── math_num_theory-v1-res.json │ ├── math_prealgebra-v0-greedy_until │ ├── math_prealgebra-v0-res.json │ ├── math_prealgebra-v1-greedy_until │ ├── math_prealgebra-v1-res.json │ ├── math_precalc-v0-greedy_until │ ├── math_precalc-v0-res.json │ ├── math_precalc-v1-greedy_until │ ├── math_precalc-v1-res.json │ ├── mathqa-v0-loglikelihood │ ├── mathqa-v0-res.json │ ├── mc_taco-v0-loglikelihood │ ├── mc_taco-v0-res.json │ ├── mmlu_stem_10_hf_pretrained-EleutherAI-pythia-14m-deduped-dtype-float32-device-cpu.txt │ ├── mnli-v0-loglikelihood │ ├── mnli-v0-res.json │ ├── mnli_mismatched-v0-loglikelihood │ ├── mnli_mismatched-v0-res.json │ 
├── mrpc-v0-loglikelihood │ ├── mrpc-v0-res.json │ ├── multirc-v0-loglikelihood │ ├── multirc-v0-res.json │ ├── multirc-v1-loglikelihood │ ├── multirc-v1-res.json │ ├── mutual-v0-loglikelihood │ ├── mutual-v0-res.json │ ├── mutual-v1-loglikelihood │ ├── mutual-v1-res.json │ ├── mutual_plus-v0-loglikelihood │ ├── mutual_plus-v0-res.json │ ├── mutual_plus-v1-loglikelihood │ ├── mutual_plus-v1-res.json │ ├── openbookqa-v0-loglikelihood │ ├── openbookqa-v0-res.json │ ├── pile_arxiv-v0-loglikelihood_rolling │ ├── pile_arxiv-v0-res.json │ ├── pile_arxiv-v1-loglikelihood_rolling │ ├── pile_arxiv-v1-res.json │ ├── pile_bookcorpus2-v0-loglikelihood_rolling │ ├── pile_bookcorpus2-v0-res.json │ ├── pile_bookcorpus2-v1-loglikelihood_rolling │ ├── pile_bookcorpus2-v1-res.json │ ├── pile_books3-v0-loglikelihood_rolling │ ├── pile_books3-v0-res.json │ ├── pile_books3-v1-loglikelihood_rolling │ ├── pile_books3-v1-res.json │ ├── pile_dm-mathematics-v0-loglikelihood_rolling │ ├── pile_dm-mathematics-v0-res.json │ ├── pile_dm-mathematics-v1-loglikelihood_rolling │ ├── pile_dm-mathematics-v1-res.json │ ├── pile_enron-v0-loglikelihood_rolling │ ├── pile_enron-v0-res.json │ ├── pile_enron-v1-loglikelihood_rolling │ ├── pile_enron-v1-res.json │ ├── pile_europarl-v0-loglikelihood_rolling │ ├── pile_europarl-v0-res.json │ ├── pile_europarl-v1-loglikelihood_rolling │ ├── pile_europarl-v1-res.json │ ├── pile_freelaw-v0-loglikelihood_rolling │ ├── pile_freelaw-v0-res.json │ ├── pile_freelaw-v1-loglikelihood_rolling │ ├── pile_freelaw-v1-res.json │ ├── pile_github-v0-loglikelihood_rolling │ ├── pile_github-v0-res.json │ ├── pile_github-v1-loglikelihood_rolling │ ├── pile_github-v1-res.json │ ├── pile_gutenberg-v0-loglikelihood_rolling │ ├── pile_gutenberg-v0-res.json │ ├── pile_gutenberg-v1-loglikelihood_rolling │ ├── pile_gutenberg-v1-res.json │ ├── pile_hackernews-v0-loglikelihood_rolling │ ├── pile_hackernews-v0-res.json │ ├── pile_hackernews-v1-loglikelihood_rolling │ ├── 
pile_hackernews-v1-res.json │ ├── pile_nih-exporter-v0-loglikelihood_rolling │ ├── pile_nih-exporter-v0-res.json │ ├── pile_nih-exporter-v1-loglikelihood_rolling │ ├── pile_nih-exporter-v1-res.json │ ├── pile_opensubtitles-v0-loglikelihood_rolling │ ├── pile_opensubtitles-v0-res.json │ ├── pile_opensubtitles-v1-loglikelihood_rolling │ ├── pile_opensubtitles-v1-res.json │ ├── pile_openwebtext2-v0-loglikelihood_rolling │ ├── pile_openwebtext2-v0-res.json │ ├── pile_openwebtext2-v1-loglikelihood_rolling │ ├── pile_openwebtext2-v1-res.json │ ├── pile_philpapers-v0-loglikelihood_rolling │ ├── pile_philpapers-v0-res.json │ ├── pile_philpapers-v1-loglikelihood_rolling │ ├── pile_philpapers-v1-res.json │ ├── pile_pile-cc-v0-loglikelihood_rolling │ ├── pile_pile-cc-v0-res.json │ ├── pile_pile-cc-v1-loglikelihood_rolling │ ├── pile_pile-cc-v1-res.json │ ├── pile_pubmed-abstracts-v0-loglikelihood_rolling │ ├── pile_pubmed-abstracts-v0-res.json │ ├── pile_pubmed-abstracts-v1-loglikelihood_rolling │ ├── pile_pubmed-abstracts-v1-res.json │ ├── pile_pubmed-central-v0-loglikelihood_rolling │ ├── pile_pubmed-central-v0-res.json │ ├── pile_pubmed-central-v1-loglikelihood_rolling │ ├── pile_pubmed-central-v1-res.json │ ├── pile_stackexchange-v0-loglikelihood_rolling │ ├── pile_stackexchange-v0-res.json │ ├── pile_stackexchange-v1-loglikelihood_rolling │ ├── pile_stackexchange-v1-res.json │ ├── pile_ubuntu-irc-v0-loglikelihood_rolling │ ├── pile_ubuntu-irc-v0-res.json │ ├── pile_ubuntu-irc-v1-loglikelihood_rolling │ ├── pile_ubuntu-irc-v1-res.json │ ├── pile_uspto-v0-loglikelihood_rolling │ ├── pile_uspto-v0-res.json │ ├── pile_uspto-v1-loglikelihood_rolling │ ├── pile_uspto-v1-res.json │ ├── pile_wikipedia-v0-loglikelihood_rolling │ ├── pile_wikipedia-v0-res.json │ ├── pile_wikipedia-v1-loglikelihood_rolling │ ├── pile_wikipedia-v1-res.json │ ├── pile_youtubesubtitles-v0-loglikelihood_rolling │ ├── pile_youtubesubtitles-v0-res.json │ ├── pile_youtubesubtitles-v1-loglikelihood_rolling 
│ ├── pile_youtubesubtitles-v1-res.json │ ├── piqa-v0-loglikelihood │ ├── piqa-v0-res.json │ ├── prost-v0-loglikelihood │ ├── prost-v0-res.json │ ├── pubmedqa-v0-loglikelihood │ ├── pubmedqa-v0-res.json │ ├── qa4mre_2011-v0-loglikelihood │ ├── qa4mre_2011-v0-res.json │ ├── qa4mre_2012-v0-loglikelihood │ ├── qa4mre_2012-v0-res.json │ ├── qa4mre_2013-v0-loglikelihood │ ├── qa4mre_2013-v0-res.json │ ├── qnli-v0-loglikelihood │ ├── qnli-v0-res.json │ ├── qqp-v0-loglikelihood │ ├── qqp-v0-res.json │ ├── race-v0-loglikelihood │ ├── race-v0-res.json │ ├── random_insertion-v0-greedy_until │ ├── random_insertion-v0-res.json │ ├── record-v0-loglikelihood │ ├── record-v0-res.json │ ├── reversed_words-v0-greedy_until │ ├── reversed_words-v0-res.json │ ├── rte-v0-loglikelihood │ ├── rte-v0-res.json │ ├── sciq-v0-loglikelihood │ ├── sciq-v0-res.json │ ├── squad2-v0-greedy_until │ ├── squad2-v0-loglikelihood │ ├── squad2-v0-res.json │ ├── squad2-v1-greedy_until │ ├── squad2-v1-loglikelihood │ ├── squad2-v1-res.json │ ├── sst-v0-loglikelihood │ ├── sst-v0-res.json │ ├── swag-v0-loglikelihood │ ├── swag-v0-res.json │ ├── textsynth_test_0a89c2739f9598b4be2674b0a8e43931d7f3f0b696970bcba31f9b52bdf12297.pkl │ ├── textsynth_test_0c1c14571add7903b89e588c8212572b95bb57b334fc0752c89a7e045a5f63ae.pkl │ ├── textsynth_test_3092d07756f3e1d010c07524cc8a2ecba7f0c19f9e39f2aaf2bf440bfe328004.pkl │ ├── textsynth_test_434076260b6af3a46b7a5eaceec3306a5872c400a3872f744280b237455a0f8e.pkl │ ├── textsynth_test_49c47ae40e11f349f2f6b492128188b1b2bc103a421c676ee4b2142a68b43516.pkl │ ├── textsynth_test_4fd8d66a6dad7f602b40e5d7dc298d6fe329299d086a4659743a41f4a4012659.pkl │ ├── textsynth_test_51b5302f157cf224f694ccad973f255ae19e9e061d533256bdf75b04e0a917ab.pkl │ ├── textsynth_test_6d6c62dd70caaa208712bf766deaf419cfac89538d4ab7745621e339394c0c23.pkl │ ├── textsynth_test_7209c4617547bfe17cb9e7f5f735fe35822d650aefdc5fbeeaf0c1724effbe09.pkl │ ├── 
textsynth_test_7afdc285388e51094e12645f305328c759574fa3ec9751631025f8ad5ebf9f3e.pkl │ ├── textsynth_test_9d5f33dbfe1e254928c89f5ed85e4c010d888065f55a8f1b863bc1eb0340a5f2.pkl │ ├── textsynth_test_abcbcba648d89e5d81a50511a6d24ddeb538de2ffe108c1370dd74ce6ac8038d.pkl │ ├── textsynth_test_b1cbb29666cce5e31a1e97695858137398a0885ca5d5d98f515404fb6aeb99e7.pkl │ ├── textsynth_test_e7ad1e9f52a39e1ddd1e50f3c57ffa4546728dd150a67c0a0ddc8675c04e15d1.pkl │ ├── textsynth_test_f4bfe4beb605bd52a8ab6be3c9293639e7e2261d98de58159d15ccb83131bf4e.pkl │ ├── toxigen-v0-loglikelihood │ ├── toxigen-v0-res.json │ ├── triviaqa-v0-loglikelihood │ ├── triviaqa-v0-res.json │ ├── triviaqa-v1-loglikelihood │ ├── triviaqa-v1-res.json │ ├── truthfulqa_gen-v0-greedy_until │ ├── truthfulqa_gen-v0-res.json │ ├── truthfulqa_gen-v1-greedy_until │ ├── truthfulqa_gen-v1-res.json │ ├── truthfulqa_mc-v0-loglikelihood │ ├── truthfulqa_mc-v0-res.json │ ├── truthfulqa_mc-v1-loglikelihood │ ├── truthfulqa_mc-v1-res.json │ ├── webqs-v0-loglikelihood │ ├── webqs-v0-res.json │ ├── wic-v0-loglikelihood │ ├── wic-v0-res.json │ ├── wikitext-v0-loglikelihood_rolling │ ├── wikitext-v0-res.json │ ├── wikitext-v1-loglikelihood_rolling │ ├── wikitext-v1-res.json │ ├── wikitext_10_hf_pretrained-EleutherAI-pythia-14m-deduped-dtype-float32-device-cpu.txt │ ├── winogrande-v0-loglikelihood │ ├── winogrande-v0-res.json │ ├── wmt14-en-fr-v0-greedy_until │ ├── wmt14-en-fr-v0-res.json │ ├── wmt14-fr-en-v0-greedy_until │ ├── wmt14-fr-en-v0-res.json │ ├── wmt16-de-en-v0-greedy_until │ ├── wmt16-de-en-v0-res.json │ ├── wmt16-en-de-v0-greedy_until │ ├── wmt16-en-de-v0-res.json │ ├── wmt16-en-ro-v0-greedy_until │ ├── wmt16-en-ro-v0-res.json │ ├── wmt16-ro-en-v0-greedy_until │ ├── wmt16-ro-en-v0-res.json │ ├── wmt20-cs-en-v0-greedy_until │ ├── wmt20-cs-en-v0-res.json │ ├── wmt20-de-en-v0-greedy_until │ ├── wmt20-de-en-v0-res.json │ ├── wmt20-de-fr-v0-greedy_until │ ├── wmt20-de-fr-v0-res.json │ ├── wmt20-en-cs-v0-greedy_until │ ├── 
wmt20-en-cs-v0-res.json │ ├── wmt20-en-de-v0-greedy_until │ ├── wmt20-en-de-v0-res.json │ ├── wmt20-en-iu-v0-greedy_until │ ├── wmt20-en-iu-v0-res.json │ ├── wmt20-en-ja-v0-greedy_until │ ├── wmt20-en-ja-v0-res.json │ ├── wmt20-en-ja-v1-greedy_until │ ├── wmt20-en-ja-v1-res.json │ ├── wmt20-en-km-v0-greedy_until │ ├── wmt20-en-km-v0-res.json │ ├── wmt20-en-pl-v0-greedy_until │ ├── wmt20-en-pl-v0-res.json │ ├── wmt20-en-ps-v0-greedy_until │ ├── wmt20-en-ps-v0-res.json │ ├── wmt20-en-ru-v0-greedy_until │ ├── wmt20-en-ru-v0-res.json │ ├── wmt20-en-ta-v0-greedy_until │ ├── wmt20-en-ta-v0-res.json │ ├── wmt20-en-zh-v0-greedy_until │ ├── wmt20-en-zh-v0-res.json │ ├── wmt20-en-zh-v1-greedy_until │ ├── wmt20-en-zh-v1-res.json │ ├── wmt20-fr-de-v0-greedy_until │ ├── wmt20-fr-de-v0-res.json │ ├── wmt20-iu-en-v0-greedy_until │ ├── wmt20-iu-en-v0-res.json │ ├── wmt20-ja-en-v0-greedy_until │ ├── wmt20-ja-en-v0-res.json │ ├── wmt20-km-en-v0-greedy_until │ ├── wmt20-km-en-v0-res.json │ ├── wmt20-pl-en-v0-greedy_until │ ├── wmt20-pl-en-v0-res.json │ ├── wmt20-ps-en-v0-greedy_until │ ├── wmt20-ps-en-v0-res.json │ ├── wmt20-ru-en-v0-greedy_until │ ├── wmt20-ru-en-v0-res.json │ ├── wmt20-ta-en-v0-greedy_until │ ├── wmt20-ta-en-v0-res.json │ ├── wmt20-zh-en-v0-greedy_until │ ├── wmt20-zh-en-v0-res.json │ ├── wnli-v0-loglikelihood │ ├── wnli-v0-res.json │ ├── wnli-v1-loglikelihood │ ├── wnli-v1-res.json │ ├── wsc-v0-loglikelihood │ ├── wsc-v0-res.json │ ├── wsc273-v0-loglikelihood │ └── wsc273-v0-res.json ├── testyamls/ │ └── test-01.yaml └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/new_tasks.yml ================================================ name: Tasks Modified on: push: branches: - 'main' pull_request: branches: - 'main' workflow_dispatch: env: TQDM_DISABLE: "1" HF_HUB_DISABLE_PROGRESS_BARS: "1" # comment/edit out the above to 
stop/change the triggers jobs: changed_files: runs-on: ubuntu-latest # windows-latest || macos-latest timeout-minutes: 120 name: Scan for changed tasks steps: - name: checkout uses: actions/checkout@v6 with: fetch-depth: 2 # OR "2" -> To retrieve the preceding commit. # Uses the tj-actions/changed-files action to check for changes. # The `files_yaml` input optionally takes a yaml string to specify filters, # and prepends the filter name to the standard output names. - name: Check task folders id: changed-tasks uses: tj-actions/changed-files@v47 with: # tasks checks the tasks folder and api checks the api folder for changes files_yaml: | tasks: - lm_eval/tasks/** api: - lm_eval/api/** write_output_files: true # The next step is optional; the files are written to the workspace by default (above). # so it's just for debugging - name: Run Tests if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' run: | echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV' echo "One or more test file(s) has changed." 
echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}" - name: Install uv if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' uses: astral-sh/setup-uv@v7 with: enable-cache: true python-version: "3.10" activate-environment: true - name: Install dependencies if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' run: | uv pip install -e '.[dev,ifeval,unitxt,math,longbench,hf]' --extra-index-url https://download.pytorch.org/whl/cpu - name: Test with pytest # if new tasks are added, run tests on them if: steps.changed-tasks.outputs.tasks_any_modified == 'true' run: pytest -x -s -vv tests/test_tasks.py # if api is modified, run tests on it - name: Test more tasks with pytest env: API: true if: steps.changed-tasks.outputs.api_any_modified == 'true' run: pytest -x -s -vv -n=auto tests/test_tasks.py ================================================ FILE: .github/workflows/publish.yml ================================================ name: Publish Python distribution to PyPI on: push: tags: - '*' jobs: build: name: Build distribution runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Set up Python uses: actions/setup-python@v6 with: python-version: "3.x" - name: Print version run: | # Extract version from pyproject.toml PYPROJECT_VERSION=$(grep 'version = ' pyproject.toml | head -1 | cut -d'"' -f2) echo "Version in pyproject.toml: $PYPROJECT_VERSION" - name: Install pypa/build run: >- python3 -m pip install build --user - name: Build a binary wheel and a source tarball run: python3 -m build - name: Store the distribution packages uses: actions/upload-artifact@v7 with: name: python-package-distributions path: dist/ publish-to-pypi: name: >- Publish Python distribution to PyPI if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes needs: - build runs-on: 
ubuntu-latest environment: name: pypi url: https://pypi.org/p/lm_eval permissions: id-token: write # IMPORTANT: mandatory for trusted publishing steps: - name: Download all the dists uses: actions/download-artifact@v8 with: name: python-package-distributions path: dist/ - name: Publish distribution to PyPI uses: pypa/gh-action-pypi-publish@release/v1 publish-to-testpypi: name: Publish Python distribution to TestPyPI needs: - build runs-on: ubuntu-latest environment: name: testpypi url: https://test.pypi.org/p/lm_eval permissions: id-token: write # IMPORTANT: mandatory for trusted publishing steps: - name: Download all the dists uses: actions/download-artifact@v8 with: name: python-package-distributions path: dist/ - name: Publish distribution to TestPyPI uses: pypa/gh-action-pypi-publish@release/v1 with: repository-url: https://test.pypi.org/legacy/ ================================================ FILE: .github/workflows/unit_tests.yml ================================================ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python # just comment out unwanted steps to turn off the test. name: Unit Tests on: push: branches: - 'main' pull_request: branches: - 'main' workflow_dispatch: env: TQDM_DISABLE: "1" HF_HUB_DISABLE_PROGRESS_BARS: "1" # Jobs run concurrently and steps run sequentially within a job. # jobs: linter and cpu_tests. Add more jobs/steps as required. 
jobs: linter: name: Linters runs-on: ubuntu-latest timeout-minutes: 5 steps: - name: Checkout Code uses: actions/checkout@v6 with: fetch-depth: 0 - name: Install uv uses: astral-sh/setup-uv@v7 with: enable-cache: true python-version: "3.10" activate-environment: true - name: Install pip run: uv pip install pip - name: Pre-Commit env: SKIP: "no-commit-to-branch,mypy" uses: pre-commit/action@v3.0.1 with: extra_args: --from-ref ${{ github.event.pull_request.base.sha || 'HEAD~1' }} --to-ref HEAD # Job 2 testcpu: name: CPU Tests runs-on: ubuntu-latest strategy: fail-fast: true matrix: python-version: ["3.10", "3.11", "3.12"] timeout-minutes: 30 steps: - name: Checkout Code uses: actions/checkout@v6 - name: Install uv uses: astral-sh/setup-uv@v7 with: enable-cache: true python-version: ${{ matrix.python-version }} activate-environment: true # Cache HuggingFace cache directory for CPU tests - name: Cache HuggingFace cache (CPU tests) uses: actions/cache@v5 id: cache-hf-cpu with: path: ~/.cache/huggingface key: ${{ runner.os }}-hf-cache-cpu restore-keys: | ${{ runner.os }}-hf-cache-cpu - name: Install dependencies run: | uv pip install -e '.[dev,unitxt,hf]' --extra-index-url https://download.pytorch.org/whl/cpu uv pip install hf_xet - name: Test with pytest run: pytest -x --showlocals -s -vv -n=auto --ignore=tests/models/test_openvino.py --ignore=tests/models/test_hf_steered.py --ignore=tests/scripts/test_zeno_visualize.py # Save test artifacts - name: Archive test artifacts if: always() # Upload artifacts even if tests fail uses: actions/upload-artifact@v7 with: name: output_testcpu${{ matrix.python-version }} path: | test_logs/* # testmodels: # name: External LM Tests # runs-on: ubuntu-latest # timeout-minutes: 30 # steps: # - name: Checkout Code # uses: actions/checkout@v4 # - name: Set up Python 3.9 # uses: actions/setup-python@v5 # with: # python-version: 3.9 # cache: pip # cache-dependency-path: pyproject.toml # # # Cache HuggingFace cache directory for External LM 
tests # - name: Cache HuggingFace cache (External LM tests) # uses: actions/cache@v3 # id: cache-hf-lm # with: # path: ~/.cache/huggingface # key: ${{ runner.os }}-hf-cache-external-lm # restore-keys: | # ${{ runner.os }}-hf-cache-external-lm # # - name: Install dependencies # run: | # python -m pip install --upgrade pip # pip install -e '.[dev,optimum,api]' --extra-index-url https://download.pytorch.org/whl/cpu # pip install -U transformers peft accelerate # # - name: Test with pytest # run: python -m pytest tests/models --showlocals -s -vv # continue-on-error: true # Continue workflow even if tests fail ================================================ FILE: .gitignore ================================================ # macOS system files .DS_Store # Virtual environments .venv/ venv/ ENV/ env/ *.env # Python bytecode and build artifacts __pycache__/ *.py[cod] *.so *.egg-info/ build/ dist/ # IDE & editor settings .vscode/ .idea/ # Jupyter .ipynb_checkpoints/ profile_default/ ipython_config.py # Output and data output/ data/ temp/ test_logs/ # Caching lm_eval/caching/.cache lm_cache/ # Logging *.log logs/ # wandb experiment tracking wandb/ examples/wandb/ # PyInstaller *.spec #uv uv.lock ================================================ FILE: .pre-commit-config.yaml ================================================ # Ignore test linting to avoid conflicting changes to version stability. 
exclude: ^tests/testdata/ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: - id: check-added-large-files - id: check-ast - id: fix-byte-order-marker - id: check-case-conflict - id: check-json - id: check-merge-conflict args: [ --assume-in-merge ] - id: check-symlinks - id: check-yaml args: [ "--unsafe" ] - id: destroyed-symlinks - id: detect-private-key - id: end-of-file-fixer - id: no-commit-to-branch always_run: false - id: requirements-txt-fixer - id: trailing-whitespace args: [ --markdown-linebreak-ext=md ] - id: fix-byte-order-marker exclude: docs/CNAME - id: mixed-line-ending args: [ --fix=lf ] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.15.6 hooks: # Run the linter. - id: ruff-check args: [ --fix ] # Run the formatter. - id: ruff-format - repo: https://github.com/codespell-project/codespell rev: v2.4.2 hooks: - id: codespell exclude: > (?x)^( .*\.json|ignore.txt|lm_eval/tasks/.*|.*yaml|.*\.ipynb )$ args: [ --check-filenames, --check-hidden, --ignore-words=ignore.txt ] - repo: https://github.com/jackdewinter/pymarkdown rev: v0.9.36 hooks: - id: pymarkdown exclude: ^(lm_eval/tasks/.*|docs/footguns\.md)$ args: [ fix, -r ] ================================================ FILE: CITATION.bib ================================================ @misc{eval-harness, author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy}, title = {A framework for few-shot language model evaluation}, month = 12, year = 2023, publisher = {Zenodo}, version = {v0.4.0}, doi = {10.5281/zenodo.10256836}, url = {https://zenodo.org/records/10256836} } 
================================================ FILE: CODEOWNERS ================================================ * @baberabb * @0xSMT ================================================ FILE: LICENSE.md ================================================ MIT License Copyright (c) 2020 EleutherAI Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: MANIFEST.in ================================================ recursive-include tests ================================================ FILE: README.md ================================================ # Language Model Evaluation Harness [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10256836.svg)](https://doi.org/10.5281/zenodo.10256836) --- ## Latest News 📣 - [2025/12] **CLI refactored** with subcommands (`run`, `ls`, `validate`) and YAML config file support via `--config`. See the [CLI Reference](./docs/interface.md) and [Configuration Guide](./docs/config_files.md). 
- [2025/12] **Lighter install**: Base package no longer includes `transformers`/`torch`. Install model backends separately: `pip install lm_eval[hf]`, `lm_eval[vllm]`, etc. - [2025/07] Added `think_end_token` arg to `hf` (token/str), `vllm` and `sglang` (str) for stripping CoT reasoning traces from models that support it. - [2025/03] Added support for steering HF models! - [2025/02] Added [SGLang](https://docs.sglang.ai/) support! - [2024/09] We are prototyping allowing users of LM Evaluation Harness to create and evaluate on text+image multimodal input, text output tasks, and have just added the `hf-multimodal` and `vllm-vlm` model types and `mmmu` task as a prototype feature. We welcome users to try out this in-progress feature and stress-test it for themselves, and suggest they check out [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval), a wonderful project originally forking off of the lm-evaluation-harness, for a broader range of multimodal tasks, models, and features. - [2024/07] [API model](docs/API_guide.md) support has been updated and refactored, introducing support for batched and async requests, and making it significantly easier to customize and use for your own purposes. **To run Llama 405B, we recommend using VLLM's OpenAI-compliant API to host the model, and use the `local-completions` model type to evaluate the model.** - [2024/07] New Open LLM Leaderboard tasks have been added ! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group. --- ## Announcement **A new v0.4.0 release of lm-evaluation-harness is available** ! New updates and features include: - **New Open LLM Leaderboard tasks have been added ! 
You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group.** - Internal refactoring - Config-based task creation and configuration - Easier import and sharing of externally-defined task config YAMLs - Support for Jinja2 prompt design, easy modification of prompts + prompt imports from Promptsource - More advanced configuration options, including output post-processing, answer extraction, and multiple LM generations per document, configurable fewshot settings, and more - Speedups and new modeling libraries supported, including: faster data-parallel HF model usage, vLLM support, MPS support with HuggingFace, and more - Logging and usability changes - New tasks including CoT BIG-Bench-Hard, Belebele, user-defined task groupings, and more Please see our updated documentation pages in `docs/` for more details. Development will be continuing on the `main` branch, and we encourage you to give us feedback on what features are desired and how to improve the library further, or ask questions, either in issues or PRs on GitHub, or in the [EleutherAI discord](https://discord.gg/eleutherai)! --- ## Overview This project provides a unified framework to test generative language models on a large number of different evaluation tasks. **Features:** - Over 60 standard academic benchmarks for LLMs, with hundreds of subtasks and variants implemented. - Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface. - Support for fast and memory-efficient inference with [vLLM](https://github.com/vllm-project/vllm). 
- Support for commercial APIs including [OpenAI](https://openai.com), and [TextSynth](https://textsynth.com/). - Support for evaluation on adapters (e.g. LoRA) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft). - Support for local models and benchmarks. - Evaluation with publicly available prompts ensures reproducibility and comparability between papers. - Easy support for custom prompts and evaluation metrics. The Language Model Evaluation Harness is the backend for 🤗 Hugging Face's popular [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), has been used in [hundreds of papers](https://scholar.google.com/scholar?oi=bibs&hl=en&authuser=2&cites=15052937328817631261,4097184744846514103,1520777361382155671,17476825572045927382,18443729326628441434,14801318227356878622,7890865700763267262,12854182577605049984,15641002901115500560,5104500764547628290), and is used internally by dozens of organizations including NVIDIA, Cohere, BigScience, BigCode, Nous Research, and Mosaic ML. ## Install To install the `lm-eval` package from the github repository, run: ```bash git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness cd lm-evaluation-harness pip install -e . ``` ### Installing Model Backends The base installation provides the core evaluation framework. **Model backends must be installed separately** using optional extras: For HuggingFace transformers models: ```bash pip install "lm_eval[hf]" ``` For vLLM inference: ```bash pip install "lm_eval[vllm]" ``` For API-based models (OpenAI, Anthropic, etc.): ```bash pip install "lm_eval[api]" ``` Multiple backends can be installed together: ```bash pip install "lm_eval[hf,vllm,api]" ``` A detailed table of all optional extras is available at the end of this document. 
## Basic Usage ### Documentation | Guide | Description | |-------|-------------| | [CLI Reference](./docs/interface.md) | Command-line arguments and subcommands | | [Configuration Guide](./docs/config_files.md) | YAML config file format and examples | | [Python API](./docs/python-api.md) | Programmatic usage with `simple_evaluate()` | | [Task Guide](./lm_eval/tasks/README.md) | Available tasks and task configuration | Use `lm-eval -h` to see available options, or `lm-eval run -h` for evaluation options. List available tasks with: ```bash lm-eval ls tasks ``` ### Hugging Face `transformers` > [!Important] > To use the HuggingFace backend, first install: `pip install "lm_eval[hf]"` To evaluate a model hosted on the [HuggingFace Hub](https://huggingface.co/models) (e.g. GPT-J-6B) on `hellaswag` you can use the following command (this assumes you are using a CUDA-compatible GPU): ```bash lm_eval --model hf \ --model_args pretrained=EleutherAI/gpt-j-6B \ --tasks hellaswag \ --device cuda:0 \ --batch_size 8 ``` Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partially trained checkpoints, or to specify the datatype for running a model: ```bash lm_eval --model hf \ --model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \ --tasks lambada_openai,hellaswag \ --device cuda:0 \ --batch_size 8 ``` Models that are loaded via both `transformers.AutoModelForCausalLM` (autoregressive, decoder-only GPT style models) and `transformers.AutoModelForSeq2SeqLM` (such as encoder-decoder models like T5) in Huggingface are supported. Batch size selection can be automated by setting the ```--batch_size``` flag to ```auto```. This will perform automatic detection of the largest batch size that will fit on your device. 
On tasks where there is a large difference between the longest and shortest example, it can be helpful to periodically recompute the largest batch size, to gain a further speedup. To do this, append ```:N``` to above flag to automatically recompute the largest batch size ```N``` times. For example, to recompute the batch size 4 times, the command would be: ```bash lm_eval --model hf \ --model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \ --tasks lambada_openai,hellaswag \ --device cuda:0 \ --batch_size auto:4 ``` > [!Note] > Just like you can provide a local path to `transformers.AutoModel`, you can also provide a local path to `lm_eval` via `--model_args pretrained=/path/to/model` #### Evaluating GGUF Models `lm-eval` supports evaluating models in GGUF format using the Hugging Face (`hf`) backend. This allows you to use quantized models compatible with `transformers`, `AutoModel`, and llama.cpp conversions. To evaluate a GGUF model, pass the path to the directory containing the model weights, the `gguf_file`, and optionally a separate `tokenizer` path using the `--model_args` flag. **🚨 Important Note:** If no separate tokenizer is provided, Hugging Face will attempt to reconstruct the tokenizer from the GGUF file — this can take **hours** or even hang indefinitely. Passing a separate tokenizer avoids this issue and can reduce tokenizer loading time from hours to seconds. **✅ Recommended usage:** ```bash lm_eval --model hf \ --model_args pretrained=/path/to/gguf_folder,gguf_file=model-name.gguf,tokenizer=/path/to/tokenizer \ --tasks hellaswag \ --device cuda:0 \ --batch_size 8 ``` > [!Tip] > Ensure the tokenizer path points to a valid Hugging Face tokenizer directory (e.g., containing tokenizer_config.json, vocab.json, etc.). #### Multi-GPU Evaluation with Hugging Face `accelerate` We support three main ways of using Hugging Face's [accelerate 🚀](https://github.com/huggingface/accelerate) library for multi-GPU evaluation. 
To perform *data-parallel evaluation* (where each GPU loads a **separate full copy** of the model), we leverage the `accelerate` launcher as follows: ```bash accelerate launch -m lm_eval --model hf \ --tasks lambada_openai,arc_easy \ --batch_size 16 ``` (or via `accelerate launch --no-python lm_eval`). For cases where your model can fit on a single GPU, this allows you to evaluate on K GPUs K times faster than on one. **WARNING**: This setup does not work with FSDP model sharding, so in `accelerate config` FSDP must be disabled, or the NO_SHARD FSDP option must be used. The second way of using `accelerate` for multi-GPU evaluation is when your model is *too large to fit on a single GPU.* In this setting, run the library *outside the `accelerate` launcher*, but passing `parallelize=True` to `--model_args` as follows: ```bash lm_eval --model hf \ --tasks lambada_openai,arc_easy \ --model_args parallelize=True \ --batch_size 16 ``` This means that your model's weights will be split across all available GPUs. For more advanced users or even larger models, we allow for the following arguments when `parallelize=True` as well: - `device_map_option`: How to split model weights across available GPUs. defaults to "auto". - `max_memory_per_gpu`: the max GPU memory to use per GPU in loading the model. - `max_cpu_memory`: the max amount of CPU memory to use when offloading the model weights to RAM. - `offload_folder`: a folder where model weights will be offloaded to disk if needed. The third option is to use both at the same time. This will allow you to take advantage of both data parallelism and model sharding, and is especially useful for models that are too large to fit on a single GPU. 
```bash accelerate launch --multi_gpu --num_processes {nb_of_copies_of_your_model} \ -m lm_eval --model hf \ --tasks lambada_openai,arc_easy \ --model_args parallelize=True \ --batch_size 16 ``` To learn more about model parallelism and how to use it with the `accelerate` library, see the [accelerate documentation](https://huggingface.co/docs/transformers/v4.15.0/en/parallelism). **Warning: We do not natively support multi-node evaluation using the `hf` model type! Please reference [our GPT-NeoX library integration](https://github.com/EleutherAI/gpt-neox/blob/main/eval.py) for an example of code in which a custom multi-machine evaluation script is written.** **Note: we do not currently support multi-node evaluations natively, and advise using either an externally hosted server to run inference requests against, or creating a custom integration with your distributed framework [as is done for the GPT-NeoX library](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py).** ### Steered Hugging Face `transformers` models To evaluate a Hugging Face `transformers` model with steering vectors applied, specify the model type as `steered` and provide the path to either a PyTorch file containing pre-defined steering vectors, or a CSV file that specifies how to derive steering vectors from pretrained `sparsify` or `sae_lens` models (you will need to install the corresponding optional dependency for this method). 
Specify pre-defined steering vectors: ```python import torch steer_config = { "layers.3": { "steering_vector": torch.randn(1, 768), "bias": torch.randn(1, 768), "steering_coefficient": 1, "action": "add" }, } torch.save(steer_config, "steer_config.pt") ``` Specify derived steering vectors: ```python import pandas as pd pd.DataFrame({ "loader": ["sparsify"], "action": ["add"], "sparse_model": ["EleutherAI/sae-pythia-70m-32k"], "hookpoint": ["layers.3"], "feature_index": [30], "steering_coefficient": [10.0], }).to_csv("steer_config.csv", index=False) ``` Run the evaluation harness with steering vectors applied: ```bash lm_eval --model steered \ --model_args pretrained=EleutherAI/pythia-160m,steer_path=steer_config.pt \ --tasks lambada_openai,hellaswag \ --device cuda:0 \ --batch_size 8 ``` ### NVIDIA `nemo` models [NVIDIA NeMo Framework](https://github.com/NVIDIA/NeMo) is a generative AI framework built for researchers and PyTorch developers working on language models. To evaluate a `nemo` model, start by installing NeMo following [the documentation](https://github.com/NVIDIA/NeMo?tab=readme-ov-file#installation). We highly recommend using the NVIDIA PyTorch or NeMo container, especially if you have issues installing Apex or any other dependencies (see [latest released containers](https://github.com/NVIDIA/NeMo/releases)). Please also install the lm evaluation harness library following the instructions in [the Install section](https://github.com/EleutherAI/lm-evaluation-harness/tree/main?tab=readme-ov-file#install). NeMo models can be obtained through [NVIDIA NGC Catalog](https://catalog.ngc.nvidia.com/models) or on [NVIDIA's Hugging Face page](https://huggingface.co/nvidia). In [NVIDIA NeMo Framework](https://github.com/NVIDIA/NeMo/tree/main/scripts/nlp_language_modeling) there are conversion scripts to convert the `hf` checkpoints of popular models like llama, falcon, mixtral or mpt to `nemo`. 
Run a `nemo` model on one GPU: ```bash lm_eval --model nemo_lm \ --model_args path=<path_to_nemo_model> \ --tasks hellaswag \ --batch_size 32 ``` It is recommended to unpack the `nemo` model to avoid the unpacking inside the docker container - it may overflow disk space. For that you can run: ```bash mkdir MY_MODEL tar -xvf MY_MODEL.nemo -C MY_MODEL ``` #### Multi-GPU evaluation with NVIDIA `nemo` models By default, only one GPU is used. But we do support either data replication or tensor/pipeline parallelism during evaluation, on one node. 1) To enable data replication, set the `model_args` of `devices` to the number of data replicas to run. For example, the command to run 8 data replicas over 8 GPUs is: ```bash torchrun --nproc-per-node=8 --no-python lm_eval \ --model nemo_lm \ --model_args path=<path_to_nemo_model>,devices=8 \ --tasks hellaswag \ --batch_size 32 ``` 1) To enable tensor and/or pipeline parallelism, set the `model_args` of `tensor_model_parallel_size` and/or `pipeline_model_parallel_size`. In addition, you also have to set up `devices` to be equal to the product of `tensor_model_parallel_size` and/or `pipeline_model_parallel_size`. For example, the command to use one node of 4 GPUs with tensor parallelism of 2 and pipeline parallelism of 2 is: ```bash torchrun --nproc-per-node=4 --no-python lm_eval \ --model nemo_lm \ --model_args path=<path_to_nemo_model>,devices=4,tensor_model_parallel_size=2,pipeline_model_parallel_size=2 \ --tasks hellaswag \ --batch_size 32 ``` Note that it is recommended to substitute the `python` command by `torchrun --nproc-per-node=<number of devices> --no-python` to facilitate loading the model into the GPUs. This is especially important for large checkpoints loaded into multiple GPUs. Not supported yet: multi-node evaluation and combinations of data replication with tensor or pipeline parallelism. ### Megatron-LM models [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) is NVIDIA's large-scale transformer training framework. 
This backend allows direct evaluation of Megatron-LM checkpoints without conversion. **Requirements:** - Megatron-LM must be installed or accessible via `MEGATRON_PATH` environment variable - PyTorch with CUDA support **Setup:** ```bash # Set environment variable pointing to Megatron-LM installation export MEGATRON_PATH=/path/to/Megatron-LM ``` **Basic usage (single GPU):** ```bash lm_eval --model megatron_lm \ --model_args load=/path/to/checkpoint,tokenizer_type=HuggingFaceTokenizer,tokenizer_model=/path/to/tokenizer \ --tasks hellaswag \ --batch_size 1 ``` **Supported checkpoint formats:** - Standard Megatron checkpoints (`model_optim_rng.pt`) - Distributed checkpoints (`.distcp` format, auto-detected) #### Parallelism Modes The Megatron-LM backend supports the following parallelism modes: | Mode | Configuration | Description | |------|---------------|-------------| | Single GPU | `devices=1` (default) | Standard single GPU evaluation | | Data Parallelism | `devices>1, TP=1` | Each GPU has a full model replica, data is distributed | | Tensor Parallelism | `TP == devices` | Model layers are split across GPUs | | Expert Parallelism | `EP == devices, TP=1` | For MoE models, experts are distributed across GPUs | > [!Note] > - Pipeline Parallelism (PP > 1) is not currently supported. > - Expert Parallelism (EP) cannot be combined with Tensor Parallelism (TP). 
**Data Parallelism (4 GPUs, each with full model replica):** ```bash torchrun --nproc-per-node=4 -m lm_eval --model megatron_lm \ --model_args load=/path/to/checkpoint,tokenizer_model=/path/to/tokenizer,devices=4 \ --tasks hellaswag ``` **Tensor Parallelism (TP=2):** ```bash torchrun --nproc-per-node=2 -m lm_eval --model megatron_lm \ --model_args load=/path/to/checkpoint,tokenizer_model=/path/to/tokenizer,devices=2,tensor_model_parallel_size=2 \ --tasks hellaswag ``` **Expert Parallelism for MoE models (EP=4):** ```bash torchrun --nproc-per-node=4 -m lm_eval --model megatron_lm \ --model_args load=/path/to/moe_checkpoint,tokenizer_model=/path/to/tokenizer,devices=4,expert_model_parallel_size=4 \ --tasks hellaswag ``` **Using extra_args for additional Megatron options:** ```bash lm_eval --model megatron_lm \ --model_args load=/path/to/checkpoint,tokenizer_model=/path/to/tokenizer,extra_args="--no-rope-fusion --trust-remote-code" \ --tasks hellaswag ``` > [!Note] > The `--use-checkpoint-args` flag is enabled by default, which loads model architecture parameters from the checkpoint. For checkpoints converted via Megatron-Bridge, this typically includes all necessary model configuration. #### Multi-GPU evaluation with OpenVINO models Pipeline parallelism during evaluation is supported with OpenVINO models. To enable pipeline parallelism, set the `model_args` of `pipeline_parallel`. In addition, you also have to set `device` to a value of the form `HETERO:<GPU index1>,<GPU index2>`, for example `HETERO:GPU.1,GPU.0`. For example, the command to use pipeline parallelism of 2 is: ```bash lm_eval --model openvino \ --tasks wikitext \ --model_args pretrained=<path_to_ov_model>,pipeline_parallel=True \ --device HETERO:GPU.1,GPU.0 ``` ### Tensor + Data Parallel and Optimized Inference with `vLLM` We also support vLLM for faster inference on [supported model types](https://docs.vllm.ai/en/latest/models/supported_models.html), especially faster when splitting a model across multiple GPUs. 
For single-GPU or multi-GPU — tensor parallel, data parallel, or a combination of both — inference, for example: ```bash lm_eval --model vllm \ --model_args pretrained={model_name},tensor_parallel_size={GPUs_per_model},dtype=auto,gpu_memory_utilization=0.8,data_parallel_size={model_replicas} \ --tasks lambada_openai \ --batch_size auto ``` To use vllm, do `pip install "lm_eval[vllm]"`. For a full list of supported vLLM configurations, please reference our [vLLM integration](https://github.com/EleutherAI/lm-evaluation-harness/blob/e74ec966556253fbe3d8ecba9de675c77c075bce/lm_eval/models/vllm_causallms.py) and the vLLM documentation. vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation and provide a [script](./scripts/model_comparator.py) for checking the validity of vllm results against HF. > [!Tip] > For fastest performance, we recommend using `--batch_size auto` for vLLM whenever possible, to leverage its continuous batching functionality! > [!Tip] > Passing `max_model_len=4096` or some other reasonable default to vLLM through model args may cause speedups or prevent out-of-memory errors when trying to use auto batch size, such as for Mistral-7B-v0.1 which defaults to a maximum length of 32k. ### Tensor + Data Parallel and Fast Offline Batching Inference with `SGLang` We support SGLang for efficient offline batch inference. Its **[Fast Backend Runtime](https://docs.sglang.ai/index.html)** delivers high performance through optimized memory management and parallel processing techniques. Key features include tensor parallelism, continuous batching, and support for various quantization methods (FP8/INT4/AWQ/GPTQ). To use SGLang as the evaluation backend, please **install it in advance** via SGLang documents [here](https://docs.sglang.io/get_started/install.html#install-sglang). 
> [!Tip] > Due to the installation method of [`Flashinfer`](https://docs.flashinfer.ai/) -- a fast attention kernel library -- we don't include the dependencies of `SGLang` within [pyproject.toml](pyproject.toml). Note that `Flashinfer` also has some requirements on the `torch` version. SGLang's server arguments are slightly different from other backends, see [here](https://docs.sglang.io/advanced_features/server_arguments.html) for more information. We provide an example of the usage here: ```bash lm_eval --model sglang \ --model_args pretrained={model_name},dp_size={data_parallel_size},tp_size={tensor_parallel_size},dtype=auto \ --tasks gsm8k_cot \ --batch_size auto ``` > [!Tip] > When encountering out-of-memory (OOM) errors (especially for multiple-choice tasks), try these solutions: > > 1. Use a manual `batch_size`, rather than `auto`. > 2. Lower KV cache pool memory usage by adjusting `mem_fraction_static` - Add to your model arguments for example `--model_args pretrained=...,mem_fraction_static=0.7`. > 3. Increase tensor parallel size `tp_size` (if using multiple GPUs). ### Windows ML We support **Windows ML** for hardware-accelerated inference on Windows platforms. This enables evaluation on CPU, GPU, and **NPU (Neural Processing Unit)** devices. For an overview of Windows ML, see https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/overview To use Windows ML, install the required dependencies: ```bash pip install wasdk-Microsoft.Windows.AI.MachineLearning[all] wasdk-Microsoft.Windows.ApplicationModel.DynamicDependency.Bootstrap onnxruntime-windowsml onnxruntime-genai-winml ``` Evaluate an ONNX Runtime GenAI LLM on NPU/GPU/CPU on Windows: ```bash lm_eval --model winml \ --model_args pretrained=/path/to/onnx/model \ --tasks mmlu \ --batch_size 1 ``` > [!Note] > The Windows ML backend ONLY supports the ONNX Runtime GenAI model format. Models targeting `transformers.js` won't work. You can verify the format by checking for a `genai_config.json` file in the model folder. 
> [!Note] > To run an ONNX Runtime GenAI model on the target device, you MUST convert the original model to that vendor and device type. Converted models won't work / work well on other vendor or device types. To learn more on model conversion, please visit [Microsoft AI Tool Kit](https://code.visualstudio.com/docs/intelligentapps/modelconversion) ### Model APIs and Inference Servers > [!Important] > To use API-based models, first install: `pip install "lm_eval[api]"` Our library also supports the evaluation of models served via several commercial APIs, and we hope to implement support for the most commonly used performant local/self-hosted inference servers. To call a hosted model, use: ```bash export OPENAI_API_KEY=YOUR_KEY_HERE lm_eval --model openai-completions \ --model_args model=davinci-002 \ --tasks lambada_openai,hellaswag ``` We also support using your own local inference server with servers that mirror the OpenAI Completions and ChatCompletions APIs. ```bash lm_eval --model local-completions --tasks gsm8k --model_args model=facebook/opt-125m,base_url=http://{yourip}:8000/v1/completions,num_concurrent=1,max_retries=3,tokenized_requests=False,batch_size=16 ``` Note that for externally hosted models, configs such as `--device` which relate to where to place a local model should not be used and do not function. Just like you can use `--model_args` to pass arbitrary arguments to the model constructor for local models, you can use it to pass arbitrary arguments to the model API for hosted models. See the documentation of the hosting service for information on what arguments they support. | API or Inference Server | Implemented? 
| `--model ` name | Models supported: | Request Types: | |---------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------| | OpenAI Completions | :heavy_check_mark: | `openai-completions`, `local-completions` | All OpenAI Completions API models | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | OpenAI ChatCompletions | :heavy_check_mark: | `openai-chat-completions`, `local-chat-completions` | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt) | `generate_until` (no logprobs) | | Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) | | Anthropic Chat | :heavy_check_mark: | `anthropic-chat`, `anthropic-chat-completions` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/docs/models-overview) | `generate_until` (no logprobs) | | Textsynth | :heavy_check_mark: | `textsynth` | [All supported engines](https://textsynth.com/documentation.html#engines) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | Cohere | [:hourglass: - blocked on Cohere API bug](https://github.com/EleutherAI/lm-evaluation-harness/pull/395) | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, 
`loglikelihood_rolling` | | [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp) | `generate_until`, `loglikelihood`, (perplexity evaluation not yet implemented) | | vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | Huggingface Optimum (Causal LMs) | :heavy_check_mark: | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | Huggingface Optimum-intel IPEX (Causal LMs) | :heavy_check_mark: | `ipex` | Any decoder-only AutoModelForCausalLM | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | Huggingface Optimum-habana (Causal LMs) | :heavy_check_mark: | `habana` | Any decoder-only AutoModelForCausalLM | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | Neuron via AWS Inf2 (Causal LMs) | :heavy_check_mark: | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | NVIDIA NeMo | :heavy_check_mark: | `nemo_lm` | [All supported models](https://docs.nvidia.com/nemo-framework/user-guide/24.09/nemotoolkit/core/core.html#nemo-models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | NVIDIA Megatron-LM | :heavy_check_mark: | `megatron_lm` | [Megatron-LM GPT 
models](https://github.com/NVIDIA/Megatron-LM) (standard and distributed checkpoints) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | Watsonx.ai | :heavy_check_mark: | `watsonx_llm` | [Supported Watsonx.ai Engines](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx) | `generate_until`, `loglikelihood` | | Windows ML | :heavy_check_mark: | `winml` | [ONNX models in GenAI format](https://code.visualstudio.com/docs/intelligentapps/modelconversion) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | [Your local inference server!](docs/API_guide.md) | :heavy_check_mark: | `local-completions` or `local-chat-completions` | Support for OpenAI API-compatible servers, with easy customization for other APIs. | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | Models which do not supply logits or logprobs can be used with tasks of type `generate_until` only, while local models, or APIs that supply logprobs/logits of their prompts, can be run on all task types: `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`. For more information on the different task `output_types` and model request types, see [our documentation](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md#interface). > [!Note] > For best performance with closed chat model APIs such as Anthropic Claude 3 and GPT-4, we recommend carefully looking at a few sample outputs using `--limit 10` first to confirm answer extraction and scoring on generative tasks is performing as expected. Providing `system="<your system prompt>"` within `--model_args` for anthropic-chat-completions, to instruct the model what format to respond in, may be useful. ### Other Frameworks A number of other libraries contain scripts for calling the eval harness through their library.
These include [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py), [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md), and [mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/blob/master/eval_harness.py). To create your own custom integration you can follow instructions from [this tutorial](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage). ### Additional Features > [!Note] > For tasks unsuitable for direct evaluation — either due to risks associated with executing untrusted code or complexities in the evaluation process — the `--predict_only` flag is available to obtain decoded generations for post-hoc evaluation. If you have a Metal compatible Mac, you can run the eval harness using the MPS back-end by replacing `--device cuda:0` with `--device mps` (requires PyTorch version 2.1 or higher). **Note that the PyTorch MPS backend is still in early stages of development, so correctness issues or unsupported operations may exist. If you observe oddities in model performance on the MPS back-end, we recommend first checking that a forward pass of your model on `--device cpu` and `--device mps` match.** > [!Note] > You can inspect what the LM inputs look like by running the following command: > > ```bash > python write_out.py \ > --tasks <task1,task2,...> \ > --num_fewshot 5 \ > --num_examples 10 \ > --output_base_path /path/to/output/folder > ``` > > This will write out one text file for each task.
To verify the data integrity of the tasks you're performing in addition to running the tasks themselves, you can use the `--check_integrity` flag: ```bash lm_eval --model openai \ --model_args engine=davinci-002 \ --tasks lambada_openai,hellaswag \ --check_integrity ``` ## Advanced Usage Tips For models loaded with the HuggingFace `transformers` library, any arguments provided via `--model_args` get passed to the relevant constructor directly. This means that anything you can do with `AutoModel` can be done with our library. For example, you can pass a local path via `pretrained=` or use models finetuned with [PEFT](https://github.com/huggingface/peft) by taking the call you would run to evaluate the base model and add `,peft=PATH` to the `model_args` argument: ```bash lm_eval --model hf \ --model_args pretrained=EleutherAI/gpt-j-6b,parallelize=True,load_in_4bit=True,peft=nomic-ai/gpt4all-j-lora \ --tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \ --device cuda:0 ``` Models provided as delta weights can be easily loaded using the Hugging Face transformers library. 
Within `--model_args`, set the `delta` argument to specify the delta weights, and use the `pretrained` argument to designate the relevant base model to which they will be applied: ```bash lm_eval --model hf \ --model_args pretrained=Ejafa/llama_7B,delta=lmsys/vicuna-7b-delta-v1.1 \ --tasks hellaswag ``` GPTQ quantized models can be loaded using [GPTQModel](https://github.com/ModelCloud/GPTQModel) (faster) or [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ). GPTQModel: add `,gptqmodel=True` to `model_args`: ```bash lm_eval --model hf \ --model_args pretrained=model-name-or-path,gptqmodel=True \ --tasks hellaswag ``` AutoGPTQ: add `,autogptq=True` to `model_args`: ```bash lm_eval --model hf \ --model_args pretrained=model-name-or-path,autogptq=model.safetensors,gptq_use_triton=True \ --tasks hellaswag ``` We support wildcards in task names; for example, you can run all of the machine-translated lambada tasks via `--tasks lambada_openai_mt_*`. ## Saving & Caching Results To save evaluation results provide an `--output_path`. We also support logging model responses with the `--log_samples` flag for post-hoc analysis. > [!TIP] > Use `--use_cache <DIR>` to cache evaluation results and skip previously evaluated samples when resuming runs of the same (model, task) pairs. Note that caching is rank-dependent, so restart with the same GPU count if interrupted. You can also use `--cache_requests` to save dataset preprocessing steps for faster evaluation resumption. To push results and samples to the Hugging Face Hub, first ensure an access token with write access is set in the `HF_TOKEN` environment variable. Then, use the `--hf_hub_log_args` flag to specify the organization, repository name, repository visibility, and whether to push results and samples to the Hub - [example dataset on the HF Hub](https://huggingface.co/datasets/KonradSzafer/lm-eval-results-demo).
For instance: ```bash lm_eval --model hf \ --model_args pretrained=model-name-or-path,autogptq=model.safetensors,gptq_use_triton=True \ --tasks hellaswag \ --log_samples \ --output_path results \ --hf_hub_log_args hub_results_org=EleutherAI,hub_repo_name=lm-eval-results,push_results_to_hub=True,push_samples_to_hub=True,public_repo=False \ ``` This allows you to easily download the results and samples from the Hub, using: ```python from datasets import load_dataset load_dataset("EleutherAI/lm-eval-results-private", "hellaswag", "latest") ``` For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md) guide in our documentation! ## Visualizing Results You can seamlessly visualize and analyze the results of your evaluation harness runs using both Weights & Biases (W&B) and Zeno. ### Zeno You can use [Zeno](https://zenoml.com) to visualize the results of your eval harness runs. First, head to [hub.zenoml.com](https://hub.zenoml.com) to create an account and get an API key [on your account page](https://hub.zenoml.com/account). Add this key as an environment variable: ```bash export ZENO_API_KEY=[your api key] ``` You'll also need to install the `lm_eval[zeno]` package extra. To visualize the results, run the eval harness with the `log_samples` and `output_path` flags. We expect `output_path` to contain multiple folders that represent individual model names. You can thus run your evaluation on any number of tasks and models and upload all of the results as projects on Zeno. 
```bash lm_eval \ --model hf \ --model_args pretrained=EleutherAI/gpt-j-6B \ --tasks hellaswag \ --device cuda:0 \ --batch_size 8 \ --log_samples \ --output_path output/gpt-j-6B ``` Then, you can upload the resulting data using the `zeno_visualize` script: ```bash python scripts/zeno_visualize.py \ --data_path output \ --project_name "Eleuther Project" ``` This will use all subfolders in `data_path` as different models and upload all tasks within these model folders to Zeno. If you run the eval harness on multiple tasks, the `project_name` will be used as a prefix and one project will be created per task. You can find an example of this workflow in [examples/visualize-zeno.ipynb](examples/visualize-zeno.ipynb). ### Weights and Biases With the [Weights and Biases](https://wandb.ai/site) integration, you can now spend more time extracting deeper insights into your evaluation results. The integration is designed to streamline the process of logging and visualizing experiment results using the Weights & Biases (W&B) platform. The integration provides functionality - to automatically log the evaluation results, - log the samples as W&B Tables for easy visualization, - log the `results.json` file as an artifact for version control, - log the `_eval_samples.json` file if the samples are logged, - generate a comprehensive report for analysis and visualization with all the important metrics, - log task and cli specific configs, - and more out of the box like the command used to run the evaluation, GPU/CPU counts, timestamp, etc. First you'll need to install the `lm_eval[wandb]` package extra. Do `pip install lm_eval[wandb]`. Authenticate your machine with your unique W&B token. Visit <https://wandb.ai/authorize> to get one. Do `wandb login` in your command line terminal. Run eval harness as usual with a `wandb_args` flag. Use this flag to provide arguments for initializing a wandb run ([wandb.init](https://docs.wandb.ai/ref/python/init)) as comma separated string arguments.
```bash lm_eval \ --model hf \ --model_args pretrained=microsoft/phi-2,trust_remote_code=True \ --tasks hellaswag,mmlu_abstract_algebra \ --device cuda:0 \ --batch_size 8 \ --output_path output/phi-2 \ --limit 10 \ --wandb_args project=lm-eval-harness-integration \ --log_samples ``` In the stdout, you will find the link to the W&B run page as well as a link to the generated report. You can find an example of this workflow in [examples/visualize-wandb.ipynb](examples/visualize-wandb.ipynb), and an example of how to integrate it beyond the CLI. ## Contributing Check out our [open issues](https://github.com/EleutherAI/lm-evaluation-harness/issues) and feel free to submit pull requests! For more information on the library and how everything fits together, see our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs). To get started with development, first clone the repository and install the dev dependencies: ```bash git clone https://github.com/EleutherAI/lm-evaluation-harness cd lm-evaluation-harness pip install -e ".[dev,hf]" ``` ### Implementing new tasks To implement a new task in the eval harness, see [this guide](./docs/new_task_guide.md). In general, we follow this priority list for addressing concerns about prompting and other eval details: 1. If there is widespread agreement among people who train LLMs, use the agreed upon procedure. 2. If there is a clear and unambiguous official implementation, use that procedure. 3. If there is widespread agreement among people who evaluate LLMs, use the agreed upon procedure. 4. If there are multiple common implementations but not universal or widespread agreement, use our preferred option among the common implementations. As before, prioritize choosing from among the implementations found in LLM training papers. These are guidelines and not rules, and can be overruled in special circumstances.
We try to prioritize agreement with the procedures used by other groups to decrease the harm when people inevitably compare runs across different papers despite our discouragement of the practice. Historically, we also prioritized the implementation from [Language Models are Few Shot Learners](https://arxiv.org/abs/2005.14165) as our original goal was specifically to compare results with that paper. ### Support The best way to get support is to open an issue on this repo or join the [EleutherAI Discord server](https://discord.gg/eleutherai). The `#lm-thunderdome` channel is dedicated to developing this project and the `#release-discussion` channel is for receiving support for our releases. If you've used the library and have had a positive (or negative) experience, we'd love to hear from you! ## Optional Extras Extras dependencies can be installed via `pip install -e ".[NAME]"` ### Model Backends These extras install dependencies required to run specific model backends: | NAME | Description | |----------------|--------------------------------------------------| | hf | HuggingFace Transformers (torch, transformers, accelerate, peft) | | vllm | vLLM fast inference | | api | API models (OpenAI, Anthropic, local servers) | | gptq | AutoGPTQ quantized models | | gptqmodel | GPTQModel quantized models | | ibm_watsonx_ai | IBM watsonx.ai models | | ipex | Intel IPEX backend | | habana | Intel Gaudi backend | | optimum | Intel OpenVINO models | | neuronx | AWS Inferentia2 instances | | winml | Windows ML (ONNX Runtime GenAI) - CPU/GPU/NPU | | sparsify | Sparsify model steering | | sae_lens | SAELens model steering | ### Task Dependencies These extras install dependencies required for specific evaluation tasks: | NAME | Description | |----------------------|--------------------------------| | tasks | All task-specific dependencies | | acpbench | ACP Bench tasks | | audiolm_qwen | Qwen2 audio models | | ifeval | IFEval task | | japanese_leaderboard | Japanese LLM tasks | | 
longbench | LongBench tasks | | math | Math answer checking | | multilingual | Multilingual tokenizers | | ruler | RULER tasks | ### Development & Utilities | NAME | Description | |---------------|--------------------------------| | dev | Linting & contributions | | hf_transfer | Speed up HF downloads | | sentencepiece | Sentencepiece tokenizer | | unitxt | Unitxt tasks | | wandb | Weights & Biases logging | | zeno | Zeno result visualization | ## Cite as ```text @misc{eval-harness, author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy}, title = {The Language Model Evaluation Harness}, month = 07, year = 2024, publisher = {Zenodo}, version = {v0.4.3}, doi = {10.5281/zenodo.12608602}, url = {https://zenodo.org/records/12608602} } ``` ================================================ FILE: docs/API_guide.md ================================================ # TemplateAPI Usage Guide The `TemplateAPI` class is a versatile superclass designed to facilitate the integration of various API-based language models into the lm-evaluation-harness framework. This guide will explain how to use and extend the `TemplateAPI` class to implement your own API models. If your API implements the OpenAI API you can use the `local-completions` or the `local-chat-completions` (defined [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py)) model types, which can also serve as examples of how to effectively subclass this template. ## Overview The `TemplateAPI` class provides a template for creating API-based model implementations. 
It handles common functionalities such as: - Tokenization (optional) - Batch processing - Caching - Retrying failed requests - Parsing API responses To use this class, you typically need to subclass it and implement specific methods for your API. ## Key Methods to Implement When subclassing `TemplateAPI`, you need to implement the following methods: 1. `_create_payload`: Creates the JSON payload for API requests. 2. `parse_logprobs`: Parses log probabilities from API responses. 3. `parse_generations`: Parses generated text from API responses. Optional Properties: 4. `header`: Returns the headers for the API request. 5. `api_key`: Returns the API key for authentication (if required). You may also need to override other methods or properties depending on your API's specific requirements. > [!NOTE] > Currently loglikelihood and MCQ based tasks (such as MMLU) are only supported for completion endpoints. Not for chat-completion — those that expect a list of dicts — endpoints! Completion APIs which support instruct tuned models can be evaluated with the `--apply_chat_template` option in order to simultaneously evaluate models using a chat template format while still being able to access the model logits needed for loglikelihood-based tasks. ## TemplateAPI Arguments When initializing a `TemplateAPI` instance or a subclass, you can provide several arguments to customize its behavior. Here's a detailed explanation of some important arguments: - `model` or `pretrained` (str): - The name or identifier of the model to use. - `model` takes precedence over `pretrained` when both are provided. - `base_url` (str): - The base URL for the API endpoint. - `tokenizer` (str, optional): - The name or path of the tokenizer to use. - If not provided, it defaults to using the same tokenizer name as the model. - `num_concurrent` (int): - Number of concurrent requests to make to the API. - Useful for APIs that support parallel processing. - Default is 1 (sequential processing). 
- `timeout` (int, optional): - Timeout for API requests in seconds. - Default is 30. - `tokenized_requests` (bool): - Determines whether the input is pre-tokenized. Defaults to `True`. - Requests can be sent in either tokenized form (`list[list[int]]`) or as text (`list[str]`, or `str` for batch_size=1). - For loglikelihood-based tasks, prompts require tokenization to calculate the context length. If `False` prompts are decoded back to text before being sent to the API. - Not as important for `generate_until` tasks. - Ignored for chat formatted inputs (list[dict...]) or if tokenizer_backend is None. - `tokenizer_backend` (str, optional): - Required for loglikelihood-based or MCQ tasks. - Specifies the tokenizer library to use. Options are "tiktoken", "huggingface", or None. - Default is "huggingface". - `max_length` (int, optional): - Maximum length of input + output. - Default is 2048. - `max_retries` (int, optional): - Maximum number of retries for failed API requests. - Default is 3. - `max_gen_toks` (int, optional): - Maximum number of tokens to generate in completion tasks. - Default is 256 or set in task yaml. - `batch_size` (int or str, optional): - Number of requests to batch together (if the API supports batching). - Can be an integer or "auto" (which defaults to 1 for API models). - Default is 1. - `seed` (int, optional): - Random seed for reproducibility. - Default is 1234. - `add_bos_token` (bool, optional): - Whether to add the beginning-of-sequence token to inputs (when tokenizing). - Default is False. - `custom_prefix_token_id` (int, optional): - Custom token ID to use as a prefix for inputs. - If not provided, uses the model's default BOS or EOS token (if `add_bos_token` is True). - `verify_certificate` (bool, optional): - Whether to validate the certificate of the API endpoint (if HTTPS). - Default is True. - `header` (dict, optional): - Custom headers for API requests. 
- If not provided, uses `{"Authorization": f"Bearer {self.api_key}"}` by default. Example usage: ```python class MyAPIModel(TemplateAPI): def __init__(self, **kwargs): super().__init__( model="my-model", base_url="https://api.mymodel.com/v1/completions", tokenizer_backend="huggingface", num_concurrent=5, max_retries=5, batch_size=10, **kwargs ) # Implement other required methods... ``` When subclassing `TemplateAPI`, you can override these arguments in your `__init__` method to set default values specific to your API. You can also add additional (potentially user-specified) arguments as needed for your specific implementation. ## Example Implementation: OpenAI API The `OpenAICompletionsAPI` and `OpenAIChatCompletion` ([here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py)) classes demonstrate how to implement API models using the `TemplateAPI` class. Here's a breakdown of the key components: ### 1. Subclassing and Initialization ```python @register_model("openai-completions") class OpenAICompletionsAPI(LocalCompletionsAPI): def __init__( self, base_url="https://api.openai.com/v1/completions", tokenizer_backend="tiktoken", **kwargs, ): super().__init__( base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs ) ``` ### 2. Implementing API Key Retrieval ```python @cached_property def api_key(self): key = os.environ.get("OPENAI_API_KEY", None) if key is None: raise ValueError( "API key not found. Please set the OPENAI_API_KEY environment variable." ) return key ``` ### 3. Creating the Payload ```python def _create_payload( self, messages: Union[List[List[int]], List[dict], List[str], str], generate=False, gen_kwargs: Optional[dict] = None, **kwargs, ) -> dict: if generate: # ... (implementation for generation) else: # ... (implementation for log likelihood) ``` ### 4.
Parsing API Responses ```python @staticmethod def parse_logprobs( outputs: Union[Dict, List[Dict]], tokens: List[List[int]] = None, ctxlens: List[int] = None, **kwargs, ) -> List[Tuple[float, bool]]: # ... (implementation) @staticmethod def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]: # ... (implementation) ``` The requests are initiated in the `model_call` or the `amodel_call` methods. ## Implementing Your Own API Model To implement your own API model: 1. Subclass `TemplateAPI` or one of its subclasses (e.g., `LocalCompletionsAPI`). 2. Override the `__init__` method if you need to set specific parameters. 3. Implement the `_create_payload` and `header` methods to create the appropriate payload for your API. 4. Implement the `parse_logprobs` and `parse_generations` methods to parse your API's responses. 5. Override the `api_key` property if your API requires authentication. 6. Override any other methods as necessary to match your API's behavior. ## Best Practices 1. Use the `@register_model` decorator to register your model with the framework (and import it in `lm_eval/models/__init__.py`!). 2. Use environment variables for sensitive information like API keys. 3. Properly handle batching and concurrent requests if supported by your API. ================================================ FILE: docs/CONTRIBUTING.md ================================================ # Contributing to LM Evaluation Harness Welcome and thank you for your interest in the LM Evaluation Harness! We welcome contributions and feedback and appreciate your time spent with our library, and hope you find it useful! 
## Important Resources There are several places information about LM Evaluation Harness is located: - Our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs) - We occasionally use [GitHub Milestones](https://github.com/EleutherAI/lm-evaluation-harness/milestones) to track progress toward specific near-term version releases. - We maintain a [Project Board](https://github.com/orgs/EleutherAI/projects/25) for tracking current work items and PRs, and for future roadmap items or feature requests. - Further discussion and support conversations are located in the #lm-thunderdome channel of the [EleutherAI discord](https://discord.gg/eleutherai). ## Code Style LM Evaluation Harness uses [ruff](https://github.com/astral-sh/ruff) for linting via [pre-commit](https://pre-commit.com/). You can install linters and dev tools via ```pip install lm_eval[dev]``` or ```pip install -e ".[dev]"``` Then, run ```pre-commit install``` in order to ensure linters and other checks will be run upon committing. ## Testing We use [pytest](https://docs.pytest.org/en/latest/) for running unit tests. All library unit tests can be run via: ```bash python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_openvino.py ``` ## Verbose logging You can enable verbose logging with the environment variable `LMEVAL_LOG_LEVEL="debug"`. ## Contributor License Agreement We ask that new contributors agree to a Contributor License Agreement affirming that EleutherAI has the rights to use your contribution to our library. First-time pull requests will have a reply added by @CLAassistant containing instructions for how to confirm this, and we require it before merging your PR. ## Contribution Best Practices We recommend a few best practices to make your contributions or reported errors easier to assist with. **For Pull Requests:** - PRs should be titled descriptively, and be opened with a brief description of the scope and intent of the new contribution. 
- New features should have appropriate documentation added alongside them. - Aim for code maintainability, and minimize code copying. - If opening a task, try to share test results on the task using a publicly-available model, and if any public results are available on the task, compare to them. **For Feature Requests:** - Provide a short paragraph's worth of description. What is the feature you are requesting? What is its motivation, and an example use case of it? How does this differ from what is currently supported? **For Bug Reports**: - Provide a short description of the bug. - Provide a *reproducible example*--what is the command you run with our library that results in this error? Have you tried any other steps to resolve it? - Provide a *full error traceback* of the error that occurs, if applicable. A one-line error message or small screenshot snippet is unhelpful without the surrounding context. - Note what version of the codebase you are using, and any specifics of your environment and setup that may be relevant. **For Requesting New Tasks**: - Provide a 1-2 sentence description of what the task is and what it evaluates. - Provide a link to the paper introducing the task. - Provide a link to where the dataset can be found. - Provide a link to a paper containing results on an open-source model on the task, for use in comparisons and implementation validation. - If applicable, link to any codebase that has implemented the task (especially the original publication's codebase, if existent). ## How Can I Get Involved? To quickly get started, we maintain a list of good first issues, which can be found [on our project board](https://github.com/orgs/EleutherAI/projects/25/views/8) or by [filtering GH Issues](https://github.com/EleutherAI/lm-evaluation-harness/issues?q=is%3Aopen+label%3A%22good+first+issue%22+label%3A%22help+wanted%22). 
These are typically smaller code changes or self-contained features which can be added without extensive familiarity with library internals, and we recommend new contributors consider taking a stab at one of these first if they are feeling uncertain where to begin. There are a number of distinct ways to contribute to LM Evaluation Harness, and all are extremely helpful! A sampling of ways to contribute includes: - **Implementing and verifying new evaluation tasks**: Is there a task you'd like to see LM Evaluation Harness support? Consider opening an issue requesting it, or helping add it! Verifying and cross-checking task implementations with their original versions is also a very valuable form of assistance in ensuring standardized evaluation. - **Improving documentation** - Improvements to the documentation, or noting pain points / gaps in documentation, are helpful in order for us to improve the user experience of the library and clarity + coverage of documentation. - **Testing and devops** - We are very grateful for any assistance in adding tests for the library that can be run for new PRs, and other devops workflows. - **Adding new modeling / inference library integrations** - We hope to support a broad range of commonly-used inference libraries popular among the community, and welcome PRs for new integrations, so long as they are documented properly and maintainable. - **Proposing or Contributing New Features** - We want LM Evaluation Harness to support a broad range of evaluation use cases. If you have a feature that is not currently supported but desired, feel free to open an issue describing the feature and, if applicable, how you intend to implement it. We would be happy to give feedback on the cleanest way to implement new functionalities and are happy to coordinate with interested contributors via GH discussions or via Discord. We hope that this has been helpful, and appreciate your interest in contributing!
Further questions can be directed to [our Discord](https://discord.gg/eleutherai). ================================================ FILE: docs/README.md ================================================ # Eval Harness Documentation Welcome to the docs for the LM Evaluation Harness! ## Table of Contents * To learn about the public interface of the library, as well as how to evaluate via the command line or as integrated into an external library, see the [Interface](./interface.md). * To learn how to add a new library, API, or model type to the library, as well as a quick explainer on the types of ways to evaluate an LM, see the [Model Guide](./model_guide.md). * For an extended description of how to extend the library to new model classes served over an API, see the [API Guide](./API_guide.md). * For a crash course on adding new tasks to the library, see our [New Task Guide](./new_task_guide.md). * To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Task Configuration Guide](./task_guide.md). ================================================ FILE: docs/chat-template-readme.md ================================================ # Chat Template Delimiter Handling Update ## Overview This change modifies how delimiters are handled when applying chat templates in the request construction process for likelihood and multiple-choice based tasks. When `apply_chat_template` is set to `True`, the target delimiter is now set to an empty string instead of using the configured delimiter. ## Background By default, the system uses a target delimiter (typically a whitespace " ") between the context and target text when constructing prompts. The full string is constructed as: ```text doc_to_text(doc) + target_delimiter + doc_to_target(doc) ``` While this worked well for base models where we wanted the model to predict a single whitespace followed by the answer, chat models have their own formatting conventions that handle spacing differently. 
## The Change - When `apply_chat_template=True`, the target delimiter is now empty ("") instead of the default whitespace - This prevents interference between chat template formatting and the default delimiter system - Particularly important for multiple choice tasks where the template itself handles spacing ## Example ```text # Before (with default delimiter " ") Question: What color is the sky?\nAnswer: blue # After Question: What color is the sky?\nAnswer:blue ``` ================================================ FILE: docs/config_files.md ================================================ # Configuration Guide This guide explains how to use YAML configuration files with `lm-eval` to define reusable evaluation settings. ## Overview Instead of passing many CLI arguments, you can define evaluation parameters in a YAML configuration file: ```bash # Instead of: lm-eval run --model hf --model_args pretrained=gpt2,dtype=float32 --tasks hellaswag arc_easy --num_fewshot 5 --batch_size 8 --device cuda:0 # Use: lm-eval run --config eval_config.yaml ``` CLI arguments override config file values, so you can set defaults in a config file and override specific settings: ```bash lm-eval run --config eval_config.yaml --tasks mmlu --limit 100 ``` ## Quick Reference All configuration keys correspond directly to CLI arguments. See the [CLI Reference](interface.md#lm-eval-run) for detailed descriptions of each option. 
## Config Schema | Field | Type | Default | Description | |-------|------|---------|-------------| | `model` | string | `"hf"` | Model type/provider | | `model_args` | dict | `{}` | Model constructor arguments | | `tasks` | list/string | required | Tasks to evaluate | | `num_fewshot` | int/null | `null` | Few-shot example count | | `batch_size` | int/string | `1` | Batch size or "auto" | | `max_batch_size` | int/null | `null` | Max batch size for auto | | `device` | string/null | `"cuda:0"` | Device to use | | `limit` | float/null | `null` | Example limit per task | | `samples` | dict/null | `null` | Specific sample indices | | `use_cache` | string/null | `null` | Response cache path | | `cache_requests` | string/dict | `{}` | Request cache settings | | `output_path` | string/null | `null` | Results output path | | `log_samples` | bool | `false` | Save model I/O | | `predict_only` | bool | `false` | Skip metrics | | `apply_chat_template` | bool/string | `false` | Chat template | | `system_instruction` | string/null | `null` | System prompt | | `fewshot_as_multiturn` | bool/null | `null` | Multi-turn few-shot | | `include_path` | string/null | `null` | External tasks path | | `gen_kwargs` | dict | `{}` | Generation arguments | | `wandb_args` | dict | `{}` | W&B init arguments | | `hf_hub_log_args` | dict | `{}` | HF Hub logging | | `seed` | list/int | `[0,1234,1234,1234]` | Random seeds | | `trust_remote_code` | bool | `false` | Trust remote code | | `metadata` | dict | `{}` | Task metadata | --- ## Example ```yaml # basic_eval.yaml model: hf model_args: pretrained: gpt2 dtype: float32 tasks: - hellaswag - arc_easy num_fewshot: 0 batch_size: auto device: cuda:0 output_path: ./results/gpt2/ log_samples: true wandb_args: project: llm-evals name: mistral-7b-instruct tags: - mistral - instruct - production hf_hub_log_args: hub_results_org: my-org results_repo_name: llm-eval-results push_results_to_hub: true public_repo: false ``` --- ## Programmatic Usage For loading 
config files in Python, see the [Python API Guide](python-api.md#using-evaluatorconfig). --- ## Validation Validate your configuration before running: ```bash # Check that tasks exist lm-eval validate --tasks hellaswag,arc_easy # With external tasks lm-eval validate --tasks my_task --include_path /path/to/tasks ``` --- ## Tips 1. **Start simple**: Begin with minimal config and add options as needed 2. **Use CLI overrides**: Set defaults in config, override with CLI for experiments 3. **Separate concerns**: Create different configs for different model families or task sets 4. **Version control**: Commit config files alongside results for reproducibility 5. **Use comments**: YAML supports `#` comments to document your choices ================================================ FILE: docs/decontamination.md ================================================ # Decontamination ## Usage The provided directory should contain the ngram files and info.json produced in "Pile Ngram Generation" further down. ```bash python -m lm_eval \ --model gpt2 \ --device 0 \ --tasks sciq ``` ## Background Downstream evaluations test model generalization, and are less useful when test set data also exists in the training set, referred to as leakage or contamination. Filtering your training set against the test set is a good first step, however this isn't always possible, as in the case of a new benchmark or one that wasn't considered prior to model training. When training set filtering isn't possible, it is useful to measure the impact of test set leakage by detecting the contaminated test examples and producing a clean version of the benchmark. The basis for our decontamination procedure can be found in Appendix C of "Language Models are Few-Shot Learners". OpenAI defined a test document as contaminated if any N-gram overlap existed with any training document. They used a range of N values between 8 and 13 depending on dataset, while we just used 13 for simplicity. 
## Implementation Contamination detection can be found in `lm_eval/decontamination/decontaminate.py`, with supporting code in the rest of `lm_eval/decontamination/`. decontaminate.py does the following: 1. Build dictionaries of all ngrams and their corresponding evaluation/document ids. 2. Scan through sorted files containing training set n-grams. 3. If a match is found, the corresponding evaluation/document combinations are marked as contaminated. `lm_eval/evaluator.py` can then produce a clean version of the benchmark by excluding the results of contaminated documents. For each metric, a clean version will be shown in the results with a "decontaminate" suffix. This is disabled by default for new tasks; to support decontamination on a task, override the "should_decontaminate" and "doc_to_decontamination_query" methods. For more details see the [task guide](task_guide.md). ## Pile Ngram Generation The relevant scripts can be found in `scripts/clean_training_data`, which also import from `lm_eval/decontamination/`. 1. git clone https://github.com/EleutherAI/lm-evaluation-harness.git 2. pip install -r requirements.txt 3. Download The Pile from [The Eye](https://the-eye.eu/public/AI/pile/train/) 4. Place pile files in "pile" directory under "lm-evaluation-harness" (or create a symlink) 5. Run generate_13_grams. ```bash export PYTHONHASHSEED=0 python -m scripts/clean_training_data/generate_13_grams \ -dir path/to/working/directory \ -n 13 \ -buckets 500 ``` Took approximately 4 days for us. We had the time to wait, but this could be scaled out by doing partial pile scans on multiple instances of this script and merging the relevant buckets. We fixed PYTHONHASHSEED to ensure reproducibility of bucket hashing in case you need to stop and start. 6. Sort the generated 13-grams. ```bash python -m scripts/clean_training_data/sort_13_gram_buckets \ -dir path/to/working/directory/output ``` Took approximately 5 days for us. 
You could speed this up by spreading the files around to different machines and running the sort script before gathering them together. 7. Compress the sorted 13 grams files and place them together with info.json. This step only takes a few hours. ```bash python -m scripts/clean_training_data/compress_and_package \ -dir path/to/working/directory \ -output path/to/final/directory \ -procs 8 ``` ================================================ FILE: docs/footguns.md ================================================ # Common Pitfalls and Troubleshooting Guide This document highlights common pitfalls and troubleshooting tips when using this library. We'll continue to add more tips as we discover them. ## YAML Configuration Issues ### Newline Characters in YAML (`\n`) **Problem:** When specifying newline characters in YAML, they may be interpreted incorrectly depending on how you format them. ```yaml # ❌ WRONG: Single quotes don't process escape sequences generation_kwargs: until: ['\n'] # Gets parsed as the literal characters '\' and 'n' i.e "\\n" ``` ```yaml # ✅ RIGHT: Use double quotes for escape sequences generation_kwargs: until: ["\n"] # Gets parsed as an actual newline character ``` **Solutions:** - Use double quotes for strings containing escape sequences - For multiline content, use YAML's block scalars (`|` or `>`) - When generating YAML programmatically, be careful with how template engines handle escape sequences ### Quoting in YAML **When to use different types of quotes:** - **No quotes**: Simple values (numbers, booleans, alphanumeric strings without special characters) ```yaml simple_value: plain text number: 42 ``` - **Single quotes (')**: - Preserves literal values - Use when you need special characters to be treated literally - Escape single quotes by doubling them: `'It''s working'` ```yaml literal_string: 'The newline character \n is not processed here' path: 'C:\Users\name' # Backslashes preserved ``` - **Double quotes (")**: - Processes escape 
sequences like `\n`, `\t`, etc. - Use for strings that need special characters interpreted - Escape double quotes with backslash: `"He said \"Hello\""` ```yaml processed_string: "First line\nSecond line" # Creates actual newline unicode: "Copyright symbol: \u00A9" # Unicode character ``` ================================================ FILE: docs/interface.md ================================================ # User Guide This document details the interface exposed by `lm-eval` and provides details on what flags are available to users. ## Command-line Interface The `lm-eval` CLI is organized into subcommands: | Command | Description | |---------|-------------| | `lm-eval run` | Run evaluations on language models | | `lm-eval ls` | List available tasks, groups, subtasks, or tags | | `lm-eval validate` | Validate task configurations | Run the library via the `lm-eval` entrypoint or `python -m lm_eval`. Use `-h` or `--help` to see available options: ```bash lm-eval -h # Show all subcommands lm-eval run -h # Show options for run command lm-eval ls -h # Show options for list command ``` > **Legacy Compatibility**: The original single-command interface still works. Running `lm-eval --model hf --tasks hellaswag` automatically inserts the `run` subcommand. --- ## Quick Start ```bash # List available tasks lm-eval ls tasks # Basic evaluation lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag # With few-shot examples lm-eval run --model hf --model_args pretrained=gpt2 --tasks arc_easy --num_fewshot 5 # Save results and model outputs lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag --output_path ./results/ --log_samples # Use a config file lm-eval run --config eval_config.yaml ``` --- ## `lm-eval run` Run evaluations on language models. 
```bash lm-eval run --model --tasks [options] ``` ### Quick Examples ```bash # Basic evaluation with HuggingFace model lm-eval run --model hf --model_args pretrained=gpt2 dtype=float32 --tasks hellaswag # Multiple tasks with few-shot examples lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy arc_challenge --num_fewshot 5 # Custom generation parameters lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95 # Use a YAML configuration file lm-eval run --config my_config.yaml --tasks mmlu ``` ### Model and Tasks | Argument | Short | Description | |----------|-------|-------------| | `--model` | `-M` | Model type/provider name (default: `hf`). See [supported models](https://github.com/EleutherAI/lm-evaluation-harness#model-apis-and-inference-servers). | | `--model_args` | `-a` | Model constructor arguments as `key=val key2=val2` or `key=val,key2=val2`. For HuggingFace models, see [`HFLM`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py) for available arguments. | | `--tasks` | `-t` | Space or comma-separated list of task names or groups. Use `lm-eval ls tasks` to see available tasks. | | `--apply_chat_template` | | Apply chat template to prompts. Use without argument for default template, or specify template name. | | `--limit` | `-L` | Limit examples per task. Integer for count, float (0.0-1.0) for percentage. **For testing only.** | | `--use_cache` | `-c` | Path prefix for SQLite cache of model responses (e.g., `/path/to/cache_`). | ### Evaluation Settings | Argument | Short | Description | |----------|-------|-------------| | `--num_fewshot` | `-f` | Number of few-shot examples in context. | | `--batch_size` | `-b` | Batch size: integer, `auto`, or `auto:N` to auto-tune N times (default: 1). | | `--max_batch_size` | | Maximum batch size when using `--batch_size auto`. 
| | `--device` | | Device to use: `cuda`, `cuda:0`, `cpu`, `mps` (default: `cuda`). | | `--gen_kwargs` | | Generation arguments as `key=val key2=val2`. Values parsed with `ast.literal_eval`. Example: `temperature=0.8 'stop=["\n\n"]'` | ### Data and Output | Argument | Short | Description | |----------|-------|-------------| | `--output_path` | `-o` | Output directory or JSON file for results. Required with `--log_samples`. | | `--log_samples` | `-s` | Save all model inputs/outputs for post-hoc analysis. | | `--samples` | `-E` | JSON mapping task names to sample indices, e.g., `'{"task1": [0,1,2]}'`. Incompatible with `--limit`. | ### Caching and Performance | Argument | Description | |----------|-------------| | `--cache_requests` | Cache preprocessed prompts: `true`, `refresh`, or `delete`. Cached files stored in `lm_eval/cache/.cache` or path set by `LM_HARNESS_CACHE_PATH` env var. | | `--check_integrity` | Run task test suite validation before evaluation. | ### Prompt Formatting | Argument | Description | |----------|-------------| | `--system_instruction` | Custom system instruction prepended to prompts. | | `--fewshot_as_multiturn` | Format few-shot examples as multi-turn conversation. Auto-enabled with `--apply_chat_template`. Set to `false` to disable. | ### Task Management | Argument | Description | |----------|-------------| | `--include_path` | Additional directory containing external task YAML files. | ### Logging and Tracking | Argument | Short | Description | |----------|-------|-------------| | `--verbosity` | `-v` | **(Deprecated)** Use `LMEVAL_LOG_LEVEL` env var instead. | | `--write_out` | `-w` | Print prompts for first few documents (for debugging). | | `--show_config` | | Display full task configuration after evaluation. | | `--wandb_args` | | Weights & Biases arguments as `key=val`. E.g., `project=my-project name=run-1`. | | `--wandb_config_args` | | Additional W&B config arguments. | | `--hf_hub_log_args` | | HuggingFace Hub logging arguments. 
See [HF Hub Logging](#huggingface-hub-logging). | ### Advanced Options | Argument | Short | Description | |----------|-------|-------------| | `--predict_only` | `-x` | Save predictions only, skip metric computation. Implies `--log_samples`. | | `--seed` | | Random seeds as single integer or comma-separated list for `python,numpy,torch,fewshot`. Default: `0,1234,1234,1234`. Use `None` to skip. Example: `--seed 42` or `--seed 0,None,8,52`. | | `--trust_remote_code` | | Allow executing remote code from HuggingFace Hub. | | `--confirm_run_unsafe_code` | | Confirm understanding of risks for tasks executing arbitrary Python. | | `--metadata` | | JSON string passed to TaskConfig. Required for some tasks like RULER. Example: `--metadata '{"max_seq_length": 4096}'`. | ### Configuration File | Argument | Short | Description | |----------|-------|-------------| | `--config` | `-C` | Path to YAML configuration file. CLI arguments override config file values. See [Configuration Files](config_files.md). | ### HuggingFace Hub Logging The `--hf_hub_log_args` argument accepts these keys: | Key | Description | |-----|-------------| | `hub_results_org` | Organization name on HF Hub. Defaults to token owner. | | `details_repo_name` | Repository name for detailed results. | | `results_repo_name` | Repository name for aggregated results. | | `push_results_to_hub` | `True`/`False` - push results to Hub. | | `push_samples_to_hub` | `True`/`False` - push samples to Hub. Requires `--log_samples`. | | `public_repo` | `True`/`False` - make repository public. | | `leaderboard_url` | URL to associated leaderboard. | | `point_of_contact` | Contact email for results dataset. | | `gated` | `True`/`False` - gate the details dataset. | --- ## `lm-eval ls` List available tasks, groups, subtasks, or tags. 
```bash lm-eval ls [tasks|groups|subtasks|tags] [--include_path DIR] ``` ### Arguments | Argument | Description | |----------|-------------| | `tasks` | List all available tasks (groups, subtasks, and tags). | | `groups` | List only task groups (e.g., `mmlu`, `glue`, `superglue`). | | `subtasks` | List only individual subtasks (e.g., `mmlu_anatomy`, `hellaswag`). | | `tags` | List task tags (e.g., `reasoning`, `knowledge`). | | `--include_path` | Additional directory for external task definitions. | ### Task Organization - **Groups**: Collections of related tasks with aggregated metrics across subtasks (e.g., `mmlu` contains 57 subtasks) - **Subtasks**: Individual evaluation tasks (e.g., `mmlu_anatomy`, `hellaswag`) - **Tags**: Categories for filtering tasks without aggregated metrics (e.g., `reasoning`, `language`) ### Examples ```bash # List all tasks lm-eval ls tasks # List only task groups lm-eval ls groups # Include external tasks lm-eval ls tasks --include_path /path/to/external/tasks ``` --- ## `lm-eval validate` Validate task configurations before running evaluations. ```bash lm-eval validate --tasks [--include_path DIR] ``` ### Arguments | Argument | Short | Description | |----------|-------|-------------| | `--tasks` | `-t` | **(Required)** Comma-separated list of task names to validate. | | `--include_path` | | Additional directory for external task definitions. 
| ### Validation Checks The validate command performs: - **Task existence**: Verifies all specified tasks are available - **Configuration syntax**: Checks YAML/JSON configuration files - **Dataset access**: Validates dataset paths and configurations - **Required fields**: Ensures all mandatory task parameters are present - **Metric definitions**: Verifies metric functions and aggregation methods - **Filter pipelines**: Validates filter chains and their parameters - **Template rendering**: Tests prompt templates with sample data ### Examples ```bash # Validate a single task lm-eval validate --tasks hellaswag # Validate multiple tasks lm-eval validate --tasks arc_easy,arc_challenge,hellaswag # Validate a task group lm-eval validate --tasks mmlu # Validate external tasks lm-eval validate --tasks my_custom_task --include_path ./custom_tasks ``` --- ## Python API For programmatic usage, see the [Python API Guide](python-api.md). --- ## Environment Variables | Variable | Description | |----------|-------------| | `LMEVAL_LOG_LEVEL` | Logging level (`DEBUG`, `INFO`, `WARNING`, `ERROR`). | | `LM_HARNESS_CACHE_PATH` | Path for cached requests (default: `lm_eval/cache/.cache`). | | `HF_TOKEN` | HuggingFace Hub token for private datasets/models. | | `TOKENIZERS_PARALLELISM` | Set to `false` to avoid tokenizer warnings (auto-set by CLI). | ================================================ FILE: docs/model_guide.md ================================================ # New Model Guide This guide may be of special interest to users who are using the library outside of the repository, via installing the library via pypi and calling `lm_eval.evaluator.evaluate()` to evaluate an existing model. In order to properly evaluate a given LM, we require implementation of a wrapper class subclassing the `lm_eval.api.model.LM` class, that defines how the Evaluation Harness should interface with your model. This guide walks through how to write this `LM` subclass via adding it to the library! 
## Setup To get started contributing, go ahead and fork the main repo, clone it, create a branch with the name of your model, and install the project requirements in your environment: ```sh # After forking... git clone https://github.com/<YOUR_USERNAME>/lm-evaluation-harness.git cd lm-evaluation-harness git checkout -b <model-type> pip install -e ".[dev]" ``` Now, we'll create a new file where we'll be adding our model: ```sh touch lm_eval/models/<my_model_filename>.py ``` **Tip: this filename should not shadow package names! For example, naming your file `anthropic.py` is disallowed since the API's name on pypi is `anthropic`, but naming it `anthropic_llms.py` works with no problems.** ## Interface All models must subclass the `lm_eval.api.model.LM` class. The LM class enforces a common interface via which we can extract responses from a model: ```python class MyCustomLM(LM): #... def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]: #... def loglikelihood_rolling(self, requests: list[Instance]) -> list[tuple[float, bool]]: #... def generate_until(self, requests: list[Instance]) -> list[str]: #... #... ``` Where `Instance` is a dataclass defined in [`lm_eval.api.instance`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/api/instance.py) with property `args` of request-dependent type signature described below. We support three types of requests, consisting of different interactions / measurements with an autoregressive LM. All three request types take as input `requests` of type `list[Instance]` that have a matching `Instance.request_type` to the method name. - `generate_until` - Each request contains `Instance.args : Tuple[str, dict]` containing 1. an input string to the LM and 2. a dictionary of keyword arguments used to control generation parameters. 
- Using this input and these generation parameters, text will be sampled from the language model (typically until a maximum output length or specific stopping string sequences--for example, `{"until": ["\n\n", "."], "max_gen_toks": 128}`). - The generated output text from the model will then be returned. - `loglikelihood` - Each request contains `Instance.args : Tuple[str, str]` containing 1. an input string to the LM and 2. a target string on which the loglikelihood of the LM producing this target, conditioned on the input, will be returned. - Each request will have, as result, `(ll, is_greedy): Tuple[float, int]` returned, where `ll` is a floating point number representing the log probability of generating the target string conditioned on the input, and `is_greedy` being either the value `0` or `1`, with it being `1` if and only if the target string *would be generated by greedy sampling from the LM* (that is, if the target string is the *most likely* N-token string to be output by the LM given the input). - `loglikelihood_rolling` - Each request contains `Instance.args : Tuple[str]`, which is an input string to the model whose *entire* loglikelihood, conditioned on purely the EOT token, will be calculated. - This is used to evaluate *perplexity* on a data distribution. - It should return `(ll,) : Tuple[float]`, a.k.a. solely the *loglikelihood* of producing each piece of text given no starting input. To allow a model to be evaluated on all types of tasks, you will need to implement these three types of measurements (note that `loglikelihood_rolling` is a special case of `loglikelihood`). For a reference implementation, check out `lm_eval/models/huggingface.py`! Additionally, check out `lm_eval.api.model.TemplateLM` for a class that abstracts away some commonly used functions across LM subclasses, or see if your model would lend itself well to subclassing the `lm_eval.models.huggingface.HFLM` class and overriding just the initialization or a couple methods! 
**Tip: be careful of indexing in loglikelihood!** LMs take in tokens in position `[0 1 2 ... N]` and output a probability distribution for token position `N+1`. We provide a simplified graphic here, excerpted from `huggingface.py`: ```text # how this all works (illustrated on a causal decoder-only setup): # CTX CONT # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] # model \ \ # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice ``` The final token of the target is not passed into the LM, because we want the LM's predictions *up to but not past* that final target token. For more information, check out https://github.com/EleutherAI/lm-evaluation-harness/issues/942 . ## Registration Congrats on implementing your model! Now it's time to test it out. To make your model usable via the command line interface to `lm-eval` using `python -m lm_eval`, you'll need to tell `lm-eval` what your model's name is. This is done via a *decorator*, `lm_eval.api.registry.register_model`. Using `register_model()`, one can both tell the package what the model's name(s) to be used are when invoking it with `python -m lm_eval --model ` and alert `lm-eval` to the model's existence. ```python from lm_eval.api.registry import register_model @register_model("", "") class MyCustomLM(LM): ``` Using this decorator results in the class being added to an accounting of the usable LM types maintained internally to the library at `lm_eval.api.registry.MODEL_REGISTRY`. See `lm_eval.api.registry` for more detail on what sorts of registries and decorators exist in the library! **Tip: be sure to import your model in `lm_eval/models/__init__.py!`** ## Testing We also recommend that new model contributions be accompanied by short tests of their 3 core functionalities, at minimum. 
To see an example of such tests, look at https://github.com/EleutherAI/lm-evaluation-harness/blob/35bdecd379c0cefad6897e67db892f4a6026a128/tests/test_ggml.py . ## Chat Templating Many models are fine-tuned with a [Chat Template](https://huggingface.co/docs/transformers/main/en/chat_templating) in order to enable back-and-forth interaction between a "User"'s queries and the model (often called "Assistant")'s responses. It can be desirable to evaluate fine-tuned models on evaluation tasks while wrapped in the conversational format they expect. In order to make your model optionally compatible with a chat format, three additional methods must be implemented: ```python class MyCustomLM(LM): #... @property def tokenizer_name(self) -> str: """ Return the name of the model's tokenizer and/or the accompanying chat template. The returned string is used to cache requests. Returns: str: The name of the model's tokenizer and/or chat template. """ def chat_template(self, chat_template: Union[bool, str] = False) -> str: """ Get the appropriate chat template for the model based on the `chat_template` argument. This method returns the chat template string to build the prompt from a chat history. The chat template is saved in the evaluation results for reproducibility. Boolean arguments should be used with models that have only one chat template, while string arguments are used with models that have multiple chat templates. For the reference implementation, see HFLM class in `lm_eval.models.huggingface`. Args: chat_template (Union[bool, str]): Specifies whether to apply a chat template: - If False: Do not apply any chat template. - If True: Apply the default chat template. - If str: Apply the specified chat template by name. Returns: str: The selected chat template in Jinja format. """ def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: """ Process a chat history to create a string that can be tokenized and input into the model. 
Args: chat_history (List[Dict[str, str]]): A list of dictionaries representing the chat history, where each dictionary has "role" and "content" keys. Returns: str: A string representing the chat history that can be tokenized and fed into the model. """ ``` - `apply_chat_template` - This method performs the bulk of the work required for chat-formatting. - As input, a `chat_history: List[Dict[str, str]]` is passed in. This is a transcript of a conversation of a form similar to ```text [ {"system": }, {"user": } {"assistant": }, # ... more few-shot examples, potentially {"user": }, ] ``` which can then be converted into a string input. - The output is a string representing this conversation that can be fed into the model. - For example, this consists of simply calling `tokenizer.apply_chat_template` for HFLM--see the implementation there for reference. - `tokenizer_name` - LM Eval Harness supports [caching requests](https://github.com/EleutherAI/lm-evaluation-harness/blob/4902aaaf1f374682f95ac25fe2e13b23faddc91a/lm_eval/__main__.py#L140) that are sent to a model, for faster setup when repeating an already-performed evaluation. - However, we don't want to use the cache of chat transcripts rendered using one chat template or system prompt to send to a model with a different template! So, we use this `lm.tokenizer_name` string to distinguish caches for a given model (and chat template) from one another. - `chat_template` - Chat templates are typically provided as a Jinja template string or a string formatted with str.format to include user and assistant messages in a single prompt. This template string is saved in the evaluation results to ensure reproducibility. If not implemented for a given model type, the flags `--apply_chat_template` , `--fewshot_as_multiturn`, and `--system_instruction` cannot be used. 
## Other **Pro tip**: In order to make the Evaluation Harness overestimate total runtimes rather than underestimate them, HuggingFace models come in-built with the ability to provide responses on data points in *descending order by total input length* via `lm_eval.utils.Reorderer`. Take a look at `lm_eval.models.huggingface.HFLM` to see how this is done, and see if you can implement it in your own model! ## Conclusion After reading this guide, you should be able to add new model APIs or implementations to the Eval Harness library! ================================================ FILE: docs/new_task_guide.md ================================================ # New Task Guide `lm-evaluation-harness` is a framework that strives to support a wide range of zero- and few-shot evaluation tasks on autoregressive language models (LMs). This documentation page provides a walkthrough to get started creating your own task, in `lm-eval` versions v0.4.0 and later. A more interactive tutorial is available as a Jupyter notebook [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/examples/lm-eval-overview.ipynb). ## Setup If you haven't already, go ahead and fork the main repo, clone it, create a branch with the name of your task, and install the project requirements in your environment: ```sh # After forking... git clone https://github.com/<YOUR-USERNAME>/lm-evaluation-harness.git cd lm-evaluation-harness git checkout -b <task-name> pip install -e ".[dev]" ``` In this document, we'll walk through the basics of implementing a static benchmark evaluation in two formats: a *generative* task which requires sampling text from a model, such as [`gsm8k`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml), and a *discriminative*, or *multiple choice*, task where the model picks the most likely of several fixed answer choices, such as [`sciq`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/sciq/sciq.yaml). 
## Creating a YAML file To implement a new standard task, we'll need to write a YAML file which configures our task logic. We start by making a new empty YAML file. This file can have any name, but we recommend placing it in a subfolder of `lm_eval/tasks` titled by the dataset or task's shorthand name: for example, ```sh touch lm_eval/tasks//.yaml ``` Or, copy the template subfolder we provide from `templates/new_yaml_task`: ```sh cp -r templates/new_yaml_task lm_eval/tasks/ ``` and rename the folders and YAML file(s) as desired. ### Selecting and configuring a dataset All data downloading and management is handled through the HuggingFace (**HF**) [`datasets`](https://github.com/huggingface/datasets) API. So, the first thing you should do is check to see if your task's dataset is already provided in their catalog [here](https://huggingface.co/datasets). If it's not in there, please consider adding it to their Hub to make it accessible to a wider user base by following their [new dataset guide](https://github.com/huggingface/datasets/blob/main/ADD_NEW_DATASET.md) . > [!TIP] > To test your task, we recommend using verbose logging using `export LMEVAL_LOG_LEVEL="DEBUG"` in your shell before running the evaluation script. This will help you debug any issues that may arise. Once you have a HuggingFace dataset prepared for your task, we want to assign our new YAML to use this dataset: ```yaml dataset_path: ... # the name of the dataset on the HF Hub. dataset_name: ... # the dataset configuration to use. Leave `null` if your dataset does not require a config to be passed. See https://huggingface.co/docs/datasets/load_hub#configurations for more info. dataset_kwargs: null # any extra keyword arguments that should be passed to the dataset constructor, e.g. `data_dir`. 
``` Next, we'd like to tell our task what the dataset's train, validation, and test splits are named, if they exist: ```yaml training_split: validation_split: test_split: ``` Tests will run on the `test_split` if it is available, and otherwise evaluate on the `validation_split`. We can also specify from which split the task should retrieve few-shot examples via: ```yaml fewshot_split: ``` or by hardcoding them, either using the following in the yaml file: ```yaml fewshot_config: sampler: first_n samples: [ {}, {}, ] ``` The full `fewshot_config` supports the following fields: ```yaml fewshot_config: sampler: default # Sampling strategy: "default" (random) or "first_n" split: train # Dataset split to draw fewshot examples from (overrides fewshot_split) samples: [...] # Hardcoded list of fewshot examples, or a callable returning them doc_to_text: "..." # Override doc_to_text for fewshot examples only doc_to_target: "..." # Override doc_to_target for fewshot examples only doc_to_choice: "..." # Override doc_to_choice for fewshot examples only gen_prefix: "Answer:" # Prefix for assistant response in fewshot examples fewshot_delimiter: "\n\n" # Delimiter between fewshot examples target_delimiter: " " # Delimiter between question and answer ``` All fields are optional. If not specified, they inherit from the parent `TaskConfig`. This allows you to format fewshot examples differently from the evaluation examples — useful when your fewshot source has different field names or requires different formatting. You can also hardcode fewshot examples by adding the function `list_fewshot_samples` in the associated utils.py file: ```python def list_fewshot_samples() -> list[dict]: return [{}, {}] ``` See `lm_eval/tasks/minerva_math/minerva_math_algebra.yaml` for an example of the latter, and `lm_eval/tasks/gsm8k/gsm8k-cot.yaml` for an example of the former. 
In this case, each sample must contain the same fields as the samples in the above sets--for example, if `doc_to_text` expects an `input` field when rendering input prompts, these provided samples must include an `input` key. If neither of the above options is set, we will default to train/validation/test sets, in that order. Finally, our dataset may not be already in the exact format we want. Maybe we have to strip whitespace and special characters via a regex from our dataset's "question" field! Or maybe we just want to rename its columns to match a convention we'll be using for our prompts. Let's create a python file in the directory where we're writing our YAML file: ```bash touch lm_eval/tasks/<dataset_name>/utils.py ``` Now, in `utils.py` we'll write a function to process each split of our dataset (the following example is drawn from [the `hellaswag` task](../lm_eval/tasks/hellaswag/utils.py)): ```python def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: def _process_doc(doc): ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize() out_doc = { "query": preprocess(doc["activity_label"] + ": " + ctx), "choices": [preprocess(ending) for ending in doc["endings"]], "gold": int(doc["label"]), } return out_doc return dataset.map(_process_doc) ``` Now, in our YAML config file we'll use the `!function` constructor, and tell the config where our imported Python function will come from. At runtime, before doing anything else we will preprocess our dataset according to this function! 
```yaml process_docs: !function utils.process_docs ``` ### Using Local Datasets To load a local dataset for evaluation, you can specify data files in the `dataset_kwargs` field, such as the following for JSON files: ```yaml dataset_path: json dataset_name: null dataset_kwargs: data_files: /path/to/my/json ``` Or with files already split into separate directories: ```yaml dataset_path: arrow dataset_kwargs: data_files: train: /path/to/arrow/train/data-00000-of-00001.arrow validation: /path/to/arrow/validation/data-00000-of-00001.arrow ``` Alternatively, if you have previously downloaded a dataset from huggingface hub (using `save_to_disk()`) and wish to use the local files, you will need to use `data_dir` under `dataset_kwargs` to point to where the directory is. ```yaml dataset_path: hellaswag dataset_kwargs: data_dir: hellaswag_local/ ``` You can also set `dataset_path` as a directory path in your local system. This will assume that there is a loading script with the same name as the directory. [See datasets docs](https://huggingface.co/docs/datasets/loading#local-loading-script). ## Writing a Prompt Template The next thing we need to do is decide what format to use when presenting the data to the LM. This is our **prompt**, where we'll define both an input and output format. To write a prompt, users will use `doc_to_text`, `doc_to_target`, and `doc_to_choice` (Optional when certain conditions are met). `doc_to_text` defines the input string a model will be given while `doc_to_target` and `doc_to_choice` will be used to generate the target text. `doc_to_target` can be either a text string that refers to the target string or an integer that refers to the index of the correct label. When it is set as an index, `doc_to_choice` must also be set with the appropriate list of possible choice strings. ### Basic prompts If a dataset is straightforward enough, users can enter the feature name directly. This assumes that no preprocessing is required. 
For example, in [Swag](https://github.com/EleutherAI/lm-evaluation-harness/blob/1710b42d52d0f327cb0eb3cb1bfbbeca992836ca/lm_eval/tasks/swag/swag.yaml#L10-L11), `doc_to_text` and `doc_to_target` are each given the name of one of the features. ```yaml doc_to_text: startphrase doc_to_target: label ``` Hard-coding is also possible as is the case in [SciQ](https://github.com/EleutherAI/lm-evaluation-harness/blob/1710b42d52d0f327cb0eb3cb1bfbbeca992836ca/lm_eval/tasks/sciq/sciq.yaml#L11). ```yaml doc_to_target: 3 ``` `doc_to_choice` can be directly given a list of text as option (See [Toxigen](https://github.com/EleutherAI/lm-evaluation-harness/blob/1710b42d52d0f327cb0eb3cb1bfbbeca992836ca/lm_eval/tasks/toxigen/toxigen.yaml#L11)) ```yaml doc_to_choice: ['No', 'Yes'] ``` If a dataset feature is already a list, you can set the name of the feature as `doc_to_choice` (See [Hellaswag](https://github.com/EleutherAI/lm-evaluation-harness/blob/e0eda4d3ffa10e5f65e0976161cd134bec61983a/lm_eval/tasks/hellaswag/hellaswag.yaml#L13)) ```yaml doc_to_choice: choices ``` ### Writing a prompt with Jinja 2 We support the [Jinja 2](https://jinja.palletsprojects.com/en/3.1.x/) templating language for writing prompts. In practice, this means you can take your dataset's columns and do many basic string manipulations to place each document into prompted format. Take for example the dataset `super_glue/boolq`. As input, we'd like to use the features `passage` and `question` and string them together so that for a sample line `doc`, the model sees something in the format of: ```text doc["passage"] Question: doc["question"]? 
Answer: ``` We do this by [writing](https://github.com/EleutherAI/lm-evaluation-harness/blob/1710b42d52d0f327cb0eb3cb1bfbbeca992836ca/lm_eval/tasks/super_glue/boolq/default.yaml#L9C1-L9C61) ```yaml doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:" ``` Such that `{{passage}}` will be replaced by `doc["passage"]` and `{{question}}` with `doc["question"]` when rendering the prompt template. Our intended output is for the model to predict a single whitespace, and then the answer to the question. We do this via: ```yaml doc_to_target: "{{answer}}" ``` #### Multiple choice format For tasks which are multiple choice (a fixed, finite set of label words per each document) and evaluated via comparing loglikelihoods of all label words (the `multiple_choice` task output type) we enforce a particular convention on prompt format. > [!WARNING] > We add `target_delimiter` between input and target which defaults to " ", such that the full input-output string is `doc_to_text(doc) + target_delimiter + doc_to_target(doc)`. `doc_to_text` and `doc_to_target` should not contain trailing right or left whitespace, respectively. For multiple choice the target will be each choice index concatenated with the delimiter. An annotated example in the case of SciQ is as follows: ```yaml doc_to_text: "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:" # This is the input portion of the prompt for this doc. It will have " {{choice}}" appended to it as target for each choice in answer_choices. doc_to_target: 3 # this contains the index into the answer choice list of the correct answer. doc_to_choice: "{{[distractor1, distractor2, distractor3, correct_answer]}}" ``` Task implementers are thus able to decide what the answer choices should be for a document, and what prompt format to use. The label index can also be sourced from a feature directly. For example in `superglue/boolq`, the label index is defined in the feature `label`. We can set `doc_to_target` as simply `label`. 
The options or verbalizers can be written in the form of a list `["no", "yes"]` that will correspond to the label index. ```yaml doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:" doc_to_target: label doc_to_choice: ["no", "yes"] ``` ### Using Python Functions for Prompts There may be cases where the prompt we want to implement is easier expressed in Python instead of Jinja 2. For this, we can use Python helper functions that are defined in the YAML config. It should be noted that the function script must be in the same directory as the yaml. A good example is WikiText that requires a lot of regex rules to clean the samples. ```python def wikitext_detokenizer(doc): string = doc["page"] # contractions string = string.replace("s '", "s'") string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) ... string = string.replace(" 's", "'s") return string ``` We can load this function in `doc_to_target` by using a `!function` operator after `doc_to_target`, followed by `<file name>.<function name>`. In the file [wikitext.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/wikitext/wikitext.yaml) we write: ```yaml doc_to_target: !function preprocess_wikitext.wikitext_detokenizer ``` ### Importing a Prompt from Promptsource [Promptsource](https://github.com/bigscience-workshop/promptsource/tree/main/promptsource) is a great repository for crowdsourced prompts for many datasets. We can load these prompts easily by using the `use_prompt` argument and filling it with the format `"promptsource:<name of prompt template>"`. To use this, `doc_to_text` and `doc_to_target` should be left undefined. This will fetch the template of the dataset defined in the YAML file. For example, for Super Glue BoolQ, if we want to use the prompt template `GPT-3 Style` we can add this to the YAML file. ```yaml use_prompt: "promptsource:GPT-3 Style" ``` If you would like to run evaluation on all prompt templates, you can simply call it this way. 
```yaml use_prompt: "promptsource:*" ``` ### Setting metrics You're almost done! Now we need to choose how to score our task. - *If this is a multiple choice task:* do you just want to check your model's accuracy in choosing the correct answer choice? - *If this is a generation task:* do you just want to check how often your model outputs *exactly the ground-truth output string provided*? If the answer to the above is no: you'll need to record what scoring metrics to use! Metrics can be listed in the following format: ```yaml metric_list: - metric: aggregation: higher_is_better: - metric: !function script.function aggregation: ... higher_is_better: ... ``` `aggregation` and `higher_is_better` can optionally be left out to default to the manually-set defaults if using a natively supported metric, otherwise it must be defined explicitly (for example, when using a custom metric implemented as a function). For a full list of natively supported metrics and aggregation functions see [`docs/task_guide.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md). All metrics supported in [HuggingFace Evaluate](https://github.com/huggingface/evaluate/tree/main/metrics) can also be used, and will be loaded if a given metric name is not one natively supported in `lm-eval` or `hf_evaluate` is set to `true`. ### Optional, More Advanced Setup Some tasks may require more advanced processing logic than is described in this guide. As a heuristic check: - Does your task require generating multiple free-form outputs per input document? - Does your task require complex, multi-step post-processing of generated model outputs? - Does your task require subsetting documents on the fly based on their content? - Do you expect to compute metrics after applying multiple such processing steps on your model outputs? - Does your task rely on metrics that need a custom implementation? 
For more detail on the task system and advanced features, see [`docs/task_guide.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md). If none of the above sounds like they apply to your task, it's time to continue onto checking your task performance! ### Task name + tags (registering a task) To test a task conveniently, it helps to *register* the task--that is, to give it a name and make the `lm-eval` library aware it exists! If you're writing your YAML file inside the `lm_eval/tasks` folder, you just need to give your task a name! You can do this inside your YAML file: ```yaml task: ``` Including a task name is mandatory. It is often also convenient to label your task with several `tag` values, though this field is optional: ```yaml tag: - tag1 - tag2 ``` This will add your task to the `tag1` and `tag2` tags, enabling people to know how to categorize your task, and if desired run all tasks in one of these groups at once, your task along with them. If your task is not in the `lm_eval/tasks` folder, you'll need to tell the Eval Harness where to look for YAML files. You can do this via the `--include_path` argument in `__main__.py`. This command will be used to initialize the `TaskManager` object which you can also use for your custom scripts. ```python task_manager = TaskManager(args.verbosity, include_path=args.include_path) ``` Passing `--tasks /path/to/yaml/file` is also accepted. ### Advanced Group Configs While `tag` values are helpful when you want to be able to quickly and conveniently run a set of related tasks via `--tasks my_tag_name`, often, we wish to implement more complex logic. For example, the MMLU benchmark contains 57 *subtasks* that must all be *averaged* together in order to report a final 'MMLU score'. 
Groupings of tasks might also use particular variants of a task--for example, we might want to default to evaluating a task as 5-shot when called as part of a given grouping, but not have a preference for number of shots when evaluating it as a standalone. We implement this via **groups**, which are distinct from tags. Groups can be implemented via *group config* YAML files, which are laid out similarly but slightly differently to tasks' YAML configs. The most basic form of group can be defined via a YAML config similar to the following: ```yaml group: nli_tasks task: - cb - anli_r1 - rte metadata: version: 1.0 ``` This will behave almost identically to a `tag` that includes these 3 tasks, but with one key distinction: we'll print the `nli_tasks` group as a row (with no associated metrics) in our table of outputs, and visually show that these 3 tasks appear under its subheader. Now, let's assume we actually want to report an aggregate score for `nli_tasks`. We would instead use a YAML config like the following: ```yaml group: nli_tasks task: - cb - anli_r1 - rte aggregate_metric_list: - metric: acc aggregation: mean weight_by_size: true # defaults to `true`. Set this to `false` to do a "macro" average (taking each subtask's average accuracy, and summing those accuracies and dividing by 3)--by default we do a "micro" average (retain all subtasks' per-document accuracies, and take the mean over all documents' accuracies to get our aggregate mean). metadata: version: 1.0 ``` Similar to our `metric_list` for listing out the metrics we want to calculate for a given task, we use an `aggregate_metric_list` field to specify which metric name to aggregate across subtasks, what aggregation function to use, and whether we should micro- or macro- average these metrics. See [./task_guide.md](./task_guide.md) for a full list of related sub-keys. 
**[!Tip]: currently, we predominantly only support the aggregation of group metrics that use `mean` (either micro- or macro- averaged) over their subtasks. If you require even more complex aggregation rules, you may want to perform aggregation offline.** Group configs can be fairly complex! We can do various operations, such as defining new subtask(s) inline in our group YAML, overriding an existing task's specific config value, or nesting existing groups within our new group. For example, let's build a config for evaluating MMLU and a few natural language inference tasks. For MMLU, we can write the name for the benchmark as a subtask written under `task`. You can configure the parameters such as `num_fewshot`. If the task being configured is a group such as `mmlu` or `super_glue`, the parameter set will be applied to all of the subtasks. ```yaml group: nli_and_mmlu task: - group: nli_tasks task: - cb - anli_r1 - rte aggregate_metric_list: - metric: acc aggregation: mean higher_is_better: true - task: mmlu num_fewshot: 2 ``` ### Configuring python classes There can be occasions when yaml-based tasks cannot accommodate how a task is handled. LM-Eval supports manually implementing tasks as Python classes, as was done before `0.4.x`. To register the task, you can simply make a yaml with the name of the task in `task` and the class object in `class` using the `!function` prefix. ```yaml task: squadv2 class: !function task.SQuAD2 ``` This also applies to building group configurations with subtasks that are python classes. ```yaml group: scrolls task: - task: scrolls_qasper class: !function task.Qasper - task: scrolls_quality class: !function task.QuALITY - task: scrolls_narrativeqa class: !function task.NarrativeQA ... ``` You can also pass a custom argument to your class by accepting `config` in the custom class constructor. 
Here's how to do it: ```yaml task: 20_newsgroups class: !function task.Unitxt recipe: card=cards.20_newsgroups,template=templates.classification.multi_class.title ``` In this example, `recipe` is the custom argument for the `Unitxt` class. ## Beautifying Table Display To avoid conflict, each task needs to be registered with a unique name. Because of this, slight variations of a task are still counted as unique tasks and need to be named uniquely. This can be done by appending an additional name component that refers to the variation, such as in MMLU, where the templates used for flan-style evaluation are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation, especially when you have a long list of tasks or are using a benchmark that comprises many tasks. To make it more legible, you can use `task_alias` and `group_alias` to provide an alternative task name and group name that will be printed. For example in `mmlu_abstract_algebra.yaml` we set `task_alias` to `abstract_algebra`. In group configs, a `group_alias` for a group can also be set. ```yaml "dataset_name": "abstract_algebra" "description": "The following are multiple choice questions (with answers) about abstract\ \ algebra.\n\n" "include": "_default_template_yaml" "task": "mmlu_abstract_algebra" "task_alias": "abstract_algebra" ``` ## Checking validity After registering your task, you can now check on your data downloading and verify that the few-shot samples look as intended. Run the following command with your desired args: ```bash python -m scripts.write_out \ --output_base_path <path> \ --tasks <your-task-name> \ --sets <train | val | test> \ --num_fewshot K \ --num_examples N \ ``` Open the file specified at the `--output_base_path <path>` and ensure it passes a simple eye test. 
## Versioning One key feature in LM Evaluation Harness is the ability to version tasks and groups--that is, mark them with a specific version number that can be bumped whenever a breaking change is made. This version info can be provided by adding the following to your new task or group config file: ```yaml metadata: version: 0 ``` Now, whenever a change needs to be made to your task in the future, please increase the version number by 1 so that users can differentiate the different task iterations and versions. If you are incrementing a task's version, please also consider adding a changelog to the task's README.md noting the date, PR number, what version you have updated to, and a one-liner describing the change. for example, - \[Dec 25, 2023\] (PR #999) Version 0.0 -> 1.0: Fixed a bug with answer extraction that led to underestimated performance. ## Checking performance + equivalence It's now time to check models' performance on your task! In the evaluation harness, we intend to support a wide range of evaluation tasks and setups, but prioritize the inclusion of already-proven benchmarks following the precise evaluation setups in the literature where possible. To enable this, we provide a checklist that should be completed when contributing a new task, to enable accurate book-keeping and to ensure that tasks added to the library are well-tested and, where applicable, precedented. ### Task Validity Checklist The checklist is the following: For adding novel benchmarks/datasets to the library: - [ ] Is the task an existing benchmark in the literature? - [ ] Have you referenced the original paper that introduced the task? - [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? If other tasks on this dataset are already supported: - [ ] Is the "Main" variant of this task clearly denoted? 
- [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? - [ ] Have you noted which, if any, published evaluation setups are matched by this variant? It is recommended to include a filled-out copy of this checklist in the README.md for the subfolder you are creating, if you have created a new subfolder in `lm_eval/tasks`. **Finally, please add a short description of your task(s), along with a link to its subfolder in lm_eval/tasks, to [`lm_eval/tasks/README.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/README.md) so that users can discover your task in the library, and follow the link to your README for more information about the variants supported, their task names, and the original source of the dataset and/or evaluation setup.** ## Submitting your task You're all set! Now push your work and make a pull request to the `main` branch! Thanks for the contribution :). If there are any questions, please leave a message in the `#lm-thunderdome` channel on the EAI discord! ================================================ FILE: docs/python-api.md ================================================ # Python API This guide covers programmatic usage of the evaluation harness in Python scripts and applications. ## Overview The library provides three main ways to run evaluations programmatically: | Function | Use Case | |----------|----------| | `simple_evaluate()` | Most common - accepts model name strings or LM objects | | `EvaluatorConfig` | Config-based - load settings from YAML or dataclass | | `evaluate()` | Low-level - full control over task dictionaries | --- ## Quick Start The simplest way to run an evaluation: ```python import lm_eval results = lm_eval.simple_evaluate( model="hf", model_args="pretrained=gpt2", tasks=["hellaswag"], ) print(results["results"]) ``` --- ## Using `simple_evaluate()` The `simple_evaluate()` function is the recommended entry point for most use cases. 
### Basic Usage ```python import lm_eval results = lm_eval.simple_evaluate( model="hf", model_args="pretrained=gpt2,dtype=float32", tasks=["hellaswag", "arc_easy"], num_fewshot=5, batch_size=8, device="cuda:0", ) ``` ### With a Pre-initialized Model ```python import lm_eval from lm_eval.models.huggingface import HFLM # Initialize model separately lm = HFLM(pretrained="gpt2", batch_size=16) results = lm_eval.simple_evaluate( model=lm, tasks=["hellaswag"], num_fewshot=0, ) ``` ### With External Tasks ```python import lm_eval from lm_eval.tasks import TaskManager # Include custom task definitions task_manager = TaskManager(include_path="/path/to/custom/tasks") results = lm_eval.simple_evaluate( model="hf", model_args="pretrained=gpt2", tasks=["my_custom_task"], task_manager=task_manager, ) ``` ### Common Parameters | Parameter | Type | Description | |-----------|------|-------------| | `model` | str or LM | Model name (e.g., "hf", "vllm") or LM instance | | `model_args` | str or dict | Model constructor arguments | | `tasks` | list[str] | Task names to evaluate | | `num_fewshot` | int | Number of few-shot examples | | `batch_size` | int or str | Batch size or "auto" | | `device` | str | Device (cuda, cpu, mps) | | `limit` | int or float | Limit examples per task | | `log_samples` | bool | Save model inputs/outputs | | `task_manager` | TaskManager | For external tasks | | `gen_kwargs` | dict | Generation arguments | | `apply_chat_template` | bool or str | Use chat template | | `system_instruction` | str | System prompt | | `fewshot_as_multiturn` | bool | Multi-turn few-shot | See [`lm_eval/evaluator.py`](../lm_eval/evaluator.py) for the complete parameter list. 
### Return Value `simple_evaluate()` returns a dictionary with: ```python { "results": { "task_name": { "metric_name": value, "metric_name,stderr": stderr_value, } }, "configs": {...}, # Task configurations "versions": {...}, # Task versions "n-shot": {...}, # Few-shot counts "higher_is_better": {...}, "n-samples": {...}, "samples": {...}, # If log_samples=True } ``` --- ## Using `EvaluatorConfig` The `EvaluatorConfig` class provides a structured way to manage evaluation settings. ### From YAML File ```python from lm_eval.config.evaluate_config import EvaluatorConfig import lm_eval # Load configuration from YAML config = EvaluatorConfig.from_config("eval_config.yaml") # Process tasks task_manager = config.process_tasks() # Run evaluation results = lm_eval.simple_evaluate( model=config.model, model_args=config.model_args, tasks=config.tasks, num_fewshot=config.num_fewshot, batch_size=config.batch_size, device=config.device, task_manager=task_manager, log_samples=config.log_samples, gen_kwargs=config.gen_kwargs, apply_chat_template=config.apply_chat_template, system_instruction=config.system_instruction, ) ``` ### Direct Instantiation ```python from lm_eval.config.evaluate_config import EvaluatorConfig config = EvaluatorConfig( model="hf", model_args={"pretrained": "gpt2", "dtype": "float32"}, tasks=["hellaswag", "arc_easy"], num_fewshot=5, batch_size=8, device="cuda:0", output_path="./results/", log_samples=True, ) # Validate and process task_manager = config.process_tasks() ``` ### Config Fields See the [Configuration Guide](config_files.md#config-schema) for all available fields. --- ## Using `evaluate()` The `evaluate()` function provides lower-level control, accepting pre-built task dictionaries. 
### With Custom Task Objects ```python import lm_eval from lm_eval.tasks import TaskManager, get_task_dict from lm_eval.models.huggingface import HFLM # Initialize model lm = HFLM(pretrained="gpt2", batch_size=16) # Build task dictionary task_manager = TaskManager(include_path="/path/to/custom/tasks") task_dict = get_task_dict( ["hellaswag", "my_custom_task"], task_manager ) # Run evaluation results = lm_eval.evaluate( lm=lm, task_dict=task_dict, num_fewshot=5, limit=100, ) ``` ### Mixed Task Sources ```python from lm_eval.tasks import get_task_dict # Combine different task sources task_dict = get_task_dict( [ "mmlu", # Stock task name "my_custom_task", # From include_path {"task": "inline_task", ...}, # Inline config dict ], task_manager ) ``` --- ## Custom Models To evaluate a custom model, create a subclass of `lm_eval.api.model.LM`: ```python from lm_eval.api.model import LM class MyCustomLM(LM): def __init__(self, model, batch_size=1): super().__init__() self.model = model self._batch_size = batch_size def loglikelihood(self, requests): # Return list of (logprob, is_greedy) tuples ... def generate_until(self, requests): # Return list of generated strings ... def loglikelihood_rolling(self, requests): # Return list of logprob floats (one total log-likelihood per request) ... @property def batch_size(self): return self._batch_size ``` Then use it with `simple_evaluate()`: ```python my_model = load_my_model() lm = MyCustomLM(model=my_model, batch_size=16) results = lm_eval.simple_evaluate( model=lm, tasks=["hellaswag"], ) ``` For detailed guidance on implementing custom models, see the [Model Guide](model_guide.md). 
--- ## Logging Configure logging for debugging: ```python from lm_eval.utils import setup_logging # Set log level setup_logging("DEBUG") # DEBUG, INFO, WARNING, ERROR # Or use environment variable import os os.environ["LMEVAL_LOG_LEVEL"] = "DEBUG" ``` --- ## Examples ### Batch Evaluation of Multiple Models ```python import lm_eval models = [ "gpt2", "gpt2-medium", "gpt2-large", ] all_results = {} for model_name in models: results = lm_eval.simple_evaluate( model="hf", model_args=f"pretrained={model_name}", tasks=["hellaswag"], batch_size="auto", ) all_results[model_name] = results["results"] ``` ### Save and Load Results ```python import json import lm_eval from lm_eval.utils import handle_non_serializable results = lm_eval.simple_evaluate( model="hf", model_args="pretrained=gpt2", tasks=["hellaswag"], ) # Save results with open("results.json", "w") as f: json.dump(results, f, default=handle_non_serializable, indent=2) ``` ================================================ FILE: docs/task_guide.md ================================================ # Task Configuration The `lm-evaluation-harness` is meant to be an extensible and flexible framework within which many different evaluation tasks can be defined. All tasks in the new version of the harness are built around a YAML configuration file format. These YAML configuration files, along with the current codebase commit hash, are intended to be shareable such that providing the YAML config enables another researcher to precisely replicate the evaluation setup used by another, in the case that the prompt or setup differs from standard `lm-eval` task implementations. While adding a standard evaluation task on a new dataset can be occasionally as simple as swapping out a Hugging Face dataset path in an existing file, more specialized evaluation setups also exist. Here we'll provide a crash course on the more advanced logic implementable in YAML form available to users. 
If your intended task relies on features beyond what is described in this guide, we'd love to hear about it! Feel free to open an issue describing the scenario on GitHub, create a PR to the project with a proposed implementation, or ask in the `#lm-thunderdome` channel on the EleutherAI Discord. ## Configurations Tasks are configured via the `TaskConfig` object. Below, we describe all fields usable within the object, and their role in defining a task. ### Parameters Task naming + registration: - **task** (`str`, defaults to None) — name of the task. - **task_alias** (`str`, defaults to None) - Alias of the task name that will be printed in the final table results. - **tag** (`str`, *optional*) — name of the task tag(s) a task belongs to. Enables one to run all tasks with a specified tag name at once. Dataset configuration options: - **dataset_path** (`str`) — The name of the dataset as listed by HF in the datasets Hub. - **dataset_name** (`str`, *optional*, defaults to None) — The name of what HF calls a “data instance” or sub-task of the benchmark. If your task does not contain any data instances, just leave this to default to None. (If you're familiar with the HF `datasets.load_dataset` function, these are just the first 2 arguments to it.) - **dataset_kwargs** (`dict`, *optional*) — Auxiliary arguments that `datasets.load_dataset` accepts. This can be used to specify arguments such as `data_files` or `data_dir` if you want to use local datafiles such as json or csv. - **custom_dataset** (`Callable`, *optional*) - A function that returns a `dict[str, datasets.Dataset]` object (a mapping from split name to dataset). This can be used to load a dataset from a custom source or to preprocess the dataset in a way that is not supported by the `datasets` library. The function will have access to the `metadata` field if defined (from config and passed to TaskManager), and `model_args` from runtime (if using `evaluate`). - **training_split** (`str`, *optional*) — Split in the dataset to use as the training split.
- **validation_split** (`str`, *optional*) — Split in the dataset to use as the validation split. - **test_split** (`str`, *optional*) — Split in the dataset to use as the test split. - **fewshot_split** (`str`, *optional*) — Split in the dataset to draw few-shot exemplars from. Must not be None if `num_fewshot` > 0. - **process_docs** (`Callable`, *optional*) — Optionally define a function to apply to each HF dataset split, to preprocess all documents before being fed into prompt template rendering or other evaluation steps. Can be used to rename dataset columns, or to process documents into a format closer to the one expected by a prompt template. Prompting / in-context formatting options: - **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. If defined, it will overwrite `doc_to_text`, `doc_to_target`, and `doc_to_choice`. - **description** (`str`, *optional*) — An optional Jinja2 template or string which will be prepended to the few-shot examples passed into the model, often describing the task or providing instructions to a model, such as `"The following are questions (with answers) about {{subject}}.\n\n"`. No delimiters or spacing are inserted between the description and the first few-shot example. - **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate input for the model. - **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into the answer choice list of the correct answer. - **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
- **fewshot_delimiter** (`str`, *optional*, defaults to `"\n\n"`) — String to insert between few-shot examples. - **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested. - **gen_prefix** (`str`, *optional*) — String to append after the `<|assistant|>` token. For example, if the task is to answer a question, the gen_prefix could be "The answer is: " to prompt the model to generate an answer to the question. If not using a chat template then this string will be appended to the end of the prompt. Runtime configuration options: - **num_fewshot** (`int`, *optional*, defaults to 0) — Number of few-shot examples before the input. - **batch_size** (`int`, *optional*, defaults to 1) — Batch size. Scoring details: - **metric_list** (`list`, *optional*, defaults to None) — A list of metrics to use for evaluation. See docs for expected format. - **output_type** (`str`, *optional*, defaults to "generate_until") — Selects the type of model output for the given task. Options are `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`. - **generation_kwargs** (`dict`, *optional*) — Auxiliary arguments for the `generate` function from HF transformers library. Advanced keyword arguments may not be supported for non-HF LM classes. - **repeats** (`int`, *optional*, defaults to 1) — Number of repeated runs through model for each sample. Can be used for cases such as self-consistency. - **filter_list** (`Union[str, list]`, *optional*) — List of filters to postprocess model outputs. See below for further detail on the filter API. - **should_decontaminate** (`bool`, *optional*, defaults to False) - Whether to decontaminate or not. - **doc_to_decontamination_query** (`str`, *optional*) — Query for decontamination if `should_decontaminate` is True.
If `should_decontaminate` is True but `doc_to_decontamination_query` is `None`, `doc_to_decontamination_query` will follow `doc_to_text`. Other: - **metadata** (`dict`, *optional*) — An optional field where arbitrary metadata can be passed. Most tasks should include a `version` key in this field that is used to denote the version of the yaml config. Other special metadata keys are: `num_fewshot`, to override the printed `n-shot` table column for a task. Will also be passed to the `custom_dataset` function if defined. ## Filters A key component of the `lm-evaluation-harness` library is the `Filter` object. In a typical evaluation run of the harness, we take the formatted inputs and run them through our LM, with the appropriate output type (greedy or free-form generation, or loglikelihood-based comparative scoring). After getting scores or output text from our LM on each `Instance` or document in the dataset, we then need to feed these responses into a metric or scoring function to return scores to a user. However, certain tasks may require more complex behavior than directly turning over model outputs to a metric function. For example, we may want to post-process our output text by truncating it or extracting a model's answer, we may want to ensemble over multiple "takes" on the same document, et cetera. **Detailed Aside**: We do such post-processing by operating on *responses*, which are stored after running an LM on an `Instance` from the task in `Instance.resps`. `resps` is a `List[str]` for each instance, and we pass a `List[List[<expected return type from model>]]` to our filters that is a list of `[instance.resps for instance in instances]`. Our filters, after completing a pipeline, must return a `List[<expected return type from model>]` which we then unpack and store each element of in `Instance.filtered_resps` for the corresponding instance. Thus, we take as input a list of returns from our model for each doc, and must return a return from our model *without it being wrapped in a list* for each doc.
**End Aside** A full list of supported filter operations can be found in `lm_eval/filters/__init__.py`. Contributions of new filter types are welcome! ### Multiple Filter Pipelines Tasks need not be limited to a single filter pipeline. We enable users to run multiple, distinct, filter pipelines on *the same model outputs* generated in one run on a task. As a case study, let's look at an implementation of solving the Gsm8k math word problem benchmark in `lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml`. Here, we are emulating the setup used by [Self-Consistency Improves Chain of Thought Prompting](https://arxiv.org/abs/2203.11171), in which evaluation is performed by generating N chain-of-thought outputs from a model via temperature-based sampling, then selecting the answers output by the model at the end of the chains of thought, then majority voting across all those numeric answers. Within our YAML file: ```yaml ... repeats: 64 filter_list: - name: "score-first" filter: - function: "regex" regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)" - function: "take_first" - name: "maj@64" filter: - function: "regex" regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)" - function: "majority_vote" - function: "take_first" - name: "maj@8" filter: - function: "take_first_k" k: 8 - function: "regex" regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)" - function: "majority_vote" - function: "take_first" ``` We are able to provide multiple different filter pipelines, each with their own name and list of filters to apply in sequence. Our first filter pipeline implements - applying a regex to the model generations (extracting the number within the phrase "The answer is (number)") - selecting only the first out of the 64 model answers Then scoring this single answer. 
```yaml - name: "score-first" filter: - function: "regex" regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)" - function: "take_first" ``` Our second filter pipeline, "maj@64", does majority voting across all 64 answers via: - applying the same regex to all responses, to get the numerical answer from the model for each of the 64 responses per problem - applying majority voting to all responses, which then returns a length-1 `[<majority answer>]` list for each - taking the first element of this length-1 list, to then score the sole response `<majority answer>` for each document. ```yaml - name: "maj@64" filter: - function: "regex" regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)" - function: "majority_vote" - function: "take_first" ``` Our final filter pipeline, "maj@8", does majority voting across the first 8 of the model's responses per document via: - subsetting the len-64 list of responses `[answer1, answer2, ..., answer64]` to `[answer1, answer2, ..., answer8]` for each document - performing the same sequence of filters on these new sets of 8 responses, for each document. ```yaml - name: "maj@8" filter: - function: "take_first_k" k: 8 - function: "regex" regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)" - function: "majority_vote" - function: "take_first" ``` Thus, given the 64 responses from our LM on each document, we can report metrics on these responses in these 3 different ways, as defined by our filter pipelines. ### Adding a custom filter Just like adding a custom model with the `register_model` decorator, one is able to do the same with filters, for example: ```python from lm_eval.api.filter import Filter from lm_eval.api.registry import register_filter @register_filter("new_filter") class NewFilter(Filter): ... ``` ## Embedded Python Code You can use Python functions for certain arguments by using the `!function` operator after the argument name, followed by `<filename>.<function_name>`. This feature can be used for the following arguments: 1. `doc_to_text` 2. `doc_to_target` 3. `doc_to_choice` 4.
`aggregation` for a `metric` in `metric_list` ## (No Longer Recommended) Direct `Task` Subclassing The prior implementation method of new tasks was to subclass `Task`. While we intend to migrate all tasks to the new YAML implementation option going forward, it remains possible to subclass the Task class and implement custom logic. For more information, see `docs/task_guide.md` in v0.3.0 of the `lm-evaluation-harness`. ## Including a Base YAML You can base a YAML on another YAML file as a template. This can be handy when you need to just change the prompt for `doc_to_text` but keep the rest the same or change `filter_list` to compare which is better. Simply use `include` in the YAML file and write the name of the template you want to base it on. This assumes that the base template is in the same directory. Otherwise, you will need to define the full path. ```yaml include: ... ``` You can find an example of how to use this feature at [gsm8k-cot-self-consistency.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml) where it is based off [gsm8k-cot.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k-cot.yaml) ## Passing Arguments to Metrics Metrics can be defined in the `metric_list` argument when building the YAML config. Multiple metrics can be listed along with any auxiliary arguments. For example, when setting the [`exact_match` metric](https://github.com/huggingface/evaluate/tree/main/metrics/exact_match), auxiliary arguments such as `ignore_case`, `ignore_punctuation`, `regexes_to_ignore` can be listed as well. They will be added to the metric function as `kwargs`. Some metrics have predefined values for `aggregation` and `higher_is_better` so listing the metric name only can be sufficient.
```yaml metric_list: - metric: acc - metric: exact_match aggregation: mean higher_is_better: true ignore_case: true ignore_punctuation: false regexes_to_ignore: - "," - "\\$" ``` ### Natively Supported Metrics Here we list all metrics currently supported natively in `lm-eval`: Metrics: - `acc` (accuracy) - `acc_norm` (length-normalized accuracy) - `acc_mutual_info` (baseline loglikelihood - normalized accuracy) - `perplexity` - `word_perplexity` (perplexity per word) - `byte_perplexity` (perplexity per byte) - `bits_per_byte` - `matthews_corrcoef` (Matthews correlation coefficient) - `f1` (F1 score) - `bleu` - `chrf` - `ter` Aggregation functions: - `mean` - `median` - `perplexity` - `weighted_perplexity` - `bits_per_byte` ### Adding a Multiple Choice Metric Adding a multiple choice metric has a few steps. To get it working you need to: 1. register a metric function 2. register an aggregation function 3. update the `Task` definition to make sure the correct arguments are passed The default metric and aggregation functions are in `lm_eval/api/metrics.py`, and you can add a function there if it's for general use. The metrics are towards the bottom of the file and look like this: ```python @register_metric( metric="mcc", higher_is_better=True, output_type="multiple_choice", aggregation="matthews_corrcoef", ) def mcc_fn(items): # This is a passthrough function return items ``` Note that many of these are passthrough functions, and for multiple choice (at least) this function is never actually called. Aggregation functions are defined towards the top of the file, here's an example: ```python @register_aggregation("matthews_corrcoef") def matthews_corrcoef(items): unzipped_list = list(zip(*items)) golds = unzipped_list[0] preds = unzipped_list[1] return sklearn.metrics.matthews_corrcoef(golds, preds) ``` This function returns a single numeric value. The input is defined in `Task.process_results` in `lm_eval/api/task.py`. 
There's a section that looks like this: ```python result_dict = { **({"acc": acc} if "acc" in use_metric else {}), **({"f1": (gold, pred)} if "f1" in use_metric else {}), **({"mcc": (gold, pred)} if "mcc" in use_metric else {}), **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}), **({"exact_match": exact_match} if "exact_match" in use_metric else {}), } ``` The value here determines the input to the aggregation function, though the name used matches the metric function. These metrics all have simple needs and just need the accuracy or gold and predicted values, but immediately below this there are examples of metrics with more complicated needs you can use as reference. ## Good Reference Tasks Contributing a new task can be daunting! Luckily, much of the work has often been done for you in a different, similarly evaluated task. Good examples of task implementations to study include: Multiple choice tasks: - SciQ (`lm_eval/tasks/sciq/sciq.yaml`) Corpus perplexity evaluations: - Wikitext (`lm_eval/tasks/wikitext/wikitext.yaml`) Generative tasks: - GSM8k (`lm_eval/tasks/gsm8k/gsm8k.yaml`) Tasks using complex filtering: - GSM8k with CoT (+ with Self-Consistency): (`lm_eval/tasks/gsm8k/gsm8k-cot.yaml` ; `lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml`) # Group Configuration When evaluating a language model, it is not unusual to test across a number of tasks that may not be related to one another in order to assess a variety of capabilities. To this end, it may be cumbersome to have to list the set of tasks or add a new group name to each yaml of each individual task. To solve this, we can create a **group** yaml config. This is a config that contains the names of the tasks that should be included in a particular group. The config consists of two main keys: a `group` key which denotes the name of the group (as it would be called from the command line, e.g. `mmlu`) and a `task` key which is where we can list the tasks. 
The tasks listed in `task` are the task names that have been registered. A good example of a group yaml config can be found at [../lm_eval/tasks/mmlu/default/_mmlu.yaml](../lm_eval/tasks/mmlu/default/_mmlu.yaml). See also the [New Task Guide](./new_task_guide.md) for a more in-depth and tutorial-esque explanation of how to write complex GroupConfigs. ## Configurations Groups are configured via the `GroupConfig` object. Below, we describe all fields usable within the object, and their role in defining a group. ### Parameters - **group** (`str`, defaults to `None`) — name of the group. Used to invoke it from the command line. - **group_alias** (`str`, defaults to `None`) - Alternative name for the group that will be printed in the table output. - **task** (`Union[str, list]`, defaults to `None`) - List of tasks that constitute the group. - **aggregate_metric_list** (`list`, defaults to `None`) - similar to `metric_list` in TaskConfigs, provide a list of configurations for metrics that should be aggregated across subtasks. Leaving empty will result in no aggregation being performed for this group. Keys for each list entry are: - `metric: str` - the name of the metric to aggregate over (all subtasks must report a metric holding this name.) - `aggregation: str` - what aggregation function to apply to aggregate these per-subtask metrics. **currently, only `mean` is supported.** - `weight_by_size: bool = True` - whether to perform micro-averaging (`True`) or macro-averaging (`False`) of subtasks' accuracy scores when reporting the group's metric. MMLU, for example, averages over per-document accuracies (the *micro average*), resulting in the same accuracy as if one simply concatenated all 57 subjects into a single dataset and evaluated accuracy on that dataset. - `filter_list: Union[str, List[str]] = "none"` - what filter keys one should match on to aggregate results.
For example, if trying to aggregate over the `exact_match` metric using `strict-match` filter for `bbh_cot_zeroshot`, then set this to be `filter_list: "strict-match"`. - **metadata** (`dict`, *optional*) - As with TaskConfigs, a field where extra config metadata can be passed. set the `num_fewshot` key within this to override the printed n_shot value in a results table for your group, for example. ================================================ FILE: examples/lm-eval-overview.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "Qw83KAePAhaS" }, "source": [ "# Releasing LM-Evaluation-Harness v0.4.0" ] }, { "cell_type": "markdown", "metadata": { "id": "Z7k2vq1iAdqr" }, "source": [ "With the vast amount of work done in the field today, it helps to have a tool that people can use easily to share their results and use to check others to ensure reported numbers are valid. The LM Evaluation Harness is one such tool the community has used extensively. We want to continue to support the community and with that in mind, we’re excited to announce a major update on the LM Evaluation Harness to further our goal for open and accessible AI research." ] }, { "cell_type": "markdown", "metadata": { "id": "0gDoM0AJAvEc" }, "source": [ "Our refactor stems from our desires to make the following believed best practices easier to carry out. \n", "\n", "1. Never copy results from other papers\n", "2. Always share your exact prompts\n", "3. Always provide model outputs\n", "4. Qualitatively review a small batch of outputs before running evaluation jobs at scale\n", "\n", "We also wanted to make the library a better experience to use and to contribute or design evaluations within. New features in the new release that serve this purpose include:\n", "\n", "1. 
Faster Evaluation Runtimes (accelerated data-parallel inference with HF Transformers + Accelerate, and commonly used or faster inference libraries such as vLLM and Llama-CPP)\n", "2. Easier addition and sharing of new tasks (YAML-based task config formats, allowing single-file sharing of custom tasks)\n", "3. More configurability, for more advanced workflows and easier operation with modifying prompts\n", "4. Better logging of data at runtime and post-hoc" ] }, { "cell_type": "markdown", "metadata": { "id": "nnwsOpjda_YW" }, "source": [ "In this notebook we will be going through a short tutorial on how things work." ] }, { "cell_type": "markdown", "metadata": { "id": "zAov81vTbL2K" }, "source": [ "## Install LM-Eval" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8hiosGzq_qZg", "outputId": "6ab73e5e-1f54-417e-a388-07e0d870b132" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting git+https://github.com/EleutherAI/lm-evaluation-harness.git@big-refactor\n", " Cloning https://github.com/EleutherAI/lm-evaluation-harness.git (to revision big-refactor) to /tmp/pip-req-build-tnssql5s\n", " Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/lm-evaluation-harness.git /tmp/pip-req-build-tnssql5s\n", " Running command git checkout -b big-refactor --track origin/big-refactor\n", " Switched to a new branch 'big-refactor'\n", " Branch 'big-refactor' set up to track remote branch 'big-refactor' from 'origin'.\n", " Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 42f486ee49b65926a444cb0620870a39a5b4b0a8\n", " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", "Collecting accelerate>=0.21.0 (from lm-eval==1.0.0)\n", " Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting evaluate (from lm-eval==1.0.0)\n", " Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting datasets>=2.0.0 (from lm-eval==1.0.0)\n", " Downloading datasets-2.15.0-py3-none-any.whl (521 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m521.2/521.2 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting jsonlines (from lm-eval==1.0.0)\n", " Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n", "Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.8.7)\n", "Collecting peft>=0.2.0 (from lm-eval==1.0.0)\n", " Downloading peft-0.6.2-py3-none-any.whl (174 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m174.7/174.7 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting pybind11>=2.6.2 (from lm-eval==1.0.0)\n", " Downloading pybind11-2.11.1-py3-none-any.whl (227 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.7/227.7 kB\u001b[0m \u001b[31m12.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting pytablewriter (from lm-eval==1.0.0)\n", " Downloading pytablewriter-1.2.0-py3-none-any.whl (111 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m111.1/111.1 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 
"\u001b[?25hCollecting rouge-score>=0.0.4 (from lm-eval==1.0.0)\n", " Downloading rouge_score-0.1.2.tar.gz (17 kB)\n", " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "Collecting sacrebleu>=1.5.0 (from lm-eval==1.0.0)\n", " Downloading sacrebleu-2.3.2-py3-none-any.whl (119 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m119.7/119.7 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (1.2.2)\n", "Collecting sqlitedict (from lm-eval==1.0.0)\n", " Downloading sqlitedict-2.1.0.tar.gz (21 kB)\n", " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: torch>=1.8 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.1.0+cu118)\n", "Collecting tqdm-multiprocess (from lm-eval==1.0.0)\n", " Downloading tqdm_multiprocess-0.0.11-py3-none-any.whl (9.8 kB)\n", "Requirement already satisfied: transformers>=4.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (4.35.2)\n", "Collecting zstandard (from lm-eval==1.0.0)\n", " Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m29.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (1.23.5)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (23.2)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (5.9.5)\n", "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from 
accelerate>=0.21.0->lm-eval==1.0.0) (6.0.1)\n", "Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (0.19.4)\n", "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (9.0.0)\n", "Collecting pyarrow-hotfix (from datasets>=2.0.0->lm-eval==1.0.0)\n", " Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n", "Collecting dill<0.3.8,>=0.3.0 (from datasets>=2.0.0->lm-eval==1.0.0)\n", " Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (1.5.3)\n", "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2.31.0)\n", "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (4.66.1)\n", "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.4.1)\n", "Collecting multiprocess (from datasets>=2.0.0->lm-eval==1.0.0)\n", " Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2023.6.0)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.8.6)\n", "Collecting responses<0.19 (from 
evaluate->lm-eval==1.0.0)\n", " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", "Requirement already satisfied: safetensors in /usr/local/lib/python3.10/dist-packages (from peft>=0.2.0->lm-eval==1.0.0) (0.4.0)\n", "Requirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==1.0.0) (1.4.0)\n", "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==1.0.0) (3.8.1)\n", "Requirement already satisfied: six>=1.14.0 in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==1.0.0) (1.16.0)\n", "Collecting portalocker (from sacrebleu>=1.5.0->lm-eval==1.0.0)\n", " Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)\n", "Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm-eval==1.0.0) (2023.6.3)\n", "Requirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm-eval==1.0.0) (0.9.0)\n", "Collecting colorama (from sacrebleu>=1.5.0->lm-eval==1.0.0)\n", " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n", "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm-eval==1.0.0) (4.9.3)\n", "Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==1.0.0) (1.11.3)\n", "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==1.0.0) (1.3.2)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==1.0.0) (3.2.0)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (3.13.1)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from 
torch>=1.8->lm-eval==1.0.0) (4.5.0)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (1.12)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (3.2.1)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (3.1.2)\n", "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (2.1.0)\n", "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.1->lm-eval==1.0.0) (0.15.0)\n", "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines->lm-eval==1.0.0) (23.1.0)\n", "Requirement already satisfied: setuptools>=38.3.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm-eval==1.0.0) (67.7.2)\n", "Collecting DataProperty<2,>=1.0.1 (from pytablewriter->lm-eval==1.0.0)\n", " Downloading DataProperty-1.0.1-py3-none-any.whl (27 kB)\n", "Collecting mbstrdecoder<2,>=1.0.0 (from pytablewriter->lm-eval==1.0.0)\n", " Downloading mbstrdecoder-1.1.3-py3-none-any.whl (7.8 kB)\n", "Collecting pathvalidate<4,>=2.3.0 (from pytablewriter->lm-eval==1.0.0)\n", " Downloading pathvalidate-3.2.0-py3-none-any.whl (23 kB)\n", "Collecting tabledata<2,>=1.3.1 (from pytablewriter->lm-eval==1.0.0)\n", " Downloading tabledata-1.3.3-py3-none-any.whl (11 kB)\n", "Collecting tcolorpy<1,>=0.0.5 (from pytablewriter->lm-eval==1.0.0)\n", " Downloading tcolorpy-0.1.4-py3-none-any.whl (7.9 kB)\n", "Collecting typepy[datetime]<2,>=1.3.2 (from pytablewriter->lm-eval==1.0.0)\n", " Downloading typepy-1.3.2-py3-none-any.whl (31 kB)\n", "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (3.3.2)\n", "Requirement already satisfied: 
multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (6.0.4)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (4.0.3)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (1.9.2)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (1.4.0)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (1.3.1)\n", "Requirement already satisfied: chardet<6,>=3.0.4 in /usr/local/lib/python3.10/dist-packages (from mbstrdecoder<2,>=1.0.0->pytablewriter->lm-eval==1.0.0) (5.2.0)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==1.0.0) (3.4)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==1.0.0) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==1.0.0) (2023.7.22)\n", "Requirement already satisfied: python-dateutil<3.0.0,>=2.8.0 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm-eval==1.0.0) (2.8.2)\n", "Requirement already satisfied: pytz>=2018.9 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm-eval==1.0.0) (2023.3.post1)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.8->lm-eval==1.0.0) (2.1.3)\n", "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from 
nltk->rouge-score>=0.0.4->lm-eval==1.0.0) (8.1.7)\n", "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.8->lm-eval==1.0.0) (1.3.0)\n", "Building wheels for collected packages: lm-eval, rouge-score, sqlitedict\n", " Building wheel for lm-eval (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for lm-eval: filename=lm_eval-1.0.0-py3-none-any.whl size=994254 sha256=88356155b19f2891981ecef948326ad6ce8ca40a6009378410ec20d0e225995a\n", " Stored in directory: /tmp/pip-ephem-wheel-cache-9v6ye7h3/wheels/17/01/26/599c0779e9858a70a73fa8a306699b5b9a868f820c225457b0\n", " Building wheel for rouge-score (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=6bb0d44e4881972c43ce194e7cb65233d309758cb15f0dec54590d3d2efcfc36\n", " Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n", " Building wheel for sqlitedict (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", " Created wheel for sqlitedict: filename=sqlitedict-2.1.0-py3-none-any.whl size=16863 sha256=5747f7dd73ddf3d8fbcebf51b5e4f718fabe1e94bccdf16d2f22a2e65ee7fdf4\n", " Stored in directory: /root/.cache/pip/wheels/79/d6/e7/304e0e6cb2221022c26d8161f7c23cd4f259a9e41e8bbcfabd\n", "Successfully built lm-eval rouge-score sqlitedict\n", "Installing collected packages: sqlitedict, zstandard, tcolorpy, pybind11, pyarrow-hotfix, portalocker, pathvalidate, mbstrdecoder, jsonlines, dill, colorama, typepy, tqdm-multiprocess, sacrebleu, rouge-score, responses, multiprocess, accelerate, datasets, DataProperty, tabledata, peft, evaluate, pytablewriter, lm-eval\n", "Successfully installed DataProperty-1.0.1 accelerate-0.24.1 colorama-0.4.6 datasets-2.15.0 dill-0.3.7 evaluate-0.4.1 jsonlines-4.0.0 lm-eval-1.0.0 mbstrdecoder-1.1.3 multiprocess-0.70.15 pathvalidate-3.2.0 peft-0.6.2 portalocker-2.8.2 pyarrow-hotfix-0.6 pybind11-2.11.1 pytablewriter-1.2.0 responses-0.18.0 rouge-score-0.1.2 sacrebleu-2.3.2 sqlitedict-2.1.0 tabledata-1.3.3 tcolorpy-0.1.4 tqdm-multiprocess-0.0.11 typepy-1.3.2 zstandard-0.22.0\n" ] } ], "source": [ "# Install LM-Eval\n", "!pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 0, "referenced_widgets": [ "a1d3a8aa016544a78e8821c8f6199e06", "f61ed33fad754146bdd2ac9db1ba1c48", "bfa0af6aeff344c6845e1080a878e92e", "fd1ad9e0367d4004aae853b91c3a7617", "6b2d90209ec14230b3d58a74ac9b83bf", "a73f357065d34d7baf0453ae4a8d75e2", "46f521b73fd943c081c648fd873ebc0a", "7c5689bc13684db8a22681f41863dddd", "48763b6233374554ae76035c0483066f", "4986a21eb560448fa79f4b25cde48951", "aed3acd2f2d74003b44079c333a0698e" ] }, "id": "uyO5MaKkZyah", "outputId": "d46e8096-5086-4e49-967e-ea33d4a2a335" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a1d3a8aa016544a78e8821c8f6199e06", 
"version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading builder script: 0%| | 0.00/5.67k [00:00\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "fthNg3ywO-kA" }, "outputs": [], "source": [ "YAML_cola_string = \"\"\"\n", "tag: yes_or_no_tasks\n", "task: demo_cola\n", "dataset_path: glue\n", "dataset_name: cola\n", "output_type: multiple_choice\n", "training_split: train\n", "validation_split: validation\n", "doc_to_text: \"{{sentence}}\\nQuestion: Does this sentence make sense?\\nAnswer:\"\n", "doc_to_target: label\n", "doc_to_choice: [\"no\", \"yes\"]\n", "should_decontaminate: true\n", "doc_to_decontamination_query: sentence\n", "metric_list:\n", " - metric: acc\n", "\"\"\"\n", "with open(\"cola.yaml\", \"w\") as f:\n", " f.write(YAML_cola_string)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "XceRKCuuDtbn" }, "outputs": [], "source": "# !accelerate launch --no_python\n%env LMEVAL_LOG_LEVEL=DEBUG\n!lm_eval \\\n --model hf \\\n --model_args pretrained=EleutherAI/pythia-2.8b \\\n --include_path ./ \\\n --tasks yes_or_no_tasks \\\n --limit 10 \\\n --output output/yes_or_no_tasks/ \\\n --log_samples" }, { "cell_type": "markdown", "metadata": { "id": "XceRKCuuDtbn" }, "source": [ "## Edit Prompt Templates Quickly\n", "\n", "The following is a yaml made to evaluate the specific subtask of `high_school_geography` from MMLU. It uses the standard prompt where the we choose the letters from the options with most likelihood as the model's prediction." 
] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "GTFvdt9kSlBG" }, "outputs": [], "source": [ "YAML_mmlu_geo_string = \"\"\"\n", "task: demo_mmlu_high_school_geography\n", "dataset_path: cais/mmlu\n", "dataset_name: high_school_geography\n", "description: \"The following are multiple choice questions (with answers) about high school geography.\\n\\n\"\n", "test_split: test\n", "fewshot_split: dev\n", "fewshot_config:\n", " sampler: first_n\n", "output_type: multiple_choice\n", "doc_to_text: \"{{question.strip()}}\\nA. {{choices[0]}}\\nB. {{choices[1]}}\\nC. {{choices[2]}}\\nD. {{choices[3]}}\\nAnswer:\"\n", "doc_to_choice: [\"A\", \"B\", \"C\", \"D\"]\n", "doc_to_target: answer\n", "metric_list:\n", " - metric: acc\n", " aggregation: mean\n", " higher_is_better: true\n", " - metric: acc_norm\n", " aggregation: mean\n", " higher_is_better: true\n", "\"\"\"\n", "with open(\"mmlu_high_school_geography.yaml\", \"w\") as f:\n", " f.write(YAML_mmlu_geo_string)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jyKOfCsKb-xy" }, "outputs": [], "source": "# !accelerate launch --no_python\n%env LMEVAL_LOG_LEVEL=DEBUG\n!lm_eval \\\n --model hf \\\n --model_args pretrained=EleutherAI/pythia-2.8b \\\n --include_path ./ \\\n --tasks demo_mmlu_high_school_geography \\\n --limit 10 \\\n --output output/mmlu_high_school_geography/ \\\n --log_samples" }, { "cell_type": "markdown", "metadata": { "id": "jyKOfCsKb-xy" }, "source": [ "We could also evaluate this task in a different way. For example, instead of observing the loglikelihood of the letters, we can instead evaluate on the choices themselves as the continuation. This is done by simply changing `doc_to_choice` from a list of letters to the corresponding `choices` field from the HF dataset. 
We write `\"{{choices}}\"` so that the string field is interpreted as a jinja string that acquires the list from the HF dataset directly.\n", "\n", "Another convenient feature here is that, since we're only modifying `doc_to_choice` and the rest of the config is the same as the task above, we can use the above configuration as a template by using `include: mmlu_high_school_geography.yaml` to load the config from that file. We'll need to add a unique task name so as not to collide with the existing yaml config we're including. For this case we'll simply name this one `demo_mmlu_high_school_geography_continuation`. `doc_to_text` is added here just for the sake of clarity." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "id": "lqElwU54TaK-" }, "outputs": [], "source": [ "YAML_mmlu_geo_string = \"\"\"\n", "include: mmlu_high_school_geography.yaml\n", "task: demo_mmlu_high_school_geography_continuation\n", "doc_to_text: \"{{question.strip()}}\\nA. {{choices[0]}}\\nB. {{choices[1]}}\\nC. {{choices[2]}}\\nD. {{choices[3]}}\\nAnswer:\"\n", "doc_to_choice: \"{{choices}}\"\n", "\"\"\"\n", "with open(\"mmlu_high_school_geography_continuation.yaml\", \"w\") as f:\n", " f.write(YAML_mmlu_geo_string)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "-_CVnDirdy7j" }, "outputs": [], "source": "# !accelerate launch --no_python\n%env LMEVAL_LOG_LEVEL=DEBUG\n!lm_eval \\\n --model hf \\\n --model_args pretrained=EleutherAI/pythia-2.8b \\\n --include_path ./ \\\n --tasks demo_mmlu_high_school_geography_continuation \\\n --limit 10 \\\n --output output/mmlu_high_school_geography_continuation/ \\\n --log_samples" }, { "cell_type": "markdown", "metadata": { "id": "-_CVnDirdy7j" }, "source": [ "If we take a look at the samples, we can see that it is in fact evaluating the continuation based on the choices rather than the letters." 
] }, { "cell_type": "code", "execution_count": 11, "metadata": { "id": "duBDqC6PAdjL" }, "outputs": [ { "data": { "application/javascript": "\n ((filepath) => {{\n if (!google.colab.kernel.accessAllowed) {{\n return;\n }}\n google.colab.files.view(filepath);\n }})(\"/content/output/mmlu_high_school_geography_continuation/pretrained__EleutherAI__pythia-2.8b_demo_mmlu_high_school_geography_continuation.jsonl\")", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from google.colab import files\n", "\n", "\n", "files.view(\n", " \"output/mmlu_high_school_geography_continuation/pretrained__EleutherAI__pythia-2.8b_demo_mmlu_high_school_geography_continuation.jsonl\"\n", ")" ] }, { "cell_type": "markdown", "metadata": { "id": "6p0-KPwAgK5j" }, "source": [ "## Closer Look at YAML Fields\n", "\n", "To prepare a task we can simply fill in a YAML config with the relevant information.\n", "\n", "`output_type`\n", "The currently provided evaluation types comprise the following:\n", "1. `loglikelihood`: Evaluates the loglikelihood of a continuation, conditioned on some input string.\n", "2. `loglikelihood_rolling`: Evaluates the loglikelihood of producing a string, conditioned on the empty string. (Used for perplexity evaluations)\n", "3. `multiple_choice`: Evaluates loglikelihood among a number of choices predicted by the model.\n", "4. `generate_until`: Model outputs greedy generation (can be configured to use beam search and other generation-related parameters)\n", "\n", "The core prompt revolves around 3 fields.\n", "1. `doc_to_text`: Denotes the prompt template that will be used as input to the model.\n", "2. `doc_to_choice`: Available choices that will be used as continuation for the model. This is used when the `output_type` is `multiple_choice`, and otherwise can be left as `None`.\n", "3. 
`doc_to_target`: When `output_type` is `multiple_choice`, this can be an index that corresponds to the correct answer, or the answer string itself (must be one of the entries in `doc_to_choice`). For other tasks, this is expected to be a string. You can fill this field with a feature name from the HF dataset so long as the resulting feature follows the conditions described.\n", "\n", "These three fields can be expressed as strings, column names from the source dataset, or as Jinja2 templates that can use fields from the source dataset as variables.\n" ] }, { "cell_type": "markdown", "metadata": { "id": "6p0-KPwAgK5j" }, "source": [ "## What if Jinja is not Sufficient?\n", "\n", "There can be times where the Jinja2 templating language is not enough to make the prompt we had in mind. There are a few ways to circumvent this limitation:\n", "\n", "1. Use the `!function` operator for the prompt-related fields to pass a python function that takes as input the dataset row, and will output the prompt template component.\n", "2. Perform a transformation on the dataset beforehand." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Below, we show an example of using `!function` to create `doc_to_text` from a python function:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "DYZ5c0JhR1lJ", "outputId": "ca945235-fb9e-4f17-8bfa-78e7d6ec1490" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2023-11-29:11:59:08,312 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n", "2023-11-29 11:59:09.348327: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", "2023-11-29 11:59:09.348387: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", "2023-11-29 11:59:09.348421: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", "2023-11-29 11:59:10.573752: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", "2023-11-29:11:59:14,044 INFO [__main__.py:132] Verbosity set to INFO\n", "2023-11-29:11:59:23,654 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n", "2023-11-29:11:59:23,654 INFO [__main__.py:143] Including path: ./\n", "2023-11-29:11:59:23,678 INFO [__main__.py:205] Selected Tasks: ['demo_mmlu_high_school_geography_function_prompt']\n", "2023-11-29:11:59:23,679 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n", "2023-11-29:11:59:23,708 INFO [huggingface.py:120] Using device 'cuda'\n", "2023-11-29:11:59:44,516 INFO [task.py:355] Building contexts for task on 
rank 0...\n", "2023-11-29:11:59:44,524 INFO [evaluator.py:319] Running loglikelihood requests\n", "100% 40/40 [00:02<00:00, 15.41it/s]\n", "fatal: not a git repository (or any of the parent directories): .git\n", "hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n", "| Tasks |Version|Filter|n-shot| Metric |Value| |Stderr|\n", "|-----------------------------------------------|-------|------|-----:|--------|----:|---|-----:|\n", "|demo_mmlu_high_school_geography_function_prompt|Yaml |none | 0|acc | 0.1|± |0.1000|\n", "| | |none | 0|acc_norm| 0.2|± |0.1333|\n", "\n" ] } ], "source": [ "YAML_mmlu_geo_string = \"\"\"\n", "include: mmlu_high_school_geography.yaml\n", "task: demo_mmlu_high_school_geography_function_prompt\n", "doc_to_text: !function utils.doc_to_text\n", "doc_to_choice: \"{{choices}}\"\n", "\"\"\"\n", "with open(\"demo_mmlu_high_school_geography_function_prompt.yaml\", \"w\") as f:\n", " f.write(YAML_mmlu_geo_string)\n", "\n", "DOC_TO_TEXT = \"\"\"\n", "def doc_to_text(x):\n", " question = x[\"question\"].strip()\n", " choices = x[\"choices\"]\n", " option_a = choices[0]\n", " option_b = choices[1]\n", " option_c = choices[2]\n", " option_d = choices[3]\n", " return f\"{question}\\\\nA. {option_a}\\\\nB. {option_b}\\\\nC. {option_c}\\\\nD. 
{option_d}\\\\nAnswer:\"\n", "\"\"\"\n", "with open(\"utils.py\", \"w\") as f:\n", " f.write(DOC_TO_TEXT)\n", "\n", "!lm_eval \\\n", " --model hf \\\n", " --model_args pretrained=EleutherAI/pythia-2.8b \\\n", " --include_path ./ \\\n", " --tasks demo_mmlu_high_school_geography_function_prompt \\\n", " --limit 10 \\\n", " --output output/demo_mmlu_high_school_geography_function_prompt/ \\\n", " --log_samples" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, we'll also show how to do this via preprocessing the dataset as necessary using the `process_docs` config field:\n", "\n", "We will write a function that will modify each document in our evaluation dataset's split to add a field that is suitable for us to use in `doc_to_text`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "YAML_mmlu_geo_string = \"\"\"\n", "include: mmlu_high_school_geography.yaml\n", "task: demo_mmlu_high_school_geography_function_prompt_2\n", "process_docs: !function utils_process_docs.process_docs\n", "doc_to_text: \"{{input}}\"\n", "doc_to_choice: \"{{choices}}\"\n", "\"\"\"\n", "with open(\"demo_mmlu_high_school_geography_process_docs.yaml\", \"w\") as f:\n", " f.write(YAML_mmlu_geo_string)\n", "\n", "DOC_TO_TEXT = \"\"\"\n", "def process_docs(dataset):\n", " def _process_doc(x):\n", " question = x[\"question\"].strip()\n", " choices = x[\"choices\"]\n", " option_a = choices[0]\n", " option_b = choices[1]\n", " option_c = choices[2]\n", " option_d = choices[3]\n", " doc[\"input\"] = f\"{question}\\\\nA. {option_a}\\\\nB. {option_b}\\\\nC. {option_c}\\\\nD. 
{option_d}\\\\nAnswer:\"\n", " return out_doc\n", "\n", " return dataset.map(_process_doc)\n", "\"\"\"\n", "\n", "with open(\"utils_process_docs.py\", \"w\") as f:\n", " f.write(DOC_TO_TEXT)\n", "\n", "!lm_eval \\\n", " --model hf \\\n", " --model_args pretrained=EleutherAI/pythia-2.8b \\\n", " --include_path ./ \\\n", " --tasks demo_mmlu_high_school_geography_function_prompt_2 \\\n", " --limit 10 \\\n", " --output output/demo_mmlu_high_school_geography_function_prompt_2/ \\\n", " --log_samples" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We hope that this explainer gives you a sense of what can be done with and how to work with LM-Evaluation-Harnes v0.4.0 ! \n", "\n", "For more information, check out our documentation pages in the `docs/` folder, and if you have questions, please raise them in GitHub issues, or in #lm-thunderdome or #release-discussion on the EleutherAI discord server." ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [ "zAov81vTbL2K" ], "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": { "46f521b73fd943c081c648fd873ebc0a": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "48763b6233374554ae76035c0483066f": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": 
"@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "4986a21eb560448fa79f4b25cde48951": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "6b2d90209ec14230b3d58a74ac9b83bf": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, 
"grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "7c5689bc13684db8a22681f41863dddd": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a1d3a8aa016544a78e8821c8f6199e06": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", 
"children": [ "IPY_MODEL_f61ed33fad754146bdd2ac9db1ba1c48", "IPY_MODEL_bfa0af6aeff344c6845e1080a878e92e", "IPY_MODEL_fd1ad9e0367d4004aae853b91c3a7617" ], "layout": "IPY_MODEL_6b2d90209ec14230b3d58a74ac9b83bf" } }, "a73f357065d34d7baf0453ae4a8d75e2": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "aed3acd2f2d74003b44079c333a0698e": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "bfa0af6aeff344c6845e1080a878e92e": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": 
"@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_7c5689bc13684db8a22681f41863dddd", "max": 5669, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_48763b6233374554ae76035c0483066f", "value": 5669 } }, "f61ed33fad754146bdd2ac9db1ba1c48": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a73f357065d34d7baf0453ae4a8d75e2", "placeholder": "​", "style": "IPY_MODEL_46f521b73fd943c081c648fd873ebc0a", "value": "Downloading builder script: 100%" } }, "fd1ad9e0367d4004aae853b91c3a7617": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_4986a21eb560448fa79f4b25cde48951", "placeholder": "​", "style": "IPY_MODEL_aed3acd2f2d74003b44079c333a0698e", "value": " 5.67k/5.67k [00:00<00:00, 205kB/s]" } } }, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 0 } ================================================ FILE: examples/transformer-lens.py ================================================ import warnings import 
torch import torch.nn as nn from transformer_lens import HookedTransformer from transformers import AutoConfig from lm_eval import evaluator from lm_eval.models.huggingface import HFLM def evaluate_lm_eval(lens_model: HookedTransformer, tasks: list[str], **kwargs): class HFLikeModelAdapter(nn.Module): """Adapts HookedTransformer to match the HuggingFace interface expected by lm-eval""" def __init__(self, model: HookedTransformer): super().__init__() self.model = model self.tokenizer = model.tokenizer self.config = AutoConfig.from_pretrained(model.cfg.tokenizer_name) self.device = model.cfg.device self.tie_weights = lambda: self def forward(self, input_ids=None, attention_mask=None, **kwargs): output = self.model(input_ids, attention_mask=attention_mask, **kwargs) # Make sure output has the expected .logits attribute if not hasattr(output, "logits"): if isinstance(output, torch.Tensor): output.logits = output return output # Only delegate specific attributes we know we need def to(self, *args, **kwargs): return self.model.to(*args, **kwargs) def eval(self): self.model.eval() return self def train(self, mode=True): self.model.train(mode) return self model = HFLikeModelAdapter(lens_model) warnings.filterwarnings("ignore", message="Failed to get model SHA for") results = evaluator.simple_evaluate( model=HFLM(pretrained=model, tokenizer=model.tokenizer), tasks=tasks, verbosity="WARNING", **kwargs, ) return results if __name__ == "__main__": # Load base model model = HookedTransformer.from_pretrained("pythia-70m") res = evaluate_lm_eval(model, tasks=["arc_easy"]) print(res["results"]) ================================================ FILE: examples/visualize-wandb.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "fc477b96-adee-4829-a9d7-a5eb990df358", "metadata": {}, "source": [ "# Visualizing Results in Weights and Biases\n", "\n", "With the Weights and Biases integration, you can now spend more time extracting deeper 
insights into your evaluation results. The integration is designed to streamline the process of logging and visualizing experiment results using the Weights & Biases (W&B) platform.\n", "\n", "The integration provides functionalities\n", "\n", "- to automatically log the evaluation results,\n", "- log the samples as W&B Tables for easy visualization,\n", "- log the `results.json` file as an artifact for version control,\n", "- log the `_eval_samples.json` file if the samples are logged,\n", "- generate a comprehensive report for analysis and visualization with all the important metrics,\n", "- log task and CLI configs,\n", "- and more out of the box like the command used to run the evaluation, GPU/CPU counts, timestamp, etc.\n", "\n", "The integration is super easy to use with the eval harness. Let's see how!" ] }, { "cell_type": "code", "execution_count": null, "id": "3851439a-bff4-41f2-bf21-1b3d8704913b", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Install this project if you did not already have it.\n", "# This is all that is needed to be installed to start using Weights and Biases\n", "\n", "!pip -qq install -e ..[wandb]" ] }, { "cell_type": "markdown", "id": "8507fd7e-3b99-4a92-89fa-9eaada74ba91", "metadata": {}, "source": [ "# Run the Eval Harness\n", "\n", "Run the eval harness as usual with a `wandb_args` flag. This flag is used to provide arguments for initializing a wandb run ([wandb.init](https://docs.wandb.ai/ref/python/init)) as comma-separated string arguments.\n", "\n", "If the `wandb_args` flag is used, the metrics and all other goodness will be automatically logged to Weights and Biases. In the stdout, you will find the link to the W&B run page as well as a link to the generated report." ] }, { "cell_type": "markdown", "id": "eec5866e-f01e-42f8-8803-9d77472ef991", "metadata": {}, "source": [ "## Set your API Key\n", "\n", "Before you can use W&B, you need to authenticate your machine with an authentication key.
Visit https://wandb.ai/authorize to get one." ] }, { "cell_type": "code", "execution_count": null, "id": "d824d163-71a9-4313-935d-f1d56397841c", "metadata": {}, "outputs": [], "source": [ "import wandb\n", "\n", "\n", "wandb.login()" ] }, { "cell_type": "markdown", "id": "124e4a34-1547-4bed-bc09-db012bacbda6", "metadata": {}, "source": [ "> Note that if you are using command line you can simply authenticate your machine by doing `wandb login` in your terminal. For more info check out the [documentation](https://docs.wandb.ai/quickstart#2-log-in-to-wb)." ] }, { "cell_type": "markdown", "id": "abc6f6b6-179a-4aff-ada9-f380fb74df6e", "metadata": {}, "source": [ "## Run and log to W&B" ] }, { "cell_type": "code", "execution_count": null, "id": "bd0a8130-a97b-451a-acd2-3f9885b88643", "metadata": {}, "outputs": [], "source": [ "!lm_eval \\\n", " --model hf \\\n", " --model_args pretrained=microsoft/phi-2,trust_remote_code=True \\\n", " --tasks hellaswag,mmlu_abstract_algebra \\\n", " --device cuda:0 \\\n", " --batch_size 8 \\\n", " --output_path output/phi-2 \\\n", " --limit 10 \\\n", " --wandb_args project=lm-eval-harness-integration \\\n", " --log_samples" ] }, { "cell_type": "markdown", "id": "e974cabdbe70b667", "metadata": {}, "source": [] }, { "cell_type": "markdown", "id": "5178ca9445b844e4", "metadata": {}, "source": [ "W&B can also be initialized programmatically for use outside the CLI to parse and log the results." ] }, { "cell_type": "code", "execution_count": null, "id": "c6a421b2cf3ddac5", "metadata": {}, "outputs": [], "source": [ "import lm_eval\n", "from lm_eval.loggers import WandbLogger\n", "\n", "\n", "results = lm_eval.simple_evaluate(\n", " model=\"hf\",\n", " model_args=\"pretrained=microsoft/phi-2,trust_remote_code=True\",\n", " tasks=\"hellaswag,mmlu_abstract_algebra\",\n", " log_samples=True,\n", ")\n", "\n", "wandb_logger = WandbLogger(\n", " project=\"lm-eval-harness-integration\", job_type=\"eval\"\n", ") # or empty if wandb.init(...) 
already called before\n", "wandb_logger.post_init(results)\n", "wandb_logger.log_eval_result()\n", "wandb_logger.log_eval_samples(results[\"samples\"]) # if log_samples" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: examples/visualize-zeno.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Visualizing Results in Zeno\n", "\n", "Benchmarking your models is the first step towards making sure your model performs well.\n", "However, looking at the data behind the benchmark, slicing the data into subsets, and comparing models on individual instances can help you even more in evaluating and quantifying the behavior of your AI system.\n", "\n", "All of this can be done in [Zeno](https://zenoml.com)!\n", "Zeno is super easy to use with the eval harness, let's explore how you can easily upload and visualize your eval results.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Install this project if you did not already do that. This is all that needs to be installed for you to be able to visualize your data in Zeno!\n", "!pip install -e ..\n", "!pip install -e ..[zeno]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Run the Eval Harness\n", "\n", "To visualize the results, run the eval harness with the `log_samples` and `output_path` flags. We expect `output_path` to contain multiple folders that represent individual model names. 
You can thus run your evaluation on any number of tasks and models and upload all of the results as projects on Zeno.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!lm_eval \\\n", " --model hf \\\n", " --model_args pretrained=EleutherAI/gpt-neo-2.7B \\\n", " --tasks hellaswag,wikitext \\\n", " --batch_size 8 \\\n", " --device mps \\\n", " --log_samples \\\n", " --output_path output/gpt-neo-2.7B \\\n", " --limit 10" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Set your API Key\n", "\n", "This is so you can be authenticated with Zeno.\n", "If you don't already have a Zeno account, first create an account on [Zeno Hub](https://hub.zenoml.com).\n", "After logging in to Zeno Hub, generate your API key by clicking on your profile at the bottom left to navigate to your account page.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%env ZENO_API_KEY=YOUR_API_KEY" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Visualize Eval Results\n", "\n", "You can now use the `zeno_visualize` script to upload the results to Zeno.\n", "\n", "This will use all subfolders in `data_path` as different models and upload all tasks within these model folders to Zeno. 
def __getattr__(name):
    """Resolve the lazily exported names ``evaluate`` and ``simple_evaluate``.

    Module-level ``__getattr__`` hook: the ``.evaluator`` import is deferred
    until one of these attributes is first requested, which keeps importing
    the package (and so CLI startup) cheap.

    Raises:
        AttributeError: if ``name`` is not one of the lazily exported names.
    """
    if name not in ("evaluate", "simple_evaluate"):
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    if name == "evaluate":
        from .evaluator import evaluate as resolved
    else:
        from .evaluator import simple_evaluate as resolved
    return resolved
setup_logging() parser = HarnessCLI() args = parser.parse_args() parser.execute(args) if __name__ == "__main__": cli_evaluate() ================================================ FILE: lm_eval/_cli/__init__.py ================================================ """ CLI subcommands to run from the terminal. """ from .harness import HarnessCLI __all__ = ["HarnessCLI"] ================================================ FILE: lm_eval/_cli/harness.py ================================================ import argparse import sys import textwrap from lm_eval._cli.ls import List from lm_eval._cli.run import Run from lm_eval._cli.validate import Validate class HarnessCLI: """Main CLI parser that manages all subcommands.""" def __init__(self): self._parser = argparse.ArgumentParser( prog="lm-eval", description="Language Model Evaluation Harness", epilog=textwrap.dedent(""" quick start: # Basic evaluation lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag # List available tasks lm-eval ls tasks # Validate task configurations lm-eval validate --tasks hellaswag,arc_easy legacy compatibility: The harness maintains backward compatibility with the original interface. 
class List(SubCommand):
    """Command for listing available tasks.

    Registers the ``ls`` subcommand on the given subparsers object and prints
    the selected listing from ``TaskManager.list_all_tasks``.
    """

    # Keyword arguments handed to ``TaskManager.list_all_tasks`` for each
    # accepted value of the positional ``what`` argument.
    _LISTING_FILTERS = {
        "tasks": {},
        "groups": {"list_subtasks": False, "list_tags": False},
        "subtasks": {"list_groups": False, "list_tags": False},
        "tags": {"list_groups": False, "list_subtasks": False},
    }

    def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs):
        """Create the ``ls`` parser, add its arguments and bind ``_execute``."""
        # Create and configure the parser
        super().__init__(*args, **kwargs)
        self._parser = subparsers.add_parser(
            "ls",
            help="List available tasks, groups, subtasks, or tags",
            description="List available tasks, groups, subtasks, or tags from the evaluation harness.",
            # The subcommand is registered as "ls"; the usage line must say so.
            usage="lm-eval ls [tasks|groups|subtasks|tags] [--include_path DIR]",
            epilog=textwrap.dedent("""
                examples:
                  # List all available tasks (includes groups, subtasks, and tags)
                  $ lm-eval ls tasks

                  # List only task groups (like 'mmlu', 'glue', 'superglue')
                  $ lm-eval ls groups

                  # List only individual subtasks (like 'mmlu_abstract_algebra')
                  $ lm-eval ls subtasks

                  # Include external task definitions
                  $ lm-eval ls tasks --include_path /path/to/external/tasks

                  # List tasks from multiple external paths
                  $ lm-eval ls tasks --include_path "/path/to/tasks1:/path/to/tasks2"

                organization:
                  • Groups: Collections of tasks with aggregated metric across subtasks (e.g., 'mmlu')
                  • Subtasks: Individual evaluation tasks (e.g., 'mmlu_anatomy', 'hellaswag')
                  • Tags: Similar to groups but no aggregate metric (e.g., 'reasoning', 'knowledge', 'language')
                  • External Tasks: Custom tasks defined in external directories

                evaluation usage:
                  After listing tasks, use them with the run command!

                For more information tasks configs are defined in https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks
            """),
            formatter_class=argparse.RawDescriptionHelpFormatter,
        )
        self._add_args()
        self._parser.set_defaults(func=self._execute)

    def _add_args(self) -> None:
        """Register the optional ``what`` positional and ``--include_path``."""
        self._parser.add_argument(
            "what",
            choices=["tasks", "groups", "subtasks", "tags"],
            nargs="?",
            help="What to list: tasks (all), groups, subtasks, or tags",
        )
        self._parser.add_argument(
            "--include_path",
            type=str,
            default=None,
            metavar="DIR",
            help="Additional path to include if there are external tasks.",
        )

    def _execute(self, args: argparse.Namespace) -> None:
        """Execute the list command.

        Prints the subcommand help when ``what`` was omitted; otherwise
        prints the listing selected by ``what``.
        """
        if args.what is None:
            # Nothing to list: show help without building a TaskManager.
            self._parser.print_help()
            return

        filters = self._LISTING_FILTERS.get(args.what)
        if filters is None:
            # Not reachable through argparse (``choices`` restricts the value).
            return

        # Imported here rather than at module level, as in the original.
        from lm_eval.tasks import TaskManager

        task_manager = TaskManager(include_path=args.include_path)
        print(task_manager.list_all_tasks(**filters))
================================================ FILE: lm_eval/_cli/run.py ================================================ import argparse import json import logging import os import textwrap from functools import partial from lm_eval._cli.subcommand import SubCommand from lm_eval._cli.utils import ( MergeDictAction, SplitArgs, _int_or_none_list_arg_type, request_caching_arg_to_dict, try_parse_json, ) class Run(SubCommand): """Command for running language model evaluation.""" def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs): super().__init__(*args, **kwargs) self._parser = subparsers.add_parser( "run", help="Run the evaluation harness on specified tasks", description="Evaluate language models on various benchmarks and tasks.", usage="lm-eval run --model --tasks --model_args [options]", epilog=textwrap.dedent(""" examples: # Basic evaluation with HuggingFace model $ lm-eval run --model hf --model_args pretrained=gpt2 dtype=float32 --tasks hellaswag # Evaluate on multiple tasks with few-shot examples $ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy arc_challenge --num_fewshot 5 # Evaluation with custom generation parameters $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95 'stop=["\\n\\n"]' # Use configuration file $ lm-eval run --config my_config.yaml --tasks mmlu For more information, see: https://github.com/EleutherAI/lm-evaluation-harness """), formatter_class=argparse.RawDescriptionHelpFormatter, ) self._add_args() self._parser.set_defaults(func=self._execute) def _add_args(self) -> None: self._parser = self._parser # Defaults are set in config/evaluate_config.py config_group = self._parser.add_argument_group("configuration") config_group.add_argument( "--config", "-C", default=None, type=str, metavar="", help="Set initial arguments from YAML config", ) # Model and Tasks model_group = self._parser.add_argument_group("model and tasks") 
model_group.add_argument( "--tasks", "-t", default=None, nargs="+", metavar="", action=SplitArgs, help=textwrap.dedent(""" Space (or comma-separated) list of task names or groupings. Use 'lm-eval list tasks' to see all available tasks. """).strip(), ) model_group.add_argument( "--model", "-M", type=str, default=None, metavar="", help="Model name (default: hf)", ) model_group.add_argument( "--model_args", "-a", default=None, nargs="+", action=MergeDictAction, metavar="", help="Model arguments as 'key=val,key2=val2' or `key=val` `key2=val2`", ) model_group.add_argument( "--apply_chat_template", type=str, nargs="?", const=True, default=argparse.SUPPRESS, metavar="