Showing preview only (2,435K chars total). Download the full file or copy to clipboard to get everything.
Repository: JonathanChavezTamales/LLMStats
Branch: main
Commit: 872b75f63b8d
Files: 778
Total size: 2.2 MB
Directory structure:
gitextract_261_qksq/
├── .github/
│ ├── pull_request_template.md
│ └── workflows/
│ └── schema-validation.yml
├── .gitignore
├── .vscode/
│ └── settings.json
├── CONTRIBUTING.md
├── LICENSE.md
├── README.md
├── data/
│ ├── .github/
│ │ └── CODEOWNERS
│ ├── benchmarks/
│ │ ├── aa-index.json
│ │ ├── acebench.json
│ │ ├── activitynet.json
│ │ ├── agieval.json
│ │ ├── ai2-reasoning-challenge-(arc).json
│ │ ├── ai2d.json
│ │ ├── aider-polyglot-edit.json
│ │ ├── aider-polyglot.json
│ │ ├── aider.json
│ │ ├── aime-2024.json
│ │ ├── aime-2025.json
│ │ ├── aime.json
│ │ ├── aitz-em.json
│ │ ├── alignbench.json
│ │ ├── alpacaeval-2.0.json
│ │ ├── amc-2022-23.json
│ │ ├── android-control-high-em.json
│ │ ├── android-control-low-em.json
│ │ ├── androidworld-sr.json
│ │ ├── api-bank.json
│ │ ├── arc-agi-v2.json
│ │ ├── arc-agi.json
│ │ ├── arc-c.json
│ │ ├── arc-e.json
│ │ ├── arc.json
│ │ ├── arena-hard-v2.json
│ │ ├── arena-hard.json
│ │ ├── attaq.json
│ │ ├── autologi.json
│ │ ├── bbh.json
│ │ ├── bfcl-v2.json
│ │ ├── bfcl-v3-multiturn.json
│ │ ├── bfcl-v3.json
│ │ ├── bfcl.json
│ │ ├── big-bench-extra-hard.json
│ │ ├── big-bench-hard.json
│ │ ├── big-bench.json
│ │ ├── bigcodebench-full.json
│ │ ├── bigcodebench-hard.json
│ │ ├── bigcodebench.json
│ │ ├── bird-sql-(dev).json
│ │ ├── blink.json
│ │ ├── boolq.json
│ │ ├── browsecomp-long-128k.json
│ │ ├── browsecomp-long-256k.json
│ │ ├── browsecomp-zh.json
│ │ ├── browsecomp.json
│ │ ├── c-eval.json
│ │ ├── cbnsl.json
│ │ ├── cc-ocr.json
│ │ ├── cfeval.json
│ │ ├── charadessta.json
│ │ ├── chartqa.json
│ │ ├── charxiv-d.json
│ │ ├── charxiv-r.json
│ │ ├── chexpert-cxr.json
│ │ ├── cluewsc.json
│ │ ├── cmmlu.json
│ │ ├── cnmo-2024.json
│ │ ├── codeforces.json
│ │ ├── codegolf-v2.2.json
│ │ ├── collie.json
│ │ ├── common-voice-15.json
│ │ ├── commonsenseqa.json
│ │ ├── complexfuncbench.json
│ │ ├── covost2-en-zh.json
│ │ ├── covost2.json
│ │ ├── crag.json
│ │ ├── creative-writing-v3.json
│ │ ├── crperelation.json
│ │ ├── crux-o.json
│ │ ├── cruxeval-input-cot.json
│ │ ├── cruxeval-o.json
│ │ ├── cruxeval-output-cot.json
│ │ ├── csimpleqa.json
│ │ ├── cybersecurity-ctfs.json
│ │ ├── dermmcqa.json
│ │ ├── docvqa.json
│ │ ├── docvqatest.json
│ │ ├── drop.json
│ │ ├── ds-arena-code.json
│ │ ├── ds-fim-eval.json
│ │ ├── eclektic.json
│ │ ├── egoschema.json
│ │ ├── erqa.json
│ │ ├── evalplus.json
│ │ ├── facts-grounding.json
│ │ ├── factscore.json
│ │ ├── finqa.json
│ │ ├── flenqa.json
│ │ ├── fleurs.json
│ │ ├── frames.json
│ │ ├── french-mmlu.json
│ │ ├── frontiermath.json
│ │ ├── functionalmath.json
│ │ ├── giantsteps-tempo.json
│ │ ├── global-mmlu-lite.json
│ │ ├── global-mmlu.json
│ │ ├── gorilla-benchmark-api-bench.json
│ │ ├── govreport.json
│ │ ├── gpqa-biology.json
│ │ ├── gpqa-chemistry.json
│ │ ├── gpqa-physics.json
│ │ ├── gpqa.json
│ │ ├── graphwalks-bfs-%3C128k.json
│ │ ├── graphwalks-bfs-%3E128k.json
│ │ ├── graphwalks-parents-%3C128k.json
│ │ ├── graphwalks-parents-%3E128k.json
│ │ ├── groundui-1k.json
│ │ ├── gsm-8k-(cot).json
│ │ ├── gsm8k-chat.json
│ │ ├── gsm8k.json
│ │ ├── hallusion-bench.json
│ │ ├── healthbench-hard.json
│ │ ├── healthbench.json
│ │ ├── hellaswag.json
│ │ ├── hiddenmath.json
│ │ ├── hle.json
│ │ ├── hmmt-2025.json
│ │ ├── hmmt25.json
│ │ ├── humaneval+.json
│ │ ├── humaneval-average.json
│ │ ├── humaneval-er.json
│ │ ├── humaneval-mul.json
│ │ ├── humaneval-plus.json
│ │ ├── humaneval.json
│ │ ├── humanevalfim-average.json
│ │ ├── humanity's-last-exam.json
│ │ ├── if.json
│ │ ├── ifeval.json
│ │ ├── include.json
│ │ ├── infinitebench-en.mc.json
│ │ ├── infinitebench-en.qa.json
│ │ ├── infographicsqa.json
│ │ ├── infovqa.json
│ │ ├── infovqatest.json
│ │ ├── instruct-humaneval.json
│ │ ├── intergps.json
│ │ ├── internal-api-instruction-following-(hard).json
│ │ ├── lbpp-(v2).json
│ │ ├── livebench-20241125.json
│ │ ├── livebench.json
│ │ ├── livecodebench(01-09).json
│ │ ├── livecodebench-v5-24.12-25.2.json
│ │ ├── livecodebench-v5.json
│ │ ├── livecodebench-v6.json
│ │ ├── livecodebench.json
│ │ ├── longbench-v2.json
│ │ ├── longfact-concepts.json
│ │ ├── longfact-objects.json
│ │ ├── longvideobench.json
│ │ ├── lsat.json
│ │ ├── lvbench.json
│ │ ├── math-(cot).json
│ │ ├── math-500.json
│ │ ├── math.json
│ │ ├── mathvision.json
│ │ ├── mathvista-mini.json
│ │ ├── mathvista.json
│ │ ├── mbpp+.json
│ │ ├── mbpp-++-base-version.json
│ │ ├── mbpp-evalplus-(base).json
│ │ ├── mbpp-evalplus.json
│ │ ├── mbpp-pass@1.json
│ │ ├── mbpp-plus.json
│ │ ├── mbpp.json
│ │ ├── medxpertqa.json
│ │ ├── mega-mlqa.json
│ │ ├── mega-tydi-qa.json
│ │ ├── mega-udpos.json
│ │ ├── mega-xcopa.json
│ │ ├── mega-xstorycloze.json
│ │ ├── meld.json
│ │ ├── mgsm.json
│ │ ├── mimic-cxr.json
│ │ ├── mlvu-m.json
│ │ ├── mlvu.json
│ │ ├── mm-if-eval.json
│ │ ├── mm-mind2web.json
│ │ ├── mm-mt-bench.json
│ │ ├── mmau-music.json
│ │ ├── mmau-sound.json
│ │ ├── mmau-speech.json
│ │ ├── mmau.json
│ │ ├── mmbench-test.json
│ │ ├── mmbench-v1.1.json
│ │ ├── mmbench-video.json
│ │ ├── mmbench.json
│ │ ├── mme-realworld.json
│ │ ├── mme.json
│ │ ├── mmlu-(cot).json
│ │ ├── mmlu-base.json
│ │ ├── mmlu-chat.json
│ │ ├── mmlu-french.json
│ │ ├── mmlu-pro.json
│ │ ├── mmlu-prox.json
│ │ ├── mmlu-redux-2.0.json
│ │ ├── mmlu-redux.json
│ │ ├── mmlu-stem.json
│ │ ├── mmlu.json
│ │ ├── mmmlu.json
│ │ ├── mmmu-(val).json
│ │ ├── mmmu-(validation).json
│ │ ├── mmmu-pro.json
│ │ ├── mmmu.json
│ │ ├── mmmuval.json
│ │ ├── mmstar.json
│ │ ├── mmt-bench.json
│ │ ├── mmvet.json
│ │ ├── mmvetgpt4turbo.json
│ │ ├── mobileminiwob++-sr.json
│ │ ├── mrcr-1m-(pointwise).json
│ │ ├── mrcr-1m.json
│ │ ├── mrcr-v2-(8-needle).json
│ │ ├── mrcr-v2.json
│ │ ├── mrcr.json
│ │ ├── mt-bench.json
│ │ ├── mtvqa.json
│ │ ├── muirbench.json
│ │ ├── multi-if.json
│ │ ├── multi-swe-bench.json
│ │ ├── multichallenge-(o3-mini-grader).json
│ │ ├── multichallenge.json
│ │ ├── multilf.json
│ │ ├── multilingual-mgsm-(cot).json
│ │ ├── multilingual-mmlu.json
│ │ ├── multipl-e-humaneval.json
│ │ ├── multipl-e-mbpp.json
│ │ ├── multipl-e.json
│ │ ├── musiccaps.json
│ │ ├── musr.json
│ │ ├── mvbench.json
│ │ ├── natural-questions.json
│ │ ├── natural2code.json
│ │ ├── nexus.json
│ │ ├── nih-multi-needle.json
│ │ ├── nmos.json
│ │ ├── nq.json
│ │ ├── ocrbench-v2-(en).json
│ │ ├── ocrbench-v2-(zh).json
│ │ ├── ocrbench-v2.json
│ │ ├── ocrbench.json
│ │ ├── odinw.json
│ │ ├── ojbench.json
│ │ ├── olympiadbench.json
│ │ ├── omnibench-music.json
│ │ ├── omnibench.json
│ │ ├── omnimath.json
│ │ ├── open-rewrite.json
│ │ ├── openai-mmlu.json
│ │ ├── openai-mrcr%3A-2-needle-128k.json
│ │ ├── openai-mrcr%3A-2-needle-1m.json
│ │ ├── openai-mrcr%3A-2-needle-256k.json
│ │ ├── openbookqa.json
│ │ ├── osworld-extended.json
│ │ ├── osworld-screenshot-only.json
│ │ ├── osworld.json
│ │ ├── pathmcqa.json
│ │ ├── perceptiontest.json
│ │ ├── phibench.json
│ │ ├── physicsfinals.json
│ │ ├── piqa.json
│ │ ├── pointgrounding.json
│ │ ├── polymath-en.json
│ │ ├── polymath.json
│ │ ├── pope.json
│ │ ├── popqa.json
│ │ ├── qasper.json
│ │ ├── qmsum.json
│ │ ├── realworldqa.json
│ │ ├── repobench.json
│ │ ├── repoqa.json
│ │ ├── ruler.json
│ │ ├── sat-math.json
│ │ ├── scale-multichallenge.json
│ │ ├── scicode.json
│ │ ├── scienceqa-visual.json
│ │ ├── scienceqa.json
│ │ ├── screenspot-pro.json
│ │ ├── screenspot.json
│ │ ├── simpleqa.json
│ │ ├── slakevqa.json
│ │ ├── social-iqa.json
│ │ ├── spider.json
│ │ ├── squality.json
│ │ ├── stem.json
│ │ ├── summscreenfd.json
│ │ ├── superglue.json
│ │ ├── supergpqa.json
│ │ ├── swe-bench-multilingual.json
│ │ ├── swe-bench-verified-(agentic-coding).json
│ │ ├── swe-bench-verified-(agentless).json
│ │ ├── swe-bench-verified-(multiple-attempts).json
│ │ ├── swe-bench-verified.json
│ │ ├── swe-dev.json
│ │ ├── swe-lancer-(ic-diamond-subset).json
│ │ ├── swe-lancer.json
│ │ ├── tau-bench-airline.json
│ │ ├── tau-bench-retail.json
│ │ ├── tau-bench.json
│ │ ├── tau2-airline.json
│ │ ├── tau2-retail.json
│ │ ├── tau2-telecom.json
│ │ ├── tempcompass.json
│ │ ├── terminal-bench.json
│ │ ├── terminus.json
│ │ ├── textvqa.json
│ │ ├── theoremqa.json
│ │ ├── tldr9+-(test).json
│ │ ├── translation-en-to-set1-comet22.json
│ │ ├── translation-en-to-set1-spbleu.json
│ │ ├── translation-set1-to-en-comet22.json
│ │ ├── translation-set1-to-en-spbleu.json
│ │ ├── triviaqa.json
│ │ ├── truthfulqa.json
│ │ ├── tydiqa.json
│ │ ├── uniform-bar-exam.json
│ │ ├── usamo25.json
│ │ ├── vatex.json
│ │ ├── vcr-en-easy.json
│ │ ├── vibe-eval.json
│ │ ├── video-mme-(long,-no-subtitles).json
│ │ ├── video-mme.json
│ │ ├── video-mmew-sub.json
│ │ ├── videomme-w-o-sub..json
│ │ ├── videomme-w-sub..json
│ │ ├── videommmu.json
│ │ ├── visualwebbench.json
│ │ ├── vocalsound.json
│ │ ├── voicebench-avg.json
│ │ ├── vqa-rad.json
│ │ ├── vqav2-(test).json
│ │ ├── vqav2-(val).json
│ │ ├── vqav2.json
│ │ ├── wild-bench.json
│ │ ├── winogrande.json
│ │ ├── wmt23.json
│ │ ├── wmt24++.json
│ │ ├── writingbench.json
│ │ ├── xlsum-english.json
│ │ ├── xstest.json
│ │ └── zebralogic.json
│ ├── licenses/
│ │ ├── apache_2_0.json
│ │ ├── cc_by_nc.json
│ │ ├── creative_commons_attribution_4_0_license.json
│ │ ├── deepseek.json
│ │ ├── gemma.json
│ │ ├── health_ai_developer_foundations_terms_of_use.json
│ │ ├── jamba_open_model_license.json
│ │ ├── llama3_2.json
│ │ ├── llama_3_1_community_license.json
│ │ ├── llama_3_2_community_license.json
│ │ ├── llama_3_3_community_license_agreement.json
│ │ ├── llama_4_community_license_agreement.json
│ │ ├── mistral_research_license.json
│ │ ├── mistral_research_license_(mrl)_for_research;_mistral_commercial_license_for_commercial_use.json
│ │ ├── mit.json
│ │ ├── mit_+_model_license_(commercial_use_allowed).json
│ │ ├── mit_license.json
│ │ ├── mnpl_0_1.json
│ │ ├── modified_mit_license.json
│ │ ├── nvidia_open_model_license_agreement.json
│ │ ├── proprietary.json
│ │ ├── qwen.json
│ │ ├── tongyi_qianwen.json
│ │ └── unknown.json
│ ├── organizations/
│ │ ├── ai21/
│ │ │ ├── models/
│ │ │ │ ├── jamba-1.5-large/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── jamba-1.5-mini/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── amazon/
│ │ │ ├── models/
│ │ │ │ ├── nova-lite/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── nova-micro/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── nova-pro/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── anthropic/
│ │ │ ├── models/
│ │ │ │ ├── claude-3-5-haiku-20241022/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-3-5-sonnet-20240620/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-3-5-sonnet-20241022/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-3-7-sonnet-20250219/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-3-haiku-20240307/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-3-opus-20240229/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-3-sonnet-20240229/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-haiku-4-5-20251015/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-opus-4-1-20250805/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-opus-4-20250514/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-sonnet-4-20250514/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── claude-sonnet-4-5-20250929/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── cohere/
│ │ │ ├── models/
│ │ │ │ └── command-r-plus-04-2024/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── deepseek/
│ │ │ ├── models/
│ │ │ │ ├── deepseek-r1/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-r1-0528/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-r1-distill-llama-70b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-r1-distill-llama-8b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-r1-distill-qwen-1.5b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-r1-distill-qwen-14b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-r1-distill-qwen-32b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-r1-distill-qwen-7b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-r1-zero/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-v2.5/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-v3/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-v3-0324/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-v3.1/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-v3.2-exp/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-vl2/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-vl2-small/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── deepseek-vl2-tiny/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── google/
│ │ │ ├── models/
│ │ │ │ ├── gemini-1.0-pro/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-1.5-flash/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-1.5-flash-8b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-1.5-pro/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-2.0-flash/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-2.0-flash-lite/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-2.0-flash-thinking/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-2.5-flash/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-2.5-flash-lite/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-2.5-pro/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-2.5-pro-preview-06-05/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-diffusion/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-2-27b-it/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-2-9b-it/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3-12b-it/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3-1b-it/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3-27b-it/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3-4b-it/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3n-e2b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3n-e2b-it/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3n-e2b-it-litert-preview/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3n-e4b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3n-e4b-it/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3n-e4b-it-litert-preview/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── medgemma-4b-it/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── ibm/
│ │ │ ├── models/
│ │ │ │ ├── granite-3.3-8b-base/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── granite-3.3-8b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── granite-4.0-tiny-preview/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── meta/
│ │ │ ├── models/
│ │ │ │ ├── llama-3.1-405b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.1-70b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.1-8b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.2-11b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.2-3b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.2-90b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.3-70b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-4-maverick/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── llama-4-scout/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── microsoft/
│ │ │ ├── models/
│ │ │ │ ├── phi-3.5-mini-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── phi-3.5-moe-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── phi-3.5-vision-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── phi-4/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── phi-4-mini/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── phi-4-mini-reasoning/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── phi-4-multimodal-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── phi-4-reasoning/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── phi-4-reasoning-plus/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── mistral/
│ │ │ ├── models/
│ │ │ │ ├── codestral-22b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── devstral-medium-2507/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── devstral-small-2507/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── magistral-medium/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── magistral-small-2506/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── ministral-8b-instruct-2410/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── mistral-large-2-2407/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── mistral-nemo-instruct-2407/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── mistral-small-2409/
│ │ │ │ │ └── model.json
│ │ │ │ ├── mistral-small-24b-base-2501/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── mistral-small-24b-instruct-2501/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── mistral-small-3.1-24b-base-2503/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── mistral-small-3.1-24b-instruct-2503/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── mistral-small-3.2-24b-instruct-2506/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── pixtral-12b-2409/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── pixtral-large/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── moonshotai/
│ │ │ ├── models/
│ │ │ │ ├── kimi-k1.5/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── kimi-k2-0905/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── kimi-k2-base/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── kimi-k2-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── kimi-k2-instruct-0905/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── nvidia/
│ │ │ ├── models/
│ │ │ │ ├── llama-3.1-nemotron-70b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.1-nemotron-nano-8b-v1/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.1-nemotron-ultra-253b-v1/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.3-nemotron-super-49b-v1/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── nemotron-nano-9b-v2/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── openai/
│ │ │ ├── models/
│ │ │ │ ├── gpt-3.5-turbo-0125/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4-0613/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4-turbo-2024-04-09/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4.1-2025-04-14/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4.1-mini-2025-04-14/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4.1-nano-2025-04-14/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4.5/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4o-2024-05-13/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4o-2024-08-06/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4o-mini-2024-07-18/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-5-2025-08-07/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-5-codex-2025-09-15/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-5-mini-2025-08-07/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-5-nano-2025-08-07/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-oss-120b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-oss-20b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── o1-2024-12-17/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── o1-mini/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── o1-preview/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── o1-pro/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── o3-2025-04-16/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── o3-mini/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── o3-pro-2025-06-10/
│ │ │ │ │ └── model.json
│ │ │ │ └── o4-mini/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── qwen/
│ │ │ ├── models/
│ │ │ │ ├── qvq-72b-preview/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen-2.5-14b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen-2.5-32b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen-2.5-72b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen-2.5-7b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen-2.5-coder-32b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen-2.5-coder-7b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen2-72b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen2-7b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen2-vl-72b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen2.5-omni-7b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen2.5-vl-32b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen2.5-vl-72b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen2.5-vl-7b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen3-235b-a22b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen3-235b-a22b-instruct-2507/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen3-235b-a22b-thinking-2507/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen3-30b-a3b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen3-32b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen3-next-80b-a3b-base/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen3-next-80b-a3b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen3-next-80b-a3b-thinking/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwq-32b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── qwq-32b-preview/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── unknown/
│ │ │ └── organization.json
│ │ ├── xai/
│ │ │ ├── models/
│ │ │ │ ├── grok-1.5/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── grok-1.5v/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── grok-2/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── grok-2-mini/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── grok-3/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── grok-3-mini/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── grok-4/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── grok-4-fast/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── grok-4-heavy/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── grok-code-fast-1/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ └── zai-org/
│ │ ├── models/
│ │ │ ├── glm-4.5/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ ├── glm-4.5-air/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ ├── glm-4.5v/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── glm-4.6/
│ │ │ ├── benchmarks.json
│ │ │ └── model.json
│ │ └── organization.json
│ └── providers/
│ ├── anthropic/
│ │ ├── models.json
│ │ └── provider.json
│ ├── azure/
│ │ ├── models.json
│ │ └── provider.json
│ ├── bedrock/
│ │ ├── models.json
│ │ └── provider.json
│ ├── cerebras/
│ │ ├── models.json
│ │ └── provider.json
│ ├── cohere/
│ │ ├── models.json
│ │ └── provider.json
│ ├── deepinfra/
│ │ ├── models.json
│ │ └── provider.json
│ ├── deepseek/
│ │ ├── models.json
│ │ └── provider.json
│ ├── fireworks/
│ │ ├── models.json
│ │ └── provider.json
│ ├── google/
│ │ ├── models.json
│ │ └── provider.json
│ ├── groq/
│ │ ├── models.json
│ │ └── provider.json
│ ├── hyperbolic/
│ │ ├── models.json
│ │ └── provider.json
│ ├── lambda/
│ │ ├── models.json
│ │ └── provider.json
│ ├── mistral/
│ │ ├── models.json
│ │ └── provider.json
│ ├── novita/
│ │ ├── models.json
│ │ └── provider.json
│ ├── openai/
│ │ ├── models.json
│ │ └── provider.json
│ ├── replicate/
│ │ ├── models.json
│ │ └── provider.json
│ ├── sambanova/
│ │ ├── models.json
│ │ └── provider.json
│ ├── together/
│ │ ├── models.json
│ │ └── provider.json
│ ├── xai/
│ │ ├── models.json
│ │ └── provider.json
│ └── zeroeval/
│ ├── models.json
│ └── provider.json
├── package.json
└── schemas/
├── README.md
├── benchmark-results.schema.json
├── benchmark.schema.json
├── integrity-validator.js
├── license.schema.json
├── model.schema.json
├── organization.schema.json
├── provider-models.schema.json
├── provider.schema.json
└── validator.js
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/pull_request_template.md
================================================
## Description
<!-- Briefly describe your changes and add links to the relevant resources -->
References:
<!-- Add links to the relevant resources -->
## Type of Change
<!-- Mark the appropriate option with an [x] -->
- [ ] Model Update/Addition
- [ ] Qualitative Metrics (Benchmark Results) Update/Addition
- [ ] Provider Update/Addition
- [ ] Other (please specify)
## Checklist
- [ ] I've read the [CONTRIBUTING.md](https://github.com/JonathanChavezTamales/LLMStats/blob/main/CONTRIBUTING.md) guidelines
- [ ] My changes are accurate and properly referenced
================================================
FILE: .github/workflows/schema-validation.yml
================================================
name: Schema Validation
on:
pull_request:
branches: [main]
jobs:
validate:
name: Validate Schema
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Setup Node.js
uses: actions/setup-node@v3
with:
node-version: "16"
cache: "npm"
- name: Install dependencies
run: npm ci
- name: Run schema validation
run: node schemas/validator.js
================================================
FILE: .gitignore
================================================
/node_modules
================================================
FILE: .vscode/settings.json
================================================
{
"json.schemas": [
{
"fileMatch": ["/models/*/model.json"],
"url": "./schemas/model.schema.json"
},
{
"fileMatch": ["/models/*/benchmarks.json"],
"url": "./schemas/benchmark-results.schema.json"
},
{
"fileMatch": ["/providers/*/provider.json"],
"url": "./schemas/provider.schema.json"
},
{
"fileMatch": ["/providers/*/models.json"],
"url": "./schemas/provider-models.schema.json"
}
]
}
================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to LLM Stats
Thank you for your interest in contributing. This guide outlines the process for updating and adding information to the LLM Stats database.
## Table of Contents
- [Overview](#overview)
- [Data Structure](#data-structure)
- [General Guidelines](#general-guidelines)
- [Organizations](#organizations)
- [Models](#models)
- [Benchmark Results](#benchmark-results)
- [Benchmarks](#benchmarks)
- [Providers](#providers)
- [Licenses](#licenses)
- [Validation](#validation)
- [Submitting Your Contribution](#submitting-your-contribution)
## Overview
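All data is organized in the `data/data/` directory with a hierarchical structure. In this guide the repository root is written as `data/` (see the tree below), so a path such as `data/data/organizations/` corresponds to `data/organizations/` in a checkout of this repository, while `schemas/...` paths are relative to the repository root. Each entity type has its own JSON schema definition in `schemas/` that validates the data structure.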
## Data Structure
```
data/
├── data/
│ ├── organizations/
│ │ └── [organization_id]/
│ │ ├── organization.json
│ │ └── models/
│ │ └── [model_id]/
│ │ ├── model.json
│ │ └── benchmarks.json
│ ├── providers/
│ │ └── [provider_id]/
│ │ ├── provider.json
│ │ └── models.json
│ ├── licenses/
│ │ └── [license_id].json
│ └── benchmarks/
│ └── [benchmark_id].json
└── schemas/
├── organization.schema.json
├── model.schema.json
├── benchmark-results.schema.json
├── benchmark.schema.json
├── provider.schema.json
├── provider-models.schema.json
└── license.schema.json
```
## General Guidelines
1. **Accuracy First**: Ensure all data is accurate and sourced from authoritative references
2. **Follow Structure**: Adhere to the existing file structure and naming conventions
3. **Consistent Formatting**: Use consistent JSON formatting with 2-space indentation
4. **One Change per PR**: Submit one pull request per logical change (e.g., one model, one provider)
5. **Schema Validation**: All data files must validate against their respective JSON schemas
6. **Required Fields**: Pay attention to required vs optional fields in schemas
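7. **Timestamps**: Use ISO 8601 format: `YYYY-MM-DD` for date fields (e.g., `release_date`) and a full timestamp with UTC offset for `created_at`/`updated_at` (e.g., `2025-10-02T00:00:00.000000+00:00`)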
## Organizations
Organizations represent the entities that create and release models (e.g., OpenAI, Anthropic, Meta).
### Location
`data/data/organizations/[organization_id]/organization.json`
### Adding a New Organization
1. Create a new directory: `data/data/organizations/[organization_id]/`
2. Create `organization.json` with the following structure:
```json
{
"organization_id": "organization-name",
"name": "Organization Display Name",
"website": "https://organization.com",
"description": "Brief description of the organization",
"country": "US",
"created_at": "2025-10-02T00:00:00.000000+00:00",
"updated_at": "2025-10-02T00:00:00.000000+00:00"
}
```
3. Validate against `schemas/organization.schema.json`
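4. Create a `models/` subdirectory for future models (Git does not track empty directories, so it will only appear in your commit once it contains a model)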
### Updating an Existing Organization
1. Navigate to `data/data/organizations/[organization_id]/organization.json`
2. Update the relevant fields
3. Update the `updated_at` timestamp
4. Validate against the schema
## Models
Models are stored within their respective organization directories.
### Location
`data/data/organizations/[organization_id]/models/[model_id]/`
### Adding a New Model
1. Ensure the organization exists in `data/data/organizations/`
2. Ensure the license exists in `data/data/licenses/`
3. Create a new directory: `data/data/organizations/[organization_id]/models/[model_id]/`
4. Create two files in this directory:
#### `model.json`
```json
{
"model_id": "model-name-version",
"name": "Model Display Name",
"organization_id": "organization-name",
"fine_tuned_from_model_id": null,
"description": "Detailed description of the model's capabilities",
"release_date": "2024-10-22",
"announcement_date": "2024-10-22",
"license_id": "proprietary",
"multimodal": false,
"knowledge_cutoff": "2024-04-01",
"param_count": 7000000000,
"training_tokens": 15000000000000,
"available_in_zeroeval": true,
"source_api_ref": "https://...",
"source_playground": "https://...",
"source_paper": "https://...",
"source_scorecard_blog_link": "https://...",
"source_repo_link": "https://github.com/...",
"source_weights_link": "https://huggingface.co/...",
"created_at": "2025-10-02T00:00:00.000000+00:00",
"updated_at": "2025-10-02T00:00:00.000000+00:00",
"model_family_id": null
}
```
**Required Fields**: `model_id`, `name`, `organization_id`, `release_date`, `license_id`, `multimodal`
**Optional Fields**: Set to `null` if not applicable
#### `benchmarks.json`
Start with an empty array if no benchmark results are available yet:
```json
[]
```
5. Validate both files against their respective schemas
### Updating an Existing Model
1. Navigate to `data/data/organizations/[organization_id]/models/[model_id]/model.json`
2. Update the relevant fields
3. Update the `updated_at` timestamp
4. Validate against `schemas/model.schema.json`
## Benchmark Results
Benchmark results are stored in the `benchmarks.json` file within each model directory.
### Location
`data/data/organizations/[organization_id]/models/[model_id]/benchmarks.json`
### Adding Benchmark Results
1. Ensure the benchmark exists in `data/data/benchmarks/`
2. Ensure the model exists
3. Add a new entry to the `benchmarks.json` array:
```json
[
{
"benchmark_id": "mmlu",
"score": 85.5,
"score_unit": "percentage",
"source_link": "https://example.com/results",
"created_at": "2025-10-02T00:00:00.000000+00:00",
"updated_at": "2025-10-02T00:00:00.000000+00:00"
}
]
```
4. Validate against `schemas/benchmark-results.schema.json`
### Updating Benchmark Results
1. Locate the specific result in the array
2. Update the `score` and/or `source_link`
3. Update the `updated_at` timestamp
4. Ensure the `source_link` is reliable and authoritative
## Benchmarks
Benchmarks define the evaluation tests used to measure model performance.
### Location
`data/data/benchmarks/[benchmark_id].json`
### Adding a New Benchmark
1. Create a new file: `data/data/benchmarks/[benchmark_id].json`
2. Follow this structure:
```json
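{
"benchmark_id": "benchmark-name",
"name": "Benchmark Display Name",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Description of what this benchmark measures",
"paper_link": "https://...",
"implementation_link": "https://...",
"verified": false,
"created_at": "2025-10-02T00:00:00.000000+00:00",
"updated_at": "2025-10-02T00:00:00.000000+00:00"
}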
```
3. Validate against `schemas/benchmark.schema.json`
## Providers
Providers are services that offer access to models (e.g., OpenAI API, AWS Bedrock, Google Vertex AI).
### Location
`data/data/providers/[provider_id]/`
### Adding a New Provider
1. Create a new directory: `data/data/providers/[provider_id]/`
2. Create two files:
#### `provider.json`
```json
{
"provider_id": "provider-name",
"name": "Provider Display Name",
"website": "https://provider.com",
"created_at": "2025-10-02T00:00:00.000000+00:00",
"updated_at": "2025-10-02T00:00:00.000000+00:00"
}
```
#### `models.json`
Start with an empty array:
```json
[]
```
3. Validate both files against their respective schemas
### Updating Provider Information
1. Navigate to `data/data/providers/[provider_id]/provider.json`
2. Update the relevant fields
3. Update the `updated_at` timestamp
### Adding Provider Models
Provider models specify pricing and availability of models through specific providers.
1. Open `data/data/providers/[provider_id]/models.json`
2. Add a new entry to the array:
```json
[
{
"provider_model_id": "provider-specific-id",
"model_id": "actual-model-id",
"provider_id": "provider-name",
"input_price_per_million": 3.0,
"output_price_per_million": 15.0,
"context_window": 200000,
"max_output_tokens": 4096,
"available": true,
"created_at": "2025-10-02T00:00:00.000000+00:00",
"updated_at": "2025-10-02T00:00:00.000000+00:00"
}
]
```
3. Ensure the model exists in `data/data/organizations/[org]/models/[model_id]/`
4. Validate against `schemas/provider-models.schema.json`
## Licenses
Licenses define the terms under which models can be used.
### Location
`data/data/licenses/[license_id].json`
### Adding a New License
1. Create a new file: `data/data/licenses/[license_id].json`
2. Follow this structure:
```json
{
"license_id": "license-name",
"name": "License Display Name",
"url": "https://...",
"commercial_use": true,
"created_at": "2025-10-02T00:00:00.000000+00:00",
"updated_at": "2025-10-02T00:00:00.000000+00:00"
}
```
3. Validate against `schemas/license.schema.json`
## Validation
Before submitting your contribution:
### Manual Validation
Run the validator script from the `data/` directory:
```bash
cd data
node schemas/validator.js
```
This will check all JSON files against their respective schemas.
### What the Validator Checks
- JSON syntax correctness
- Required fields are present
- Field types match schema definitions
- ID references exist (e.g., organization_id, license_id)
- Date formats are valid
- URLs are properly formatted
### Common Validation Errors
1. **Missing Required Fields**: Ensure all required fields are present
2. **Invalid Date Format**: Use ISO 8601 format (YYYY-MM-DD or full timestamp)
3. **Invalid References**: Ensure referenced IDs exist (organization_id, license_id, etc.)
4. **Type Mismatch**: Ensure numbers are numbers, strings are strings, etc.
5. **Trailing Commas**: Remove trailing commas in JSON arrays/objects
## Submitting Your Contribution
1. **Fork the Repository**: Create your own fork of the project
2. **Create a Branch**: Use a descriptive branch name (e.g., `add-gpt5-model`, `update-claude-pricing`)
3. **Make Changes**: Follow the guidelines above
4. **Validate Locally**: Run `node schemas/validator.js` to ensure your changes are valid
5. **Commit Changes**: Write clear, descriptive commit messages
6. **Submit a Pull Request**:
- Provide a clear title and description
- List what was added or changed
- Include links to authoritative sources
- Reference any related issues
### Pull Request Template
```markdown
## Description
Brief description of what this PR adds or changes
## Changes
- Added/Updated model: [Model Name]
- Added/Updated organization: [Organization Name]
- Added benchmark results for: [Benchmark Name]
## Sources
- [Source 1]: https://...
- [Source 2]: https://...
## Validation
- [ ] Ran `node schemas/validator.js` successfully
- [ ] All files follow the correct structure
- [ ] All references (organization_id, license_id) are valid
```
### Example Pull Request
For reference, see this [example pull request](https://github.com/JonathanChavezTamales/llm-leaderboard/pull/1).
## Questions?
If you have questions or need clarification, please:
1. Check the schema files in `schemas/` for detailed field definitions
2. Look at existing data files as examples
3. Open an issue on GitHub
Thank you for contributing to LLM Stats!
================================================
FILE: LICENSE.md
================================================
Creative Commons Attribution 4.0 International License
Copyright (c) 2024 jc
This work is licensed under the Creative Commons Attribution 4.0 International License.
To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/
or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
You are free to:
- Share — copy and redistribute the material in any medium or format
- Adapt — remix, transform, and build upon the material for any purpose, even commercially
Under the following terms:
- Attribution — You must give appropriate credit, provide a link to the license, and indicate
if changes were made. You may do so in any reasonable manner, but not in any way that
suggests the licensor endorses you or your use.
No additional restrictions — You may not apply legal terms or technological measures that
legally restrict others from doing anything the license permits.
Notices:
- You do not have to comply with the license for elements of the material in the public domain
or where your use is permitted by an applicable exception or limitation.
- No warranties are given. The license may not give you all of the permissions necessary for
your intended use. For example, other rights such as publicity, privacy, or moral rights
may limit how you use the material.
================================================
FILE: README.md
================================================
# DEPRECATED - Updates and contributions
This repository is now deprecated and won't be getting any new updates. For contributions and corrections of the data seen in [LLM Stats](https://llm-stats.com/), please create a post with the tag "Issue" in the [official community section](https://llm-stats.com/posts) of the website.
For model- and/or benchmark-specific corrections, please create an Issue under the "Discussion" tab of the model/benchmark, as seen in the example below.
<img width="1156" height="575" alt="Screenshot 2025-10-24 at 1 43 52 p m" src="https://github.com/user-attachments/assets/b78f2cf3-f3ff-4a51-bba4-d8643865d16b" />
---
<img width="1208" alt="image" src="https://github.com/user-attachments/assets/835f1e1b-73e6-405a-b7ad-096d5f5f567a" />
# LLM-Stats.com
[GitHub Stars](https://github.com/JonathanChavezTamales/llm-leaderboard/stargazers)
[Contributing](CONTRIBUTING.md)
[Discord](https://discord.com/invite/RxGUBvE42d)
[Issues](https://github.com/JonathanChavezTamales/llm-leaderboard/issues)
A community-driven repository of LLM data and benchmarks. Compare and explore language models through our interactive dashboard at [llm-stats.com](https://llm-stats.com).
## Found an issue or have a feature request?
[Open an issue here](https://github.com/JonathanChavezTamales/llm-leaderboard/issues). Thank you!
# Data
## 🔍 What's Inside
Our repository contains detailed information on hundreds of LLMs:
- Model parameters, context window sizes, licensing details, capabilities, and more
- Provider pricing and configurations
- Performance metrics (throughput, latency)
- Standardized benchmark results
- Organization and license information
## 📁 Data Structure
All data is organized in the `data/` directory:
- `data/models/` - Model metadata and configurations
- `data/providers/` - Provider information
- `data/provider_models/` - Provider-specific model pricing and features
- `data/benchmarks/` - Benchmark definitions
- `data/model_benchmarks/` - Model benchmark scores
- `data/organizations/` - Organization information
- `data/licenses/` - License definitions
## 🤝 How to Contribute
We welcome community contributions to keep our data accurate and up-to-date:
1. **Update Model Data**
- Browse the [`data/`](data/) directory structure
- Submit a PR following our [contribution guidelines](CONTRIBUTING.md)
- Check [`schemas/`](schemas/) for JSON Schema validation
## 📈 Data Quality
Accuracy is our priority. To ensure reliable information:
- All benchmark data requires verifiable source links
- Community review process for all changes
- Multiple source citations encouraged
- Regular validation of submitted data
There's no guarantee that the data is 100% accurate, but we do our best to ensure it's as accurate as possible.
## 🌟 Community
- Join our [Discord](https://discord.gg/RxGUBvE42d) for discussions
## Leaderboard
| Name | Release Date | Input Context | Output Context | GPQA | MMLU | MMLU-Pro | MATH | HumanEval | MMMU | LiveCodeBench |
| ---------------------------------------- | ------------ | ------------- | -------------- | ----- | ----- | -------- | ----- | --------- | ----- | ------------- |
| GPT-5 | 2025-08-07 | N/A | N/A | 0.857 | 0.925 | N/A | 0.847 | 0.934 | 0.842 | N/A |
| o1 | 2024-12-17 | N/A | N/A | 0.780 | 0.918 | N/A | 0.964 | 0.881 | 0.776 | N/A |
| GPT-4.5 | 2025-02-27 | N/A | N/A | 0.695 | 0.908 | N/A | N/A | 0.880 | 0.752 | N/A |
| o1-preview | 2024-09-12 | N/A | N/A | 0.733 | 0.908 | N/A | 0.855 | N/A | N/A | N/A |
| Claude 3.5 Sonnet | 2024-10-22 | N/A | N/A | 0.672 | 0.904 | 0.776 | 0.783 | 0.937 | 0.683 | N/A |
| Claude 3.5 Sonnet | 2024-06-21 | N/A | N/A | 0.594 | 0.904 | 0.761 | 0.711 | 0.920 | N/A | N/A |
| Kimi K2 0905 | 2025-09-05 | N/A | N/A | 0.758 | 0.902 | 0.825 | 0.891 | 0.945 | N/A | N/A |
| GPT-4.1 | 2025-04-14 | N/A | N/A | 0.663 | 0.902 | N/A | N/A | N/A | 0.748 | N/A |
| Kimi K2 Instruct | 2025-07-11 | N/A | N/A | 0.751 | 0.895 | 0.811 | N/A | 0.933 | N/A | N/A |
| GPT-4o | 2024-05-13 | N/A | N/A | 0.536 | 0.887 | 0.726 | 0.766 | 0.902 | N/A | N/A |
| DeepSeek-V3 | 2024-12-25 | N/A | N/A | 0.591 | 0.885 | 0.759 | N/A | N/A | N/A | 0.376 |
| Qwen3 235B A22B | 2025-04-29 | N/A | N/A | 0.475 | 0.878 | 0.682 | 0.718 | N/A | N/A | 0.707 |
| Kimi K2 Base | 2025-07-11 | N/A | N/A | 0.481 | 0.878 | 0.692 | 0.702 | N/A | N/A | N/A |
| Grok-2 | 2024-08-13 | N/A | N/A | 0.560 | 0.875 | 0.755 | 0.761 | 0.884 | 0.661 | N/A |
| GPT-4.1 mini | 2025-04-14 | N/A | N/A | 0.650 | 0.875 | N/A | N/A | N/A | 0.727 | N/A |
| Kimi-k1.5 | 2025-01-20 | N/A | N/A | N/A | 0.874 | N/A | N/A | N/A | 0.700 | N/A |
| Llama 3.1 405B Instruct | 2024-07-23 | N/A | N/A | 0.507 | 0.873 | 0.733 | 0.738 | 0.890 | N/A | N/A |
| o3-mini | 2025-01-30 | N/A | N/A | 0.772 | 0.869 | N/A | 0.979 | N/A | N/A | N/A |
| Claude 3 Opus | 2024-02-29 | N/A | N/A | 0.504 | 0.868 | 0.685 | 0.601 | 0.849 | N/A | N/A |
| GPT-4 Turbo | 2024-04-09 | N/A | N/A | 0.480 | 0.865 | N/A | 0.726 | 0.871 | N/A | N/A |
| GPT-4 | 2023-06-13 | N/A | N/A | 0.357 | 0.864 | N/A | 0.420 | 0.670 | N/A | N/A |
| Grok-2 mini | 2024-08-13 | N/A | N/A | 0.510 | 0.862 | 0.720 | 0.730 | 0.857 | 0.632 | N/A |
| Llama 3.2 90B Instruct | 2024-09-25 | N/A | N/A | 0.467 | 0.860 | N/A | 0.680 | N/A | 0.603 | N/A |
| Llama 3.3 70B Instruct | 2024-12-06 | N/A | N/A | 0.505 | 0.860 | 0.689 | 0.770 | 0.884 | N/A | N/A |
| Nova Pro | 2024-11-20 | N/A | N/A | 0.469 | 0.859 | N/A | 0.766 | 0.890 | 0.617 | N/A |
| Gemini 1.5 Pro | 2024-05-01 | N/A | N/A | 0.591 | 0.859 | 0.758 | 0.865 | 0.841 | 0.659 | N/A |
| GPT-4o | 2024-08-06 | N/A | N/A | 0.701 | 0.857 | 0.747 | N/A | N/A | 0.722 | N/A |
| Llama 4 Maverick | 2025-04-05 | N/A | N/A | 0.698 | 0.855 | 0.805 | 0.612 | N/A | 0.734 | 0.434 |
| o1-mini | 2024-09-12 | N/A | N/A | 0.600 | 0.852 | N/A | N/A | 0.924 | N/A | N/A |
| Phi 4 | 2024-12-12 | N/A | N/A | 0.561 | 0.848 | 0.704 | 0.804 | 0.826 | N/A | N/A |
| Mistral Large 2 | 2024-07-24 | N/A | N/A | N/A | 0.840 | N/A | N/A | 0.920 | N/A | N/A |
| Llama 3.1 70B Instruct | 2024-07-23 | N/A | N/A | 0.417 | 0.836 | 0.664 | N/A | 0.805 | N/A | N/A |
| Qwen2.5 32B Instruct | 2024-09-19 | N/A | N/A | 0.495 | 0.833 | 0.690 | 0.831 | 0.884 | N/A | N/A |
| Qwen2 72B Instruct | 2024-07-23 | N/A | N/A | 0.424 | 0.823 | 0.644 | 0.597 | 0.860 | N/A | N/A |
| GPT-4o mini | 2024-07-18 | N/A | N/A | 0.402 | 0.820 | N/A | 0.702 | 0.872 | 0.594 | N/A |
| Grok-1.5 | 2024-03-28 | N/A | N/A | 0.359 | 0.813 | 0.510 | 0.506 | 0.741 | 0.536 | N/A |
| Jamba 1.5 Large | 2024-08-22 | N/A | N/A | 0.369 | 0.812 | 0.535 | N/A | N/A | N/A | N/A |
| Mistral Small 3.1 24B Base | 2025-03-17 | N/A | N/A | 0.375 | 0.810 | 0.560 | N/A | N/A | 0.593 | N/A |
| Mistral Small 3 24B Base | 2025-01-30 | N/A | N/A | 0.344 | 0.807 | 0.544 | 0.460 | N/A | N/A | N/A |
| Mistral Small 3.1 24B Instruct | 2025-03-17 | N/A | N/A | 0.460 | 0.806 | 0.668 | 0.693 | 0.884 | 0.593 | N/A |
| Nova Lite | 2024-11-20 | N/A | N/A | 0.420 | 0.805 | N/A | 0.733 | 0.854 | 0.562 | N/A |
| Mistral Small 3.2 24B Instruct | 2025-06-20 | N/A | N/A | 0.461 | 0.805 | 0.691 | 0.694 | N/A | 0.625 | N/A |
| DeepSeek-V2.5 | 2024-05-08 | N/A | N/A | N/A | 0.804 | N/A | 0.747 | 0.890 | N/A | N/A |
| Llama 3.1 Nemotron 70B Instruct | 2024-10-01 | N/A | N/A | N/A | 0.802 | N/A | N/A | N/A | N/A | N/A |
| GPT-4.1 nano | 2025-04-14 | N/A | N/A | 0.503 | 0.801 | N/A | N/A | N/A | 0.554 | N/A |
| Qwen2.5 14B Instruct | 2024-09-19 | N/A | N/A | 0.455 | 0.797 | 0.637 | 0.800 | 0.835 | N/A | N/A |
| Llama 4 Scout | 2025-04-05 | N/A | N/A | 0.572 | 0.796 | 0.743 | 0.503 | N/A | 0.694 | 0.328 |
| Claude 3 Sonnet | 2024-02-29 | N/A | N/A | 0.404 | 0.790 | 0.568 | 0.431 | 0.730 | N/A | N/A |
| Gemini 1.5 Flash | 2024-05-01 | N/A | N/A | 0.510 | 0.789 | 0.673 | 0.779 | 0.743 | 0.623 | N/A |
| Phi-3.5-MoE-instruct | 2024-08-23 | N/A | N/A | 0.368 | 0.789 | 0.453 | 0.595 | 0.707 | N/A | N/A |
| Qwen2.5 VL 32B Instruct | 2025-02-28 | N/A | N/A | 0.460 | 0.784 | 0.688 | 0.822 | 0.915 | 0.700 | N/A |
| Nova Micro | 2024-11-20 | N/A | N/A | 0.400 | 0.776 | N/A | 0.693 | 0.811 | N/A | N/A |
| Command R+ | 2024-08-30 | N/A | N/A | N/A | 0.757 | N/A | N/A | N/A | N/A | N/A |
| Gemma 2 27B | 2024-06-27 | N/A | N/A | N/A | 0.752 | N/A | 0.423 | 0.518 | N/A | N/A |
| Claude 3 Haiku | 2024-03-13 | N/A | N/A | 0.333 | 0.752 | N/A | 0.389 | 0.759 | N/A | N/A |
| Qwen2.5-Coder 32B Instruct | 2024-09-19 | N/A | N/A | N/A | 0.751 | 0.504 | 0.572 | 0.927 | N/A | 0.314 |
| Llama 3.2 11B Instruct | 2024-09-25 | N/A | N/A | 0.328 | 0.730 | N/A | 0.519 | N/A | 0.507 | N/A |
| Gemini 1.0 Pro | 2024-02-15 | N/A | N/A | 0.279 | 0.718 | N/A | 0.326 | N/A | 0.479 | N/A |
| Gemma 2 9B | 2024-06-27 | N/A | N/A | N/A | 0.713 | N/A | 0.366 | 0.402 | N/A | N/A |
| Qwen2 7B Instruct | 2024-07-23 | N/A | N/A | 0.253 | 0.705 | 0.441 | 0.496 | 0.799 | N/A | 0.266 |
| GPT-3.5 Turbo | 2023-03-21 | N/A | N/A | 0.308 | 0.698 | N/A | 0.431 | 0.680 | 0.000 | N/A |
| Jamba 1.5 Mini | 2024-08-22 | N/A | N/A | 0.323 | 0.697 | 0.425 | N/A | N/A | N/A | N/A |
| Llama 3.1 8B Instruct | 2024-07-23 | N/A | N/A | 0.304 | 0.694 | 0.483 | N/A | 0.726 | N/A | N/A |
| Pixtral-12B | 2024-09-17 | N/A | N/A | N/A | 0.692 | N/A | 0.481 | 0.720 | 0.525 | N/A |
| Phi-3.5-mini-instruct | 2024-08-23 | N/A | N/A | 0.304 | 0.690 | 0.474 | 0.485 | 0.628 | N/A | N/A |
| Mistral NeMo Instruct | 2024-07-18 | N/A | N/A | N/A | 0.680 | N/A | N/A | N/A | N/A | N/A |
| Qwen2.5-Coder 7B Instruct | 2024-09-19 | N/A | N/A | N/A | 0.676 | 0.401 | 0.466 | 0.884 | N/A | 0.182 |
| Phi 4 Mini | 2025-02-01 | N/A | N/A | 0.252 | 0.673 | 0.528 | 0.640 | N/A | N/A | N/A |
| Granite 3.3 8B Instruct | 2025-04-16 | N/A | N/A | N/A | 0.655 | N/A | N/A | 0.897 | N/A | N/A |
| Ministral 8B Instruct | 2024-10-16 | N/A | N/A | N/A | 0.650 | N/A | 0.545 | 0.348 | N/A | N/A |
| Gemma 3n E4B Instructed LiteRT Preview | 2025-05-20 | N/A | N/A | 0.237 | 0.649 | 0.506 | N/A | 0.750 | N/A | 0.132 |
| Gemma 3n E4B Instructed | 2025-06-26 | N/A | N/A | 0.237 | 0.649 | 0.506 | N/A | 0.750 | N/A | 0.132 |
| Granite 3.3 8B Base | 2025-04-16 | N/A | N/A | N/A | 0.639 | N/A | N/A | 0.897 | N/A | N/A |
| Llama 3.2 3B Instruct | 2024-09-25 | N/A | N/A | 0.328 | 0.634 | N/A | 0.480 | N/A | N/A | N/A |
| IBM Granite 4.0 Tiny Preview | 2025-05-02 | N/A | N/A | N/A | 0.604 | N/A | N/A | 0.824 | N/A | N/A |
| Gemma 3n E2B Instructed LiteRT (Preview) | 2025-05-20 | N/A | N/A | 0.248 | 0.601 | 0.405 | N/A | 0.665 | N/A | 0.132 |
| Gemma 3n E2B Instructed | 2025-06-26 | N/A | N/A | 0.248 | 0.601 | 0.405 | N/A | 0.665 | N/A | 0.132 |
| Kimi K2-Instruct-0905 | 2025-09-05 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| Gemma 3n E4B | 2025-06-26 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| Gemma 3 12B | 2025-03-12 | N/A | N/A | 0.409 | N/A | 0.606 | 0.838 | 0.854 | N/A | 0.246 |
| Gemini 2.5 Pro | 2025-05-20 | N/A | N/A | 0.830 | N/A | N/A | N/A | N/A | 0.796 | N/A |
| Gemini 2.0 Flash-Lite | 2025-02-05 | N/A | N/A | 0.515 | N/A | 0.716 | 0.868 | N/A | 0.680 | N/A |
| Gemini 2.5 Flash-Lite | 2025-06-17 | N/A | N/A | 0.646 | N/A | N/A | N/A | N/A | 0.729 | 0.337 |
| Gemini 2.5 Pro Preview 06-05 | 2025-06-05 | N/A | N/A | 0.864 | N/A | N/A | N/A | N/A | 0.820 | 0.690 |
| Gemini 2.5 Flash | 2025-05-20 | N/A | N/A | 0.828 | N/A | N/A | N/A | N/A | 0.797 | N/A |
| Gemini 2.0 Flash Thinking | 2025-01-21 | N/A | N/A | 0.742 | N/A | N/A | N/A | N/A | 0.754 | N/A |
| Gemma 3n E2B | 2025-06-26 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| MedGemma 4B IT | 2025-05-20 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| Gemma 3 4B | 2025-03-12 | N/A | N/A | 0.308 | N/A | 0.436 | 0.756 | 0.713 | N/A | 0.126 |
| Gemma 3 27B | 2025-03-12 | N/A | N/A | 0.424 | N/A | 0.675 | 0.890 | 0.878 | N/A | 0.297 |
| Gemma 3 1B | 2025-03-12 | N/A | N/A | 0.192 | N/A | 0.147 | 0.480 | 0.415 | N/A | 0.019 |
| Gemini 1.5 Flash 8B | 2024-03-15 | N/A | N/A | 0.384 | N/A | 0.587 | 0.587 | N/A | 0.537 | N/A |
| Gemini Diffusion | 2025-05-20 | N/A | N/A | 0.404 | N/A | N/A | N/A | 0.896 | N/A | 0.309 |
| Gemini 2.0 Flash | 2024-12-01 | N/A | N/A | 0.621 | N/A | 0.764 | 0.897 | N/A | 0.707 | 0.351 |
| Phi 4 Mini Reasoning | 2025-04-30 | N/A | N/A | 0.520 | N/A | N/A | N/A | N/A | N/A | N/A |
| Phi-3.5-vision-instruct | 2024-08-23 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.430 | N/A |
| Phi 4 Reasoning Plus | 2025-04-30 | N/A | N/A | 0.689 | N/A | 0.760 | N/A | N/A | N/A | 0.531 |
| Phi-4-multimodal-instruct | 2025-02-01 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.551 | N/A |
| Phi 4 Reasoning | 2025-04-30 | N/A | N/A | 0.658 | N/A | 0.743 | N/A | N/A | N/A | 0.538 |
| Qwen3-235B-A22B-Instruct-2507 | 2025-07-22 | N/A | N/A | 0.775 | N/A | 0.830 | N/A | N/A | N/A | N/A |
| QwQ-32B | 2025-03-05 | N/A | N/A | 0.652 | N/A | N/A | N/A | N/A | N/A | 0.634 |
| Qwen3-235B-A22B-Thinking-2507 | 2025-07-25 | N/A | N/A | 0.811 | N/A | 0.844 | N/A | N/A | N/A | N/A |
| QwQ-32B-Preview | 2024-11-28 | N/A | N/A | 0.652 | N/A | N/A | N/A | N/A | N/A | 0.500 |
| Qwen3-Next-80B-A3B-Thinking | 2025-09-10 | N/A | N/A | 0.772 | N/A | 0.827 | N/A | N/A | N/A | N/A |
| Qwen2-VL-72B-Instruct | 2024-08-29 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| Qwen3 32B | 2025-04-29 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.657 |
| Qwen2.5 72B Instruct | 2024-09-19 | N/A | N/A | 0.490 | N/A | 0.711 | 0.831 | 0.866 | N/A | 0.555 |
| Qwen3 30B A3B | 2025-04-29 | N/A | N/A | 0.658 | N/A | N/A | N/A | N/A | N/A | 0.626 |
| Qwen2.5 VL 7B Instruct | 2025-01-26 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.586 | N/A |
| Qwen3-Next-80B-A3B-Base | 2025-09-10 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| QvQ-72B-Preview | 2024-12-25 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.703 | N/A |
| Qwen2.5-Omni-7B | 2025-03-27 | N/A | N/A | 0.308 | N/A | 0.470 | 0.715 | 0.787 | 0.592 | N/A |
| Qwen2.5 7B Instruct | 2024-09-19 | N/A | N/A | 0.364 | N/A | 0.563 | 0.755 | 0.848 | N/A | 0.287 |
| Qwen3-Next-80B-A3B-Instruct | 2025-09-10 | N/A | N/A | 0.729 | N/A | 0.806 | N/A | N/A | N/A | N/A |
| Qwen2.5 VL 72B Instruct | 2025-01-26 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.702 | N/A |
| DeepSeek-R1-0528 | 2025-05-28 | N/A | N/A | N/A | N/A | 0.850 | N/A | N/A | N/A | 0.733 |
| DeepSeek VL2 | 2024-12-13 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.511 | N/A |
| DeepSeek VL2 Tiny | 2024-12-13 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.407 | N/A |
| DeepSeek R1 Zero | 2025-01-20 | N/A | N/A | 0.733 | N/A | N/A | N/A | N/A | N/A | 0.500 |
| DeepSeek VL2 Small | 2024-12-13 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.480 | N/A |
| DeepSeek R1 Distill Qwen 7B | 2025-01-20 | N/A | N/A | 0.491 | N/A | N/A | N/A | N/A | N/A | 0.376 |
| DeepSeek R1 Distill Qwen 1.5B | 2025-01-20 | N/A | N/A | 0.338 | N/A | N/A | N/A | N/A | N/A | 0.169 |
| DeepSeek-R1 | 2025-01-20 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| DeepSeek R1 Distill Llama 8B | 2025-01-20 | N/A | N/A | 0.490 | N/A | N/A | N/A | N/A | N/A | 0.396 |
| DeepSeek R1 Distill Llama 70B | 2025-01-20 | N/A | N/A | 0.652 | N/A | N/A | N/A | N/A | N/A | 0.575 |
| DeepSeek R1 Distill Qwen 14B | 2025-01-20 | N/A | N/A | 0.591 | N/A | N/A | N/A | N/A | N/A | 0.531 |
| DeepSeek R1 Distill Qwen 32B | 2025-01-20 | N/A | N/A | 0.621 | N/A | N/A | N/A | N/A | N/A | 0.572 |
| DeepSeek-V3.1 | 2025-01-10 | N/A | N/A | N/A | N/A | 0.837 | N/A | N/A | N/A | 0.564 |
| DeepSeek-V3.2-Exp | 2025-09-29 | N/A | N/A | N/A | N/A | 0.850 | N/A | N/A | N/A | 0.741 |
| DeepSeek-V3 0324 | 2025-03-25 | N/A | N/A | 0.684 | N/A | 0.812 | N/A | N/A | N/A | 0.492 |
| Grok-3 Mini | 2025-02-17 | N/A | N/A | 0.840 | N/A | N/A | N/A | N/A | N/A | 0.804 |
| Grok-4 Heavy | 2025-07-09 | N/A | N/A | 0.884 | N/A | N/A | N/A | N/A | N/A | 0.794 |
| Grok-4 | 2025-07-09 | N/A | N/A | 0.875 | N/A | N/A | N/A | N/A | N/A | 0.790 |
| Grok-3 | 2025-02-17 | N/A | N/A | 0.846 | N/A | N/A | N/A | N/A | 0.780 | 0.794 |
| Grok-1.5V | 2024-04-12 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.536 | N/A |
| GLM-4.5V | 2025-08-11 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| GLM-4.5-Air | 2025-07-28 | N/A | N/A | 0.750 | N/A | 0.814 | N/A | N/A | N/A | 0.707 |
| GLM-4.5 | 2025-07-28 | N/A | N/A | 0.791 | N/A | 0.846 | N/A | N/A | N/A | 0.729 |
| Llama-3.3 Nemotron Super 49B v1 | 2025-03-18 | N/A | N/A | 0.667 | N/A | N/A | N/A | N/A | N/A | N/A |
| Llama 3.1 Nemotron Nano 8B V1 | 2025-03-18 | N/A | N/A | 0.541 | N/A | N/A | N/A | N/A | N/A | N/A |
| Llama 3.1 Nemotron Ultra 253B v1 | 2025-04-07 | N/A | N/A | 0.760 | N/A | N/A | N/A | N/A | N/A | 0.663 |
| Claude Opus 4.1 | 2025-08-05 | N/A | N/A | 0.809 | N/A | N/A | N/A | N/A | N/A | N/A |
| Claude Sonnet 4.5 | 2025-09-29 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| Claude 3.5 Haiku | 2024-10-22 | N/A | N/A | 0.416 | N/A | 0.650 | 0.694 | 0.881 | N/A | N/A |
| Claude 3.7 Sonnet | 2025-02-24 | N/A | N/A | 0.848 | N/A | N/A | N/A | N/A | 0.750 | N/A |
| Claude Sonnet 4 | 2025-05-22 | N/A | N/A | 0.754 | N/A | N/A | N/A | N/A | 0.744 | N/A |
| Claude Opus 4 | 2025-05-22 | N/A | N/A | 0.796 | N/A | N/A | N/A | N/A | N/A | N/A |
| Magistral Small 2506 | 2025-06-10 | N/A | N/A | 0.682 | N/A | N/A | N/A | N/A | N/A | 0.513 |
| Magistral Medium | 2025-06-10 | N/A | N/A | 0.708 | N/A | N/A | N/A | N/A | N/A | 0.503 |
| Devstral Medium | 2025-07-10 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| Pixtral Large | 2024-11-18 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.640 | N/A |
| Mistral Small 3 24B Instruct | 2025-01-30 | N/A | N/A | 0.453 | N/A | 0.663 | 0.706 | 0.848 | N/A | N/A |
| Devstral Small 1.1 | 2025-07-11 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| Codestral-22B | 2024-05-29 | N/A | N/A | N/A | N/A | N/A | N/A | 0.811 | N/A | N/A |
| Mistral Small | 2024-09-17 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| GPT OSS 120B | 2025-08-05 | N/A | N/A | 0.801 | N/A | N/A | N/A | N/A | N/A | N/A |
| o3 | 2025-04-16 | N/A | N/A | 0.833 | N/A | N/A | N/A | N/A | 0.829 | N/A |
| GPT OSS 20B | 2025-08-05 | N/A | N/A | 0.715 | N/A | N/A | N/A | N/A | N/A | N/A |
| o4-mini | 2025-04-16 | N/A | N/A | 0.814 | N/A | N/A | N/A | N/A | 0.816 | N/A |
| o3-pro | 2025-06-10 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| o1-pro | 2024-12-17 | N/A | N/A | 0.790 | N/A | N/A | N/A | N/A | N/A | N/A |
| GPT-5 nano | 2025-08-07 | N/A | N/A | 0.712 | N/A | N/A | N/A | N/A | N/A | N/A |
| GPT-5 mini | 2025-08-07 | N/A | N/A | 0.823 | N/A | N/A | N/A | N/A | N/A | N/A |
| GPT-5 Codex | 2025-09-15 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
<div align="center">
Built with 💙 by the AI community, for the AI community.<br>
Star this repo if you find it useful!
</div>
================================================
FILE: data/.github/CODEOWNERS
================================================
* @JonathanChavezTamales @sebastiancrossa
================================================
FILE: data/benchmarks/aa-index.json
================================================
{
"benchmark_id": "aa-index",
"name": "AA-Index",
"parent_benchmark_id": null,
"categories": ["general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "No official academic documentation found for this benchmark. Extensive research through ArXiv, IEEE/ACL/NeurIPS papers, and university research sites yielded no peer-reviewed sources for an 'aa-index' benchmark. This entry requires verification from official academic sources.",
"paper_link": null,
"implementation_link": null,
"verified": false,
"created_at": "2025-07-28T00:00:00.000000+00:00",
"updated_at": "2025-07-28T00:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/acebench.json
================================================
{
"benchmark_id": "acebench",
"name": "ACEBench",
"parent_benchmark_id": null,
"categories": ["general", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "ACEBench is a comprehensive benchmark for evaluating Large Language Models' tool usage capabilities across three primary evaluation types: Normal (basic tool usage scenarios), Special (tool usage with ambiguous or incomplete instructions), and Agent (multi-agent interactions simulating real-world dialogues). The benchmark covers 4,538 APIs across 8 major domains and 68 sub-domains including technology, finance, entertainment, society, health, culture, and environment, supporting both English and Chinese languages.",
"paper_link": "https://arxiv.org/abs/2501.12851",
"implementation_link": "https://github.com/ACEBench/ACEBench",
"verified": false,
"created_at": "2025-09-05T00:00:00.000000+00:00",
"updated_at": "2025-09-30T00:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/activitynet.json
================================================
{
"benchmark_id": "activitynet",
"name": "ActivityNet",
"parent_benchmark_id": null,
"categories": ["vision", "video"],
"modality": "video",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A large-scale video benchmark for human activity understanding. Provides samples from 203 activity classes with an average of 137 untrimmed videos per class and 1.41 activity instances per video, for a total of 849 video hours. The benchmark covers a wide range of complex human activities that are of interest to people in their daily living and can be used to compare algorithms for three scenarios: untrimmed video classification, trimmed activity classification, and activity detection.",
"paper_link": "https://openaccess.thecvf.com/content_cvpr_2015/html/Heilbron_ActivityNet_A_Large-Scale_2015_CVPR_paper.html",
"implementation_link": "https://github.com/activitynet/ActivityNet",
"verified": false,
"created_at": "2025-07-19T19:56:15.378371+00:00",
"updated_at": "2025-07-19T19:56:15.378371+00:00"
}
================================================
FILE: data/benchmarks/agieval.json
================================================
{
"benchmark_id": "agieval",
"name": "AGIEval",
"parent_benchmark_id": null,
"categories": ["reasoning", "general", "math"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A human-centric benchmark for evaluating foundation models on standardized exams including college entrance exams (Gaokao, SAT), law school admission tests (LSAT), math competitions, lawyer qualification tests, and civil service exams. Contains 20 tasks (18 multiple-choice, 2 cloze) designed to assess understanding, knowledge, reasoning, and calculation abilities in real-world academic and professional contexts.",
"paper_link": "https://arxiv.org/abs/2304.06364",
"implementation_link": "https://github.com/ruixiangcui/AGIEval",
"verified": false,
"created_at": "2025-07-19T19:56:13.970928+00:00",
"updated_at": "2025-07-19T19:56:13.970928+00:00"
}
================================================
FILE: data/benchmarks/ai2-reasoning-challenge-(arc).json
================================================
{
"benchmark_id": "ai2-reasoning-challenge-(arc)",
"name": "AI2 Reasoning Challenge (ARC)",
"parent_benchmark_id": null,
"categories": ["reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A dataset of 7,787 genuine grade-school level, multiple-choice science questions assembled to encourage research in advanced question-answering. The dataset is partitioned into a Challenge Set and Easy Set, where the Challenge Set contains only questions answered incorrectly by both retrieval-based and word co-occurrence algorithms. Covers multiple scientific domains including biology, physics, earth science, and chemistry, requiring scientific reasoning, causal understanding, and conceptual knowledge beyond simple fact retrieval. Includes a supporting corpus of over 14 million science sentences.",
"paper_link": "https://arxiv.org/abs/1803.05457",
"implementation_link": "https://github.com/allenai/ARC-Solvers",
"verified": false,
"created_at": "2025-07-19T19:56:15.419158+00:00",
"updated_at": "2025-07-19T19:56:15.419158+00:00"
}
================================================
FILE: data/benchmarks/ai2d.json
================================================
{
"benchmark_id": "ai2d",
"name": "AI2D",
"parent_benchmark_id": null,
"categories": ["vision", "reasoning", "multimodal"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "AI2D is a dataset of 4,903 illustrative diagrams from grade school natural sciences (such as food webs, human physiology, and life cycles) with over 15,000 multiple choice questions and answers. The benchmark evaluates diagram understanding and visual reasoning capabilities, requiring models to interpret diagrammatic elements, relationships, and structure to answer questions about scientific concepts represented in visual form.",
"paper_link": "https://arxiv.org/abs/1603.07396",
"implementation_link": "https://allenai.org/data/diagrams",
"verified": false,
"created_at": "2025-07-19T19:56:13.618926+00:00",
"updated_at": "2025-07-19T19:56:13.618926+00:00"
}
================================================
FILE: data/benchmarks/aider-polyglot-edit.json
================================================
{
"benchmark_id": "aider-polyglot-edit",
"name": "Aider-Polyglot Edit",
"parent_benchmark_id": null,
"categories": ["general", "code"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A challenging multi-language coding benchmark that evaluates models' code editing abilities across C++, Go, Java, JavaScript, Python, and Rust. Contains 225 of Exercism's most difficult programming problems, selected as problems that were solved by 3 or fewer out of 7 top coding models. The benchmark focuses on code editing tasks and measures both correctness of solutions and proper edit format usage. Designed to re-calibrate evaluation scales so top models score between 5% and 50%.",
"paper_link": null,
"implementation_link": "https://github.com/Aider-AI/polyglot-benchmark",
"verified": false,
"created_at": "2025-07-19T19:56:13.789839+00:00",
"updated_at": "2025-09-30T00:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/aider-polyglot.json
================================================
{
"benchmark_id": "aider-polyglot",
"name": "Aider-Polyglot",
"parent_benchmark_id": null,
"categories": ["general", "code"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A coding benchmark that evaluates LLMs on 225 challenging Exercism programming exercises across C++, Go, Java, JavaScript, Python, and Rust. Models receive two attempts to solve each problem, with test error feedback provided after the first attempt if it fails. The benchmark measures both initial problem-solving ability and capacity to edit code based on error feedback, providing an end-to-end evaluation of code generation and editing capabilities across multiple programming languages.",
"paper_link": null,
"implementation_link": "https://github.com/Aider-AI/polyglot-benchmark",
"verified": false,
"created_at": "2025-09-05T00:00:00.000000+00:00",
"updated_at": "2025-09-30T00:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/aider.json
================================================
{
"benchmark_id": "aider",
"name": "Aider",
"parent_benchmark_id": null,
"categories": ["reasoning", "code"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Aider is a comprehensive code editing benchmark based on 133 practice exercises from Exercism's Python repository, designed to evaluate AI models' ability to translate natural language coding requests into executable code that passes unit tests. The benchmark measures end-to-end code editing capabilities, including the model's ability to edit existing code and format code changes for automated saving to local files. The Aider Polyglot variant extends this evaluation across 225 challenging exercises spanning C++, Go, Java, JavaScript, Python, and Rust, making it a standard benchmark for assessing multilingual code editing performance in AI research.",
"paper_link": null,
"implementation_link": "https://github.com/Aider-AI/aider",
"verified": false,
"created_at": "2025-07-19T19:56:14.566857+00:00",
"updated_at": "2025-07-19T19:56:14.566857+00:00"
}
================================================
FILE: data/benchmarks/aime-2024.json
================================================
{
"benchmark_id": "aime-2024",
"name": "AIME 2024",
"parent_benchmark_id": null,
"categories": ["math", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "American Invitational Mathematics Examination 2024, consisting of 30 challenging mathematical reasoning problems from the AIME I and AIME II competitions. Each problem requires an integer answer between 0 and 999 and tests advanced mathematical reasoning across algebra, geometry, combinatorics, and number theory. Used as a benchmark for evaluating mathematical reasoning capabilities in large language models at Olympiad-level difficulty.",
"paper_link": "https://arxiv.org/html/2503.21380v2",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:11.941652+00:00",
"updated_at": "2025-09-30T00:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/aime-2025.json
================================================
{
"benchmark_id": "aime-2025",
"name": "AIME 2025",
"parent_benchmark_id": null,
"categories": ["math", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "All 30 problems from the 2025 American Invitational Mathematics Examination (AIME I and AIME II), testing olympiad-level mathematical reasoning with integer answers from 000-999. Used as an AI benchmark to evaluate large language models' ability to solve complex mathematical problems requiring multi-step logical deductions and structured symbolic reasoning.",
"paper_link": "https://arxiv.org/abs/2503.21380",
"implementation_link": null,
"verified": false,
"created_at": "2025-09-05T00:00:00.000000+00:00",
"updated_at": "2025-09-05T00:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/aime.json
================================================
{
"benchmark_id": "aime",
"name": "AIME",
"parent_benchmark_id": null,
"categories": ["math", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "American Invitational Mathematics Examination (AIME) benchmark for evaluating mathematical reasoning capabilities of large language models. Contains 30 challenging mathematical problems from the AIME 2024 competition that require multi-step reasoning and advanced mathematical insight. Each problem has an integer answer between 000 and 999.",
"paper_link": "https://arxiv.org/html/2503.21380v2",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.057279+00:00",
"updated_at": "2025-07-19T19:56:14.057279+00:00"
}
================================================
FILE: data/benchmarks/aitz-em.json
================================================
{
"benchmark_id": "aitz-em",
"name": "AITZ_EM",
"parent_benchmark_id": null,
"categories": ["multimodal", "reasoning"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Android-In-The-Zoo (AitZ) benchmark for evaluating autonomous GUI agents on smartphones. Contains 18,643 screen-action pairs with chain-of-action-thought annotations spanning over 70 Android apps. Designed to connect perception (screen layouts and UI elements) with cognition (action decision-making) for natural language-triggered smartphone task completion.",
"paper_link": "https://arxiv.org/abs/2403.02713",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.785085+00:00",
"updated_at": "2025-07-19T19:56:14.785085+00:00"
}
================================================
FILE: data/benchmarks/alignbench.json
================================================
{
"benchmark_id": "alignbench",
"name": "AlignBench",
"parent_benchmark_id": null,
"categories": ["general", "language", "math", "reasoning", "roleplay"],
"modality": "text",
"multilingual": true,
"max_score": 1.0,
"language": "en",
"description": "AlignBench is a comprehensive multi-dimensional benchmark for evaluating Chinese alignment of Large Language Models. It contains 8 main categories: Fundamental Language Ability, Advanced Chinese Understanding, Open-ended Questions, Writing Ability, Logical Reasoning, Mathematics, Task-oriented Role Play, and Professional Knowledge. The benchmark includes 683 real-scenario rooted queries with human-verified references and uses a rule-calibrated multi-dimensional LLM-as-Judge approach with Chain-of-Thought for evaluation.",
"paper_link": "https://arxiv.org/abs/2311.18743",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.542033+00:00",
"updated_at": "2025-07-19T19:56:14.542033+00:00"
}
================================================
FILE: data/benchmarks/alpacaeval-2.0.json
================================================
{
"benchmark_id": "alpacaeval-2.0",
"name": "AlpacaEval 2.0",
"parent_benchmark_id": null,
"categories": ["general", "creativity", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "AlpacaEval 2.0 is a length-controlled automatic evaluator for instruction-following language models that uses GPT-4 Turbo to assess model responses against a baseline. It evaluates models on 805 diverse instruction-following tasks including creative writing, classification, programming, and general knowledge questions. The benchmark achieves 0.98 Spearman correlation with Chatbot Arena while being fast (< 3 minutes) and affordable (< $10 in OpenAI credits). It addresses length bias in automatic evaluation through length-controlled win-rates and uses weighted scoring based on response quality.",
"paper_link": "https://arxiv.org/abs/2404.04475",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.038178+00:00",
"updated_at": "2025-07-19T19:56:15.038178+00:00"
}
================================================
FILE: data/benchmarks/amc-2022-23.json
================================================
{
"benchmark_id": "amc-2022-23",
"name": "AMC_2022_23",
"parent_benchmark_id": null,
"categories": ["math", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "American Mathematics Competition problems from the 2022-23 academic year, consisting of multiple-choice mathematics competition problems designed for high school students. These problems require advanced mathematical reasoning, problem-solving strategies, and mathematical knowledge covering topics like algebra, geometry, number theory, and combinatorics. The benchmark is derived from the official AMC competitions sponsored by the Mathematical Association of America.",
"paper_link": "https://arxiv.org/abs/2103.03874",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.992903+00:00",
"updated_at": "2025-07-19T19:56:13.992903+00:00"
}
================================================
FILE: data/benchmarks/android-control-high-em.json
================================================
{
"benchmark_id": "android-control-high-em",
"name": "Android Control High_EM",
"parent_benchmark_id": null,
"categories": ["multimodal", "reasoning"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Android device control benchmark using a high exact match evaluation metric for assessing agent performance on mobile interface tasks.",
"paper_link": null,
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.792498+00:00",
"updated_at": "2025-07-19T19:56:14.792498+00:00"
}
================================================
FILE: data/benchmarks/android-control-low-em.json
================================================
{
"benchmark_id": "android-control-low-em",
"name": "Android Control Low_EM",
"parent_benchmark_id": null,
"categories": ["multimodal", "reasoning"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "An Android control benchmark evaluating autonomous agents on mobile device interaction tasks with low exact match scoring criteria.",
"paper_link": null,
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.800337+00:00",
"updated_at": "2025-07-19T19:56:14.800337+00:00"
}
================================================
FILE: data/benchmarks/androidworld-sr.json
================================================
{
"benchmark_id": "androidworld-sr",
"name": "AndroidWorld_SR",
"parent_benchmark_id": null,
"categories": ["general", "multimodal", "reasoning"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "AndroidWorld Success Rate (SR) benchmark: a dynamic benchmarking environment for autonomous agents operating on Android devices. Evaluates agents on 116 programmatic tasks across 20 real-world Android apps using multimodal inputs (screen screenshots, accessibility trees, and natural language instructions). Measures success rate of agents completing tasks like sending messages, creating calendar events, and navigating mobile interfaces. Published at ICLR 2025. Best current performance: 30.6% success rate (M3A agent) vs. 80.0% human performance.",
"paper_link": "https://arxiv.org/abs/2405.14573",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.808659+00:00",
"updated_at": "2025-07-19T19:56:14.808659+00:00"
}
================================================
FILE: data/benchmarks/api-bank.json
================================================
{
"benchmark_id": "api-bank",
"name": "API-Bank",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A comprehensive benchmark for tool-augmented LLMs that evaluates API planning, retrieval, and calling capabilities. Contains 314 tool-use dialogues with 753 API calls across 73 API tools, designed to assess how effectively LLMs can utilize external tools and overcome obstacles in tool leveraging.",
"paper_link": "https://arxiv.org/abs/2304.08244",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.374447+00:00",
"updated_at": "2025-07-19T19:56:14.374447+00:00"
}
================================================
FILE: data/benchmarks/arc-agi-v2.json
================================================
{
"benchmark_id": "arc-agi-v2",
"name": "ARC-AGI v2",
"parent_benchmark_id": null,
"categories": ["reasoning", "vision", "spatial_reasoning"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "ARC-AGI-2 is an upgraded benchmark for measuring abstract reasoning and problem-solving abilities in AI systems through visual grid transformation tasks. It evaluates fluid intelligence via input-output grid pairs (1x1 to 30x30) using colored cells (0-9), requiring models to identify underlying transformation rules from demonstration examples and apply them to test cases. Designed to be easy for humans but challenging for AI, focusing on core cognitive abilities like spatial reasoning, pattern recognition, and compositional generalization.",
"paper_link": "https://arxiv.org/abs/2505.11831",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.916360+00:00",
"updated_at": "2025-07-19T19:56:13.916360+00:00"
}
================================================
FILE: data/benchmarks/arc-agi.json
================================================
{
"benchmark_id": "arc-agi",
"name": "ARC-AGI",
"parent_benchmark_id": null,
"categories": ["reasoning", "vision", "spatial_reasoning"],
"modality": "image",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "The Abstraction and Reasoning Corpus for Artificial General Intelligence (ARC-AGI) is a benchmark designed to test general intelligence and abstract reasoning capabilities through visual grid-based transformation tasks. Each task consists of 2-5 demonstration pairs showing input grids transformed into output grids according to underlying rules, with test-takers required to infer these rules and apply them to novel test inputs. The benchmark uses colored grids (up to 30x30) with 10 discrete colors/symbols, designed to measure human-like general fluid intelligence and skill-acquisition efficiency with minimal prior knowledge.",
"paper_link": "https://arxiv.org/abs/1911.01547",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.187761+00:00",
"updated_at": "2025-07-19T19:56:15.187761+00:00"
}
================================================
FILE: data/benchmarks/arc-c.json
================================================
{
"benchmark_id": "arc-c",
"name": "ARC-C",
"parent_benchmark_id": null,
"categories": ["reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "The AI2 Reasoning Challenge (ARC) Challenge Set is a multiple-choice question-answering benchmark containing grade-school level science questions that require advanced reasoning capabilities. ARC-C specifically contains questions that were answered incorrectly by both retrieval-based and word co-occurrence algorithms, making it a particularly challenging subset designed to test commonsense reasoning abilities in AI systems.",
"paper_link": "https://arxiv.org/abs/1803.05457",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:11.052939+00:00",
"updated_at": "2025-07-19T19:56:11.052939+00:00"
}
================================================
FILE: data/benchmarks/arc-e.json
================================================
{
"benchmark_id": "arc-e",
"name": "ARC-E",
"parent_benchmark_id": null,
"categories": ["reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "ARC-E (AI2 Reasoning Challenge - Easy Set) is a subset of grade-school level, multiple-choice science questions that requires knowledge and reasoning capabilities. Part of the AI2 Reasoning Challenge dataset containing 5,197 questions that test scientific reasoning and factual knowledge. The Easy Set contains questions that are answerable by retrieval-based and word co-occurrence algorithms, making them more accessible than the Challenge Set.",
"paper_link": "https://arxiv.org/abs/1803.05457",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.192662+00:00",
"updated_at": "2025-07-19T19:56:13.192662+00:00"
}
================================================
FILE: data/benchmarks/arc.json
================================================
{
"benchmark_id": "arc",
"name": "Arc",
"parent_benchmark_id": null,
"categories": ["reasoning", "general"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "The Abstraction and Reasoning Corpus (ARC) is a benchmark designed to measure human-like general fluid intelligence through grid-based reasoning tasks. It consists of 800 tasks (400 training, 400 evaluation) where each task presents input-output grids that require understanding abstract patterns and transformations. Test-takers must produce exactly correct output grids for all test inputs in a task to solve it, with 3 trials allowed per test input. ARC aims to enable fair comparisons of general intelligence between AI systems and humans using priors designed to be as close as possible to innate human priors.",
"paper_link": "https://arxiv.org/abs/1911.01547",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.967150+00:00",
"updated_at": "2025-07-19T19:56:13.967150+00:00"
}
================================================
FILE: data/benchmarks/arena-hard-v2.json
================================================
{
"benchmark_id": "arena-hard-v2",
"name": "Arena-Hard v2",
"parent_benchmark_id": null,
"categories": ["general", "reasoning", "creativity"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Arena-Hard-Auto v2 is a challenging benchmark consisting of 500 carefully curated prompts sourced from Chatbot Arena and WildChat-1M, designed to evaluate large language models on real-world user queries. The benchmark covers diverse domains including open-ended software engineering problems, mathematics, creative writing, and technical problem-solving. It uses LLM-as-a-Judge for automatic evaluation, achieving 98.6% correlation with human preference rankings while providing 3x higher separation of model performances compared to MT-Bench. The benchmark emphasizes prompt specificity, complexity, and domain knowledge to better distinguish between model capabilities.",
"paper_link": "https://arxiv.org/abs/2406.11939",
"implementation_link": null,
"verified": false,
"created_at": "2025-08-03T22:06:11.411643+00:00",
"updated_at": "2025-08-03T22:06:11.411643+00:00"
}
================================================
FILE: data/benchmarks/arena-hard.json
================================================
{
"benchmark_id": "arena-hard",
"name": "Arena Hard",
"parent_benchmark_id": null,
"categories": ["general", "reasoning", "creativity"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Arena-Hard-Auto is an automatic evaluation benchmark for instruction-tuned LLMs consisting of 500 challenging real-world prompts curated by BenchBuilder. It includes open-ended software engineering problems, mathematical questions, and creative writing tasks. The benchmark uses LLM-as-a-Judge methodology with GPT-4.1 and Gemini-2.5 as automatic judges to approximate human preference. Arena-Hard achieves 98.6% correlation with human preference rankings and provides 3x higher separation of model performances compared to MT-Bench, making it highly effective for distinguishing between models of similar quality.",
"paper_link": "https://arxiv.org/abs/2406.11939",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.079874+00:00",
"updated_at": "2025-07-19T19:56:14.079874+00:00"
}
================================================
FILE: data/benchmarks/attaq.json
================================================
{
"benchmark_id": "attaq",
"name": "AttaQ",
"parent_benchmark_id": null,
"categories": ["safety"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "AttaQ is a unique dataset containing adversarial examples in the form of questions designed to provoke harmful or inappropriate responses from large language models. The benchmark evaluates safety vulnerabilities by using specialized clustering techniques that analyze both the semantic similarity of input attacks and the harmfulness of model responses, facilitating targeted improvements to model safety mechanisms.",
"paper_link": "https://arxiv.org/abs/2311.04124",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.079764+00:00",
"updated_at": "2025-07-19T19:56:15.079764+00:00"
}
================================================
FILE: data/benchmarks/autologi.json
================================================
{
"benchmark_id": "autologi",
"name": "AutoLogi",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": true,
"max_score": 1.0,
"language": "en",
"description": "AutoLogi is an automated method for synthesizing open-ended logic puzzles to evaluate reasoning abilities of Large Language Models. The benchmark addresses limitations of existing multiple-choice reasoning evaluations by featuring program-based verification and controllable difficulty levels. It includes 1,575 English and 883 Chinese puzzles, enabling more reliable evaluation that better distinguishes models' reasoning capabilities across languages.",
"paper_link": "https://arxiv.org/abs/2502.16906",
"implementation_link": null,
"verified": false,
"created_at": "2025-09-05T00:00:00.000000+00:00",
"updated_at": "2025-09-05T00:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/bbh.json
================================================
{
"benchmark_id": "bbh",
"name": "BBH",
"parent_benchmark_id": null,
"categories": ["reasoning", "math", "language"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Big-Bench Hard (BBH) is a suite of 23 challenging tasks selected from BIG-Bench for which prior language model evaluations did not outperform the average human-rater. These tasks require multi-step reasoning across diverse domains including arithmetic, logical reasoning, reading comprehension, and commonsense reasoning. The benchmark was designed to test capabilities believed to be beyond current language models and focuses on evaluating complex reasoning skills including temporal understanding, spatial reasoning, causal understanding, and deductive logical reasoning.",
"paper_link": "https://arxiv.org/abs/2210.09261",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.031859+00:00",
"updated_at": "2025-07-19T19:56:13.031859+00:00"
}
================================================
FILE: data/benchmarks/bfcl-v2.json
================================================
{
"benchmark_id": "bfcl-v2",
"name": "BFCL v2",
"parent_benchmark_id": null,
"categories": ["general", "reasoning"],
"modality": "text",
"multilingual": true,
"max_score": 1.0,
"language": "en",
"description": "Berkeley Function Calling Leaderboard (BFCL) v2 is a comprehensive benchmark for evaluating large language models' function calling capabilities. It features 2,251 question-function-answer pairs with enterprise and OSS-contributed functions, addressing data contamination and bias through live, user-contributed scenarios. The benchmark evaluates AST accuracy, executable accuracy, irrelevance detection, and relevance detection across multiple programming languages (Python, Java, JavaScript) and includes complex real-world function calling scenarios with multilingual prompts.",
"paper_link": "https://arxiv.org/abs/2305.15334",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.444045+00:00",
"updated_at": "2025-07-19T19:56:14.444045+00:00"
}
================================================
FILE: data/benchmarks/bfcl-v3-multiturn.json
================================================
{
"benchmark_id": "bfcl-v3-multiturn",
"name": "BFCL_v3_MultiTurn",
"parent_benchmark_id": null,
"categories": ["general", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Berkeley Function Calling Leaderboard (BFCL) V3 MultiTurn benchmark that evaluates large language models' ability to handle multi-turn and multi-step function calling scenarios. The benchmark introduces complex interactions requiring models to manage sequential function calls, handle conversational context across multiple turns, and make dynamic decisions about when and how to use available functions. BFCL V3 uses state-based evaluation by verifying the actual state of API systems after function execution, providing more realistic assessment of function calling capabilities in agentic applications.",
"paper_link": "https://openreview.net/forum?id=2GmDdhBdDk",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.962161+00:00",
"updated_at": "2025-07-19T19:56:14.962161+00:00"
}
================================================
FILE: data/benchmarks/bfcl-v3.json
================================================
{
"benchmark_id": "bfcl-v3",
"name": "BFCL-v3",
"parent_benchmark_id": null,
"categories": ["general", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Berkeley Function Calling Leaderboard v3 (BFCL-v3) is an advanced benchmark that evaluates large language models' function calling capabilities through multi-turn and multi-step interactions. It introduces extended conversational exchanges where models must retain contextual information across turns and execute multiple internal function calls for complex user requests. The benchmark includes 1000 test cases across domains like vehicle control, trading bots, travel booking, and file system management, using state-based evaluation to verify both system state changes and execution path correctness.",
"paper_link": "https://openreview.net/forum?id=2GmDdhBdDk",
"implementation_link": null,
"verified": false,
"created_at": "2025-08-03T22:06:11.216985+00:00",
"updated_at": "2025-08-03T22:06:11.216985+00:00"
}
================================================
FILE: data/benchmarks/bfcl.json
================================================
{
"benchmark_id": "bfcl",
"name": "BFCL",
"parent_benchmark_id": null,
"categories": ["general", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "The Berkeley Function Calling Leaderboard (BFCL) is the first comprehensive and executable function call evaluation dedicated to assessing Large Language Models' ability to invoke functions. It evaluates serial and parallel function calls across multiple programming languages (Python, Java, JavaScript, REST API) using a novel Abstract Syntax Tree (AST) evaluation method. The benchmark consists of over 2,000 question-function-answer pairs covering diverse application domains and complex use cases including multiple function calls, parallel function calls, and multi-turn interactions.",
"paper_link": "https://openreview.net/pdf?id=2GmDdhBdDk",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.763704+00:00",
"updated_at": "2025-07-19T19:56:12.763704+00:00"
}
================================================
FILE: data/benchmarks/big-bench-extra-hard.json
================================================
{
"benchmark_id": "big-bench-extra-hard",
"name": "BIG-Bench Extra Hard",
"parent_benchmark_id": null,
"categories": ["reasoning", "general", "language"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "BIG-Bench Extra Hard (BBEH) is a challenging benchmark that replaces each task in BIG-Bench Hard with a novel task that probes similar reasoning capabilities but exhibits significantly increased difficulty. The benchmark contains 23 tasks testing diverse reasoning skills including many-hop reasoning, causal understanding, spatial reasoning, temporal arithmetic, geometric reasoning, linguistic reasoning, logic puzzles, and humor understanding. Designed to address saturation on existing benchmarks where state-of-the-art models achieve near-perfect scores, BBEH shows substantial room for improvement with best models achieving only 9.8-44.8% average accuracy.",
"paper_link": "https://arxiv.org/abs/2502.19187",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.279517+00:00",
"updated_at": "2025-07-19T19:56:13.279517+00:00"
}
================================================
FILE: data/benchmarks/big-bench-hard.json
================================================
{
"benchmark_id": "big-bench-hard",
"name": "BIG-Bench Hard",
"parent_benchmark_id": null,
"categories": ["reasoning", "math", "language"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "BIG-Bench Hard (BBH) is a subset of 23 challenging BIG-Bench tasks selected because, in prior evaluations, language models did not outperform the average human rater on them. The benchmark contains 6,511 evaluation examples testing various forms of multi-step reasoning including arithmetic, logical reasoning (Boolean expressions, logical deduction), geometric reasoning, temporal reasoning, and language understanding. Tasks require capabilities such as causal judgment, object counting, navigation, pattern recognition, and complex problem solving.",
"paper_link": "https://arxiv.org/abs/2210.09261",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.222809+00:00",
"updated_at": "2025-07-19T19:56:13.222809+00:00"
}
================================================
FILE: data/benchmarks/big-bench.json
================================================
{
"benchmark_id": "big-bench",
"name": "BIG-Bench",
"parent_benchmark_id": null,
"categories": ["reasoning", "math", "language"],
"modality": "text",
"multilingual": true,
"max_score": 1.0,
"language": "en",
"description": "Beyond the Imitation Game Benchmark (BIG-bench) is a collaborative benchmark consisting of 204+ tasks designed to probe large language models and extrapolate their future capabilities. It covers diverse domains including linguistics, mathematics, common-sense reasoning, biology, physics, social bias, software development, and more. The benchmark focuses on tasks believed to be beyond current language model capabilities and includes both English and non-English tasks across multiple languages.",
"paper_link": "https://arxiv.org/abs/2206.04615",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.926457+00:00",
"updated_at": "2025-07-19T19:56:13.926457+00:00"
}
================================================
FILE: data/benchmarks/bigcodebench-full.json
================================================
{
"benchmark_id": "bigcodebench-full",
"name": "BigCodeBench-Full",
"parent_benchmark_id": null,
"categories": ["general", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A comprehensive benchmark that evaluates large language models' ability to solve complex, practical programming tasks via code generation. Contains 1,140 fine-grained tasks across 7 domains using function calls from 139 libraries. Challenges LLMs to invoke multiple function calls as tools and handle complex instructions for realistic software engineering and general-purpose reasoning tasks.",
"paper_link": "https://arxiv.org/abs/2406.15877",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.508830+00:00",
"updated_at": "2025-07-19T19:56:14.508830+00:00"
}
================================================
FILE: data/benchmarks/bigcodebench-hard.json
================================================
{
"benchmark_id": "bigcodebench-hard",
"name": "BigCodeBench-Hard",
"parent_benchmark_id": null,
"categories": ["general", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "BigCodeBench-Hard is a subset of 148 challenging programming tasks from BigCodeBench, designed to evaluate large language models' ability to solve complex, real-world programming problems. These tasks require diverse function calls from multiple libraries across 7 domains including computation, networking, data analysis, and visualization. The benchmark tests compositional reasoning and the ability to implement complex instructions that span 139 libraries with an average of 2.8 libraries per task.",
"paper_link": "https://arxiv.org/abs/2406.15877",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.512684+00:00",
"updated_at": "2025-07-19T19:56:14.512684+00:00"
}
================================================
FILE: data/benchmarks/bigcodebench.json
================================================
{
"benchmark_id": "bigcodebench",
"name": "BigCodeBench",
"parent_benchmark_id": null,
"categories": ["general", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained programming tasks. Evaluates code generation with diverse function calls and complex instructions, featuring two variants: Complete (code completion based on comprehensive docstrings) and Instruct (generating code from natural language instructions).",
"paper_link": "https://arxiv.org/abs/2406.15877",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.048433+00:00",
"updated_at": "2025-07-19T19:56:14.048433+00:00"
}
================================================
FILE: data/benchmarks/bird-sql-(dev).json
================================================
{
"benchmark_id": "bird-sql-(dev)",
"name": "Bird-SQL (dev)",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "BIRD (BIg Bench for LaRge-scale Database Grounded Text-to-SQLs) is a comprehensive text-to-SQL benchmark containing 12,751 question-SQL pairs across 95 databases (33.4 GB total) spanning 37+ professional domains. It evaluates large language models' ability to convert natural language to executable SQL queries in real-world scenarios with complex database schemas and dirty data.",
"paper_link": "https://arxiv.org/abs/2305.03111",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.410905+00:00",
"updated_at": "2025-07-19T19:56:13.410905+00:00"
}
================================================
FILE: data/benchmarks/blink.json
================================================
{
"benchmark_id": "blink",
"name": "BLINK",
"parent_benchmark_id": null,
"categories": ["vision", "multimodal", "reasoning"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "BLINK, introduced in the paper 'BLINK: Multimodal Large Language Models Can See but Not Perceive', is a benchmark for multimodal language models focusing on core visual perception abilities. It reformats 14 classic computer vision tasks into 3,807 multiple-choice questions paired with single or multiple images and visual prompting. Tasks include relative depth estimation, visual correspondence, forensics detection, multi-view reasoning, counting, object localization, and spatial reasoning that humans can solve 'within a blink'.",
"paper_link": "https://arxiv.org/abs/2404.12390",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.326398+00:00",
"updated_at": "2025-07-19T19:56:14.326398+00:00"
}
================================================
FILE: data/benchmarks/boolq.json
================================================
{
"benchmark_id": "boolq",
"name": "BoolQ",
"parent_benchmark_id": null,
"categories": ["language", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "BoolQ is a reading comprehension dataset for yes/no questions containing 15,942 naturally occurring examples. Each example consists of a question, passage, and boolean answer, where questions are generated in unprompted and unconstrained settings. The dataset challenges models with complex, non-factoid information requiring entailment-like inference to solve.",
"paper_link": "https://arxiv.org/abs/1905.10044",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.117325+00:00",
"updated_at": "2025-07-19T19:56:13.117325+00:00"
}
================================================
FILE: data/benchmarks/browsecomp-long-128k.json
================================================
{
"benchmark_id": "browsecomp-long-128k",
"name": "BrowseComp Long Context 128k",
"parent_benchmark_id": "browsecomp",
"categories": ["reasoning", "search"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A challenging benchmark for evaluating web browsing agents' ability to persistently navigate the internet and find hard-to-locate, entangled information. Comprises 1,266 questions requiring strategic reasoning, creative search, and interpretation of retrieved content, with short and easily verifiable answers.",
"paper_link": "https://arxiv.org/abs/2504.12516",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-24T12:00:00.000000+00:00",
"updated_at": "2025-07-24T12:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/browsecomp-long-256k.json
================================================
{
"benchmark_id": "browsecomp-long-256k",
"name": "BrowseComp Long Context 256k",
"parent_benchmark_id": "browsecomp",
"categories": ["reasoning", "search"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "BrowseComp is a benchmark for measuring the ability of agents to browse the web, comprising 1,266 questions that require persistently navigating the internet in search of hard-to-find, entangled information. Despite the difficulty of the questions, BrowseComp is simple and easy-to-use, as predicted answers are short and easily verifiable against reference answers. The benchmark focuses on questions where answers are obscure, time-invariant, and well-supported by evidence scattered across the open web.",
"paper_link": "https://arxiv.org/abs/2504.12516",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-24T12:00:00.000000+00:00",
"updated_at": "2025-07-24T12:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/browsecomp-zh.json
================================================
{
"benchmark_id": "browsecomp-zh",
"name": "BrowseComp-zh",
"parent_benchmark_id": "browsecomp",
"categories": ["reasoning", "search"],
"modality": "text",
"multilingual": true,
"max_score": 1.0,
"language": "zh",
"description": "A high-difficulty benchmark purpose-built to comprehensively evaluate LLM agents on the Chinese web, consisting of 289 multi-hop questions spanning 11 diverse domains including Film & TV, Technology, Medicine, and History. Questions are reverse-engineered from short, objective, and easily verifiable answers, requiring sophisticated reasoning and information reconciliation beyond basic retrieval. The benchmark addresses linguistic, infrastructural, and censorship-related complexities in Chinese web environments.",
"paper_link": "https://arxiv.org/abs/2504.19314",
"implementation_link": null,
"verified": false,
"created_at": "2025-09-15T00:00:00.000000+00:00",
"updated_at": "2025-09-15T00:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/browsecomp.json
================================================
{
"benchmark_id": "browsecomp",
"name": "BrowseComp",
"parent_benchmark_id": null,
"categories": ["reasoning", "search"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "BrowseComp is a benchmark comprising 1,266 questions that challenge AI agents to persistently navigate the internet in search of hard-to-find, entangled information. The benchmark measures agents' ability to exercise persistence in information gathering, demonstrate creativity in web navigation, and find concise, verifiable answers. Despite the difficulty of the questions, BrowseComp is simple and easy-to-use, as predicted answers are short and easily verifiable against reference answers.",
"paper_link": "https://arxiv.org/abs/2504.12516",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-28T00:00:00.000000+00:00",
"updated_at": "2025-07-28T00:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/c-eval.json
================================================
{
"benchmark_id": "c-eval",
"name": "C-Eval",
"parent_benchmark_id": null,
"categories": ["general", "reasoning"],
"modality": "text",
"multilingual": true,
"max_score": 1.0,
"language": "zh",
"description": "C-Eval is a comprehensive Chinese evaluation suite designed to assess advanced knowledge and reasoning abilities of foundation models in a Chinese context. It comprises 13,948 multiple-choice questions across 52 diverse disciplines spanning humanities, science, and engineering, with four difficulty levels: middle school, high school, college, and professional. The benchmark includes C-Eval Hard, a subset of very challenging subjects requiring advanced reasoning abilities.",
"paper_link": "https://arxiv.org/abs/2305.08322",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:11.917478+00:00",
"updated_at": "2025-07-19T19:56:11.917478+00:00"
}
================================================
FILE: data/benchmarks/cbnsl.json
================================================
{
"benchmark_id": "cbnsl",
"name": "CBNSL",
"parent_benchmark_id": null,
"categories": ["math", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Curriculum Learning of Bayesian Network Structures (CBNSL) is a benchmark for evaluating algorithms that learn Bayesian network structures from data using curriculum learning techniques. The benchmark uses networks from the bnlearn repository and evaluates structure learning performance using BDeu scoring metrics.",
"paper_link": "http://proceedings.mlr.press/v45/Zhao15a.pdf",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.590999+00:00",
"updated_at": "2025-07-19T19:56:12.590999+00:00"
}
================================================
FILE: data/benchmarks/cc-ocr.json
================================================
{
"benchmark_id": "cc-ocr",
"name": "CC-OCR",
"parent_benchmark_id": null,
"categories": ["vision", "multimodal", "text-to-image"],
"modality": "multimodal",
"multilingual": true,
"max_score": 1.0,
"language": "en",
"description": "A comprehensive OCR benchmark for evaluating Large Multimodal Models (LMMs) in literacy. Comprises four OCR-centric tracks: multi-scene text reading, multilingual text reading, document parsing, and key information extraction. Contains 39 subsets with 7,058 fully annotated images, 41% sourced from real applications. Tests capabilities including text grounding, multi-orientation text recognition, and detecting hallucination/repetition across diverse visual challenges.",
"paper_link": "https://arxiv.org/abs/2412.02210",
"implementation_link": "https://github.com/AlibabaResearch/AdvancedLiterateMachinery",
"verified": false,
"created_at": "2025-07-19T19:56:14.652986+00:00",
"updated_at": "2025-07-19T19:56:14.652986+00:00"
}
================================================
FILE: data/benchmarks/cfeval.json
================================================
{
"benchmark_id": "cfeval",
"name": "CFEval",
"parent_benchmark_id": null,
"categories": ["code"],
"modality": "text",
"multilingual": false,
"max_score": 10000.0,
"language": "en",
"description": "CFEval is a benchmark for evaluating code generation and problem-solving capabilities.",
"paper_link": null,
"implementation_link": null,
"verified": false,
"created_at": "2025-09-15T00:00:00.000000+00:00",
"updated_at": "2025-09-15T00:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/charadessta.json
================================================
{
"benchmark_id": "charadessta",
"name": "CharadesSTA",
"parent_benchmark_id": null,
"categories": ["video", "language", "multimodal"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Charades-STA is a benchmark dataset for temporal activity localization via language queries, extending the Charades dataset with sentence temporal annotations. It contains 12,408 training and 3,720 testing segment-sentence pairs from videos with natural language descriptions and precise temporal boundaries for localizing activities based on language queries.",
"paper_link": "https://arxiv.org/abs/1705.02101",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.760027+00:00",
"updated_at": "2025-07-19T19:56:14.760027+00:00"
}
================================================
FILE: data/benchmarks/chartqa.json
================================================
{
"benchmark_id": "chartqa",
"name": "ChartQA",
"parent_benchmark_id": null,
"categories": ["reasoning", "vision", "multimodal"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "ChartQA is a large-scale benchmark comprising 9.6K human-written questions and 23.1K questions generated from human-written chart summaries, designed to evaluate models' abilities in visual and logical reasoning over charts.",
"paper_link": "https://arxiv.org/abs/2203.10244",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.783541+00:00",
"updated_at": "2025-07-19T19:56:12.783541+00:00"
}
================================================
FILE: data/benchmarks/charxiv-d.json
================================================
{
"benchmark_id": "charxiv-d",
"name": "CharXiv-D",
"parent_benchmark_id": null,
"categories": ["reasoning", "vision", "multimodal"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "CharXiv-D is the descriptive questions subset of the CharXiv benchmark, designed to assess multimodal large language models' ability to extract basic information from scientific charts. It contains descriptive questions covering information extraction, enumeration, pattern recognition, and counting across 2,323 diverse charts from arXiv papers, all curated and verified by human experts.",
"paper_link": "https://arxiv.org/abs/2406.18521",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.325204+00:00",
"updated_at": "2025-07-19T19:56:15.325204+00:00"
}
================================================
FILE: data/benchmarks/charxiv-r.json
================================================
{
"benchmark_id": "charxiv-r",
"name": "CharXiv-R",
"parent_benchmark_id": null,
"categories": ["reasoning", "vision", "multimodal"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "CharXiv-R is the reasoning component of the CharXiv benchmark, focusing on complex reasoning questions that require synthesizing information across visual chart elements. It evaluates multimodal large language models on their ability to understand and reason about scientific charts from arXiv papers through various reasoning tasks.",
"paper_link": "https://arxiv.org/abs/2406.18521",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.191553+00:00",
"updated_at": "2025-07-19T19:56:15.191553+00:00"
}
================================================
FILE: data/benchmarks/chexpert-cxr.json
================================================
{
"benchmark_id": "chexpert-cxr",
"name": "CheXpert CXR",
"parent_benchmark_id": null,
"categories": ["healthcare", "vision"],
"modality": "image",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "CheXpert is a large dataset of 224,316 chest radiographs from 65,240 patients for automated chest X-ray interpretation. The dataset includes uncertainty labels for 14 medical observations extracted from radiology reports. It serves as a benchmark for developing and evaluating automated chest radiograph interpretation models.",
"paper_link": "https://arxiv.org/abs/1901.07031",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.021015+00:00",
"updated_at": "2025-07-19T19:56:14.021015+00:00"
}
================================================
FILE: data/benchmarks/cluewsc.json
================================================
{
"benchmark_id": "cluewsc",
"name": "CLUEWSC",
"parent_benchmark_id": null,
"categories": ["language", "reasoning"],
"modality": "text",
"multilingual": true,
"max_score": 1.0,
"language": "zh",
"description": "CLUEWSC2020 is the Chinese version of the Winograd Schema Challenge, part of the CLUE benchmark. It focuses on pronoun disambiguation and coreference resolution, requiring models to determine which noun a pronoun refers to in a sentence. The dataset contains 1,244 training samples and 304 development samples extracted from contemporary Chinese literature.",
"paper_link": "https://arxiv.org/abs/2004.05986",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.233189+00:00",
"updated_at": "2025-07-19T19:56:12.233189+00:00"
}
================================================
FILE: data/benchmarks/cmmlu.json
================================================
{
"benchmark_id": "cmmlu",
"name": "CMMLU",
"parent_benchmark_id": null,
"categories": ["language", "reasoning", "general"],
"modality": "text",
"multilingual": true,
"max_score": 1.0,
"language": "zh",
"description": "CMMLU (Chinese Massive Multitask Language Understanding) is a comprehensive Chinese benchmark that evaluates the knowledge and reasoning capabilities of large language models across 67 different subject topics. The benchmark covers natural sciences, social sciences, engineering, and humanities with multiple-choice questions ranging from basic to advanced professional levels.",
"paper_link": "https://arxiv.org/abs/2306.09212",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.941108+00:00",
"updated_at": "2025-07-19T19:56:14.941108+00:00"
}
================================================
FILE: data/benchmarks/cnmo-2024.json
================================================
{
"benchmark_id": "cnmo-2024",
"name": "CNMO 2024",
"parent_benchmark_id": null,
"categories": ["math"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "China Mathematical Olympiad 2024 - A challenging mathematics competition.",
"paper_link": null,
"implementation_link": null,
"verified": false,
"created_at": "2025-09-05T00:00:00.000000+00:00",
"updated_at": "2025-09-05T00:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/codeforces.json
================================================
{
"benchmark_id": "codeforces",
"name": "CodeForces",
"parent_benchmark_id": null,
"categories": ["math", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 3000.0,
"language": "en",
"description": "A competitive programming benchmark using problems from the CodeForces platform. The benchmark evaluates code generation capabilities of LLMs on algorithmic problems with difficulty ratings ranging from 800 to 2400. Problems cover diverse algorithmic categories including dynamic programming, graph algorithms, data structures, and mathematical problems with standardized evaluation through direct platform submission.",
"paper_link": "https://arxiv.org/abs/2501.01257",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.624663+00:00",
"updated_at": "2025-07-19T19:56:14.624663+00:00"
}
================================================
FILE: data/benchmarks/codegolf-v2.2.json
================================================
{
"benchmark_id": "codegolf-v2.2",
"name": "Codegolf v2.2",
"parent_benchmark_id": null,
"categories": ["code"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Codegolf v2.2 benchmark",
"paper_link": null,
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.778275+00:00",
"updated_at": "2025-07-19T19:56:13.778275+00:00"
}
================================================
FILE: data/benchmarks/collie.json
================================================
{
"benchmark_id": "collie",
"name": "COLLIE",
"parent_benchmark_id": null,
"categories": ["language", "reasoning", "writing"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "COLLIE is a grammar-based framework for systematic construction of constrained text generation tasks. It allows specification of rich, compositional constraints across diverse generation levels and modeling challenges including language understanding, logical reasoning, and semantic planning. The COLLIE-v1 dataset contains 2,080 instances across 13 constraint structures.",
"paper_link": "https://arxiv.org/abs/2307.08689",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.250323+00:00",
"updated_at": "2025-07-19T19:56:15.250323+00:00"
}
================================================
FILE: data/benchmarks/common-voice-15.json
================================================
{
"benchmark_id": "common-voice-15",
"name": "Common Voice 15",
"parent_benchmark_id": null,
"categories": ["audio", "speech-to-text", "language"],
"modality": "audio",
"multilingual": true,
"max_score": 100.0,
"language": "en",
"description": "Common Voice is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Version 15.0 contains 28,750 recorded hours across 114 languages, consisting of crowdsourced voice recordings with corresponding transcriptions.",
"paper_link": "https://arxiv.org/abs/1912.06670",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.830793+00:00",
"updated_at": "2025-07-19T19:56:14.830793+00:00"
}
================================================
FILE: data/benchmarks/commonsenseqa.json
================================================
{
"benchmark_id": "commonsenseqa",
"name": "CommonSenseQA",
"parent_benchmark_id": null,
"categories": ["reasoning", "language"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "CommonSenseQA is a multiple-choice question answering dataset that requires different types of commonsense knowledge to predict correct answers. It contains 12,102 questions with one correct answer and four distractors, designed to test semantic reasoning and conceptual relationships. Questions are created based on ConceptNet concepts and require prior world knowledge for accurate reasoning.",
"paper_link": "https://arxiv.org/abs/1811.00937",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.129679+00:00",
"updated_at": "2025-07-19T19:56:15.129679+00:00"
}
================================================
FILE: data/benchmarks/complexfuncbench.json
================================================
{
"benchmark_id": "complexfuncbench",
"name": "ComplexFuncBench",
"parent_benchmark_id": null,
"categories": ["long_context", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "ComplexFuncBench is a benchmark designed to evaluate large language models' capabilities in handling complex function calling scenarios. It encompasses multi-step and constrained function calling tasks that require long-parameter filling, parameter value reasoning, and managing contexts up to 128k tokens. The benchmark includes 1,000 samples across five real-world scenarios.",
"paper_link": "https://arxiv.org/abs/2501.10132",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.336577+00:00",
"updated_at": "2025-07-19T19:56:15.336577+00:00"
}
================================================
FILE: data/benchmarks/covost2-en-zh.json
================================================
{
"benchmark_id": "covost2-en-zh",
"name": "CoVoST2 en-zh",
"parent_benchmark_id": null,
"categories": ["audio", "speech-to-text", "language"],
"modality": "audio",
"multilingual": true,
"max_score": 100.0,
"language": "en",
"description": "The CoVoST 2 English-to-Chinese subset is part of the large-scale multilingual speech translation corpus derived from Common Voice. This subset focuses specifically on English-to-Chinese speech translation tasks within the broader CoVoST 2 dataset.",
"paper_link": "https://arxiv.org/abs/2007.10310",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.825578+00:00",
"updated_at": "2025-07-19T19:56:14.825578+00:00"
}
================================================
FILE: data/benchmarks/covost2.json
================================================
{
"benchmark_id": "covost2",
"name": "CoVoST2",
"parent_benchmark_id": null,
"categories": ["audio", "speech-to-text", "language"],
"modality": "audio",
"multilingual": true,
"max_score": 1.0,
"language": "en",
"description": "CoVoST 2 is a large-scale multilingual speech translation corpus derived from Common Voice, covering translations from 21 languages into English and from English into 15 languages. The dataset contains 2,880 hours of speech with 78K speakers for speech translation research.",
"paper_link": "https://arxiv.org/abs/2007.10310",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.958237+00:00",
"updated_at": "2025-07-19T19:56:13.958237+00:00"
}
================================================
FILE: data/benchmarks/crag.json
================================================
{
"benchmark_id": "crag",
"name": "CRAG",
"parent_benchmark_id": null,
"categories": ["reasoning", "search"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "CRAG (Comprehensive RAG Benchmark) is a factual question answering benchmark consisting of 4,409 question-answer pairs across 5 domains (finance, sports, music, movie, open domain) and 8 question categories. The benchmark includes mock APIs to simulate web and Knowledge Graph search, designed to represent the diverse and dynamic nature of real-world QA tasks with temporal dynamism ranging from years to seconds. It evaluates retrieval-augmented generation systems for trustworthy question answering.",
"paper_link": "https://arxiv.org/abs/2406.04744",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.741280+00:00",
"updated_at": "2025-07-19T19:56:12.741280+00:00"
}
================================================
FILE: data/benchmarks/creative-writing-v3.json
================================================
{
"benchmark_id": "creative-writing-v3",
"name": "Creative Writing v3",
"parent_benchmark_id": null,
"categories": ["creativity", "writing"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "EQ-Bench Creative Writing v3 is an LLM-judged creative writing benchmark that evaluates models across 32 writing prompts with 3 iterations per prompt. Uses a hybrid scoring system combining rubric assessment and Elo ratings through pairwise comparisons. Challenges models in areas like humor, romance, spatial awareness, and unique perspectives to assess emotional intelligence and creative writing capabilities.",
"paper_link": "https://arxiv.org/abs/2312.06281",
"implementation_link": null,
"verified": false,
"created_at": "2025-08-03T22:06:11.157942+00:00",
"updated_at": "2025-08-03T22:06:11.157942+00:00"
}
================================================
FILE: data/benchmarks/crperelation.json
================================================
{
"benchmark_id": "crperelation",
"name": "CRPErelation",
"parent_benchmark_id": null,
"categories": ["healthcare", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Clinical reasoning problems evaluation benchmark for assessing diagnostic reasoning and medical knowledge application capabilities.",
"paper_link": null,
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.834739+00:00",
"updated_at": "2025-07-19T19:56:14.834739+00:00"
}
================================================
FILE: data/benchmarks/crux-o.json
================================================
{
"benchmark_id": "crux-o",
"name": "CRUX-O",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 100.0,
"language": "en",
"description": "CRUXEval-O (output prediction) is part of the CRUXEval benchmark consisting of 800 Python functions (3-13 lines) designed to evaluate AI models' capabilities in code reasoning, understanding, and execution. The benchmark tests models' ability to predict correct function outputs given function code and inputs, focusing on short problems that a good human programmer should be able to solve in a minute.",
"paper_link": "https://arxiv.org/abs/2401.03065",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.635245+00:00",
"updated_at": "2025-07-19T19:56:14.635245+00:00"
}
================================================
FILE: data/benchmarks/cruxeval-input-cot.json
================================================
{
"benchmark_id": "cruxeval-input-cot",
"name": "CRUXEval-Input-CoT",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "CRUXEval input prediction task with Chain of Thought (CoT) prompting. Part of the CRUXEval benchmark for code reasoning, understanding, and execution evaluation. Given a Python function and its expected output, the task is to predict the appropriate input using chain-of-thought reasoning. Consists of 800 Python functions (3-13 lines) designed to evaluate code comprehension and reasoning capabilities.",
"paper_link": "https://arxiv.org/abs/2401.03065",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.551746+00:00",
"updated_at": "2025-07-19T19:56:14.551746+00:00"
}
================================================
FILE: data/benchmarks/cruxeval-o.json
================================================
{
"benchmark_id": "cruxeval-o",
"name": "CruxEval-O",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "CruxEval-O is the output prediction task of the CRUXEval benchmark, designed to evaluate code reasoning, understanding, and execution capabilities. It consists of 800 Python functions (3-13 lines) where models must predict the output given a function and input. The benchmark tests fundamental code execution reasoning abilities and goes beyond simple code generation to assess deeper understanding of program behavior.",
"paper_link": "https://arxiv.org/abs/2401.03065",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.146592+00:00",
"updated_at": "2025-07-19T19:56:15.146592+00:00"
}
================================================
FILE: data/benchmarks/cruxeval-output-cot.json
================================================
{
"benchmark_id": "cruxeval-output-cot",
"name": "CRUXEval-Output-CoT",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "CRUXEval-O (output prediction) with Chain-of-Thought prompting. Part of the CRUXEval benchmark consisting of 800 Python functions (3-13 lines) designed to evaluate code reasoning, understanding, and execution capabilities. The output prediction task requires models to predict the output of a given Python function with specific inputs, evaluated using chain-of-thought reasoning methodology.",
"paper_link": "https://arxiv.org/abs/2401.03065",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.555432+00:00",
"updated_at": "2025-07-19T19:56:14.555432+00:00"
}
================================================
FILE: data/benchmarks/csimpleqa.json
================================================
{
"benchmark_id": "csimpleqa",
"name": "CSimpleQA",
"parent_benchmark_id": null,
"categories": ["general", "language"],
"modality": "text",
"multilingual": true,
"max_score": 1.0,
"language": "en",
"description": "Chinese SimpleQA is the first comprehensive Chinese benchmark to evaluate the factuality ability of language models to answer short questions. It contains 3,000 high-quality questions spanning 6 major topics with 99 diverse subtopics, designed to assess Chinese factual knowledge across humanities, science, engineering, culture, and society.",
"paper_link": "https://arxiv.org/abs/2411.07140",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:11.931358+00:00",
"updated_at": "2025-07-19T19:56:11.931358+00:00"
}
================================================
FILE: data/benchmarks/cybersecurity-ctfs.json
================================================
{
"benchmark_id": "cybersecurity-ctfs",
"name": "Cybersecurity CTFs",
"parent_benchmark_id": null,
"categories": ["safety"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Cybersecurity Capture the Flag (CTF) benchmark for evaluating LLMs in offensive security challenges. Contains diverse cybersecurity tasks including cryptography, web exploitation, binary analysis, and forensics to assess AI capabilities in cybersecurity problem-solving.",
"paper_link": "https://arxiv.org/abs/2406.05590",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.387055+00:00",
"updated_at": "2025-07-19T19:56:15.387055+00:00"
}
================================================
FILE: data/benchmarks/dermmcqa.json
================================================
{
"benchmark_id": "dermmcqa",
"name": "DermMCQA",
"parent_benchmark_id": null,
"categories": ["healthcare"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Dermatology multiple choice question assessment benchmark for evaluating medical knowledge and diagnostic reasoning in dermatological conditions and treatments.",
"paper_link": "https://arxiv.org/abs/2309.06961",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.024498+00:00",
"updated_at": "2025-07-19T19:56:14.024498+00:00"
}
================================================
FILE: data/benchmarks/docvqa.json
================================================
{
"benchmark_id": "docvqa",
"name": "DocVQA",
"parent_benchmark_id": null,
"categories": ["vision", "multimodal"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A dataset for Visual Question Answering on document images containing 50,000 questions defined on 12,000+ document images. The benchmark tests AI's ability to understand document structure and content, requiring models to comprehend document layout and perform information retrieval to answer questions about document images.",
"paper_link": "https://arxiv.org/abs/2007.00398",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.825214+00:00",
"updated_at": "2025-07-19T19:56:12.825214+00:00"
}
================================================
FILE: data/benchmarks/docvqatest.json
================================================
{
"benchmark_id": "docvqatest",
"name": "DocVQAtest",
"parent_benchmark_id": null,
"categories": ["vision", "multimodal"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "DocVQA is a Visual Question Answering benchmark on document images containing 50,000 questions defined on 12,000+ document images. The benchmark focuses on understanding document structure and content to answer questions about various document types including letters, memos, notes, and reports from the UCSF Industry Documents Library.",
"paper_link": "https://arxiv.org/abs/2007.00398",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.579372+00:00",
"updated_at": "2025-07-19T19:56:14.579372+00:00"
}
================================================
FILE: data/benchmarks/drop.json
================================================
{
"benchmark_id": "drop",
"name": "DROP",
"parent_benchmark_id": null,
"categories": ["reasoning", "math"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "DROP (Discrete Reasoning Over Paragraphs) is a reading comprehension benchmark requiring discrete reasoning over paragraph content. It contains crowdsourced, adversarially-created questions that require resolving references and performing discrete operations like addition, counting, or sorting, demanding comprehensive paragraph understanding beyond paraphrase-and-entity-typing shortcuts.",
"paper_link": "https://arxiv.org/abs/1903.00161",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.981569+00:00",
"updated_at": "2025-07-19T19:56:12.981569+00:00"
}
================================================
FILE: data/benchmarks/ds-arena-code.json
================================================
{
"benchmark_id": "ds-arena-code",
"name": "DS-Arena-Code",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Data Science Arena Code benchmark for evaluating LLMs on realistic data science code generation tasks. Tests capabilities in complex data processing, analysis, and programming across popular Python libraries used in data science workflows.",
"paper_link": "https://arxiv.org/abs/2505.15621",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.057744+00:00",
"updated_at": "2025-07-19T19:56:15.057744+00:00"
}
================================================
FILE: data/benchmarks/ds-fim-eval.json
================================================
{
"benchmark_id": "ds-fim-eval",
"name": "DS-FIM-Eval",
"parent_benchmark_id": null,
"categories": ["general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "DeepSeek's internal Fill-in-the-Middle evaluation dataset for measuring code completion performance improvements in data science contexts",
"paper_link": "https://arxiv.org/abs/2406.11931",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.053854+00:00",
"updated_at": "2025-07-19T19:56:15.053854+00:00"
}
================================================
FILE: data/benchmarks/eclektic.json
================================================
{
"benchmark_id": "eclektic",
"name": "ECLeKTic",
"parent_benchmark_id": null,
"categories": ["language", "reasoning"],
"modality": "text",
"multilingual": true,
"max_score": 1.0,
"language": "en",
"description": "A multilingual closed-book question answering dataset that evaluates cross-lingual knowledge transfer in large language models across 12 languages, using knowledge-seeking questions based on Wikipedia articles that exist only in one language",
"paper_link": "https://arxiv.org/abs/2502.21228",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.561292+00:00",
"updated_at": "2025-07-19T19:56:13.561292+00:00"
}
================================================
FILE: data/benchmarks/egoschema.json
================================================
{
"benchmark_id": "egoschema",
"name": "EgoSchema",
"parent_benchmark_id": null,
"categories": ["vision", "reasoning", "long_context"],
"modality": "video",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A diagnostic benchmark for very long-form video language understanding consisting of over 5000 human curated multiple choice questions based on 3-minute video clips from Ego4D, covering a broad range of natural human activities and behaviors",
"paper_link": "https://arxiv.org/abs/2308.09126",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.915240+00:00",
"updated_at": "2025-07-19T19:56:12.915240+00:00"
}
================================================
FILE: data/benchmarks/erqa.json
================================================
{
"benchmark_id": "erqa",
"name": "ERQA",
"parent_benchmark_id": null,
"categories": ["vision", "reasoning", "spatial_reasoning"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Embodied Reasoning Question Answering benchmark consisting of 400 multiple-choice visual questions across spatial reasoning, trajectory reasoning, action reasoning, state estimation, and multi-view reasoning for evaluating AI capabilities in physical world interactions",
"paper_link": "https://arxiv.org/abs/2503.20020",
"implementation_link": "https://github.com/embodiedreasoning/ERQA",
"verified": false,
"created_at": "2025-07-24T12:00:00.000000+00:00",
"updated_at": "2025-07-24T12:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/evalplus.json
================================================
{
"benchmark_id": "evalplus",
"name": "EvalPlus",
"parent_benchmark_id": null,
"categories": ["reasoning", "code"],
"modality": "text",
"multilingual": false,
"max_score": 100.0,
"language": "en",
"description": "A rigorous code synthesis evaluation framework that augments existing datasets with extensive test cases generated by LLM and mutation-based strategies to better assess functional correctness of generated code, including HumanEval+ with 80x more test cases",
"paper_link": "https://arxiv.org/abs/2305.01210",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:11.793176+00:00",
"updated_at": "2025-07-19T19:56:11.793176+00:00"
}
================================================
FILE: data/benchmarks/facts-grounding.json
================================================
{
"benchmark_id": "facts-grounding",
"name": "FACTS Grounding",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A benchmark evaluating language models' ability to generate factually accurate and well-grounded responses based on long-form input context, comprising 1,719 examples with documents up to 32k tokens requiring detailed responses that are fully grounded in provided documents",
"paper_link": "https://arxiv.org/abs/2501.03200",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.260285+00:00",
"updated_at": "2025-07-19T19:56:13.260285+00:00"
}
================================================
FILE: data/benchmarks/factscore.json
================================================
{
"benchmark_id": "factscore",
"name": "FActScore",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A fine-grained atomic evaluation metric for factual precision in long-form text generation that breaks generated text into atomic facts and computes the percentage supported by reliable knowledge sources, with automated assessment using retrieval and language models",
"paper_link": "https://arxiv.org/abs/2305.14251",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-24T12:00:00.000000+00:00",
"updated_at": "2025-07-24T12:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/finqa.json
================================================
{
"benchmark_id": "finqa",
"name": "FinQA",
"parent_benchmark_id": null,
"categories": ["finance", "math", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A large-scale dataset for numerical reasoning over financial data with question-answering pairs written by financial experts, featuring complex numerical reasoning and understanding of heterogeneous representations with annotated gold reasoning programs for full explainability",
"paper_link": "https://arxiv.org/abs/2109.00122",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.734486+00:00",
"updated_at": "2025-07-19T19:56:12.734486+00:00"
}
================================================
FILE: data/benchmarks/flenqa.json
================================================
{
"benchmark_id": "flenqa",
"name": "FlenQA",
"parent_benchmark_id": null,
"categories": ["reasoning", "long_context"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Flexible Length Question Answering dataset for evaluating the impact of input length on reasoning performance of language models, featuring True/False questions embedded in contexts of varying lengths (250-3000 tokens) across three reasoning tasks: Monotone Relations, People In Rooms, and simplified Ruletaker",
"paper_link": "https://arxiv.org/abs/2402.14848",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.277205+00:00",
"updated_at": "2025-07-19T19:56:14.277205+00:00"
}
================================================
FILE: data/benchmarks/fleurs.json
================================================
{
"benchmark_id": "fleurs",
"name": "FLEURS",
"parent_benchmark_id": null,
"categories": ["language", "speech-to-text"],
"modality": "audio",
"multilingual": true,
"max_score": 100.0,
"language": "en",
"description": "Few-shot Learning Evaluation of Universal Representations of Speech - a parallel speech dataset in 102 languages built on FLoRes-101 with approximately 12 hours of speech supervision per language for tasks including ASR, speech language identification, translation and retrieval",
"paper_link": "https://arxiv.org/abs/2205.12446",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.943695+00:00",
"updated_at": "2025-07-19T19:56:13.943695+00:00"
}
================================================
FILE: data/benchmarks/frames.json
================================================
{
"benchmark_id": "frames",
"name": "FRAMES",
"parent_benchmark_id": null,
"categories": ["reasoning", "search"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Factuality, Retrieval, And reasoning MEasurement Set - a unified evaluation dataset of 824 challenging multi-hop questions for testing retrieval-augmented generation systems across factuality, retrieval accuracy, and reasoning capabilities, requiring integration of 2-15 Wikipedia articles per question",
"paper_link": "https://arxiv.org/abs/2409.12941",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.954436+00:00",
"updated_at": "2025-07-19T19:56:14.954436+00:00"
}
================================================
FILE: data/benchmarks/french-mmlu.json
================================================
{
"benchmark_id": "french-mmlu",
"name": "French MMLU",
"parent_benchmark_id": null,
"categories": ["general", "language", "reasoning"],
"modality": "text",
"multilingual": true,
"max_score": 1.0,
"language": "en",
"description": "French version of MMLU-Pro, a multilingual benchmark for evaluating language models' cross-lingual reasoning capabilities across 14 diverse domains including mathematics, physics, chemistry, law, engineering, psychology, and health.",
"paper_link": "https://arxiv.org/abs/2503.10497",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.134340+00:00",
"updated_at": "2025-07-19T19:56:15.134340+00:00"
}
================================================
FILE: data/benchmarks/frontiermath.json
================================================
{
"benchmark_id": "frontiermath",
"name": "FrontierMath",
"parent_benchmark_id": null,
"categories": ["math", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A benchmark of hundreds of original, exceptionally challenging mathematics problems crafted and vetted by expert mathematicians, covering most major branches of modern mathematics from number theory and real analysis to algebraic geometry and category theory.",
"paper_link": "https://arxiv.org/abs/2411.04872",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.179213+00:00",
"updated_at": "2025-07-19T19:56:15.179213+00:00"
}
================================================
FILE: data/benchmarks/functionalmath.json
================================================
{
"benchmark_id": "functionalmath",
"name": "FunctionalMATH",
"parent_benchmark_id": null,
"categories": ["math", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A functional variant of the MATH benchmark that tests language models' ability to generalize reasoning patterns across different problem instances, revealing the reasoning gap between static and functional performance.",
"paper_link": "https://arxiv.org/abs/2402.19450",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.987516+00:00",
"updated_at": "2025-07-19T19:56:13.987516+00:00"
}
================================================
FILE: data/benchmarks/giantsteps-tempo.json
================================================
{
"benchmark_id": "giantsteps-tempo",
"name": "GiantSteps Tempo",
"parent_benchmark_id": null,
"categories": ["audio"],
"modality": "audio",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A dataset for tempo estimation in electronic dance music containing 664 2-minute audio previews from Beatport, annotated from user corrections for evaluating automatic tempo estimation algorithms.",
"paper_link": "https://archives.ismir.net/ismir2015/paper/000246.pdf",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.838584+00:00",
"updated_at": "2025-07-19T19:56:14.838584+00:00"
}
================================================
FILE: data/benchmarks/global-mmlu-lite.json
================================================
{
"benchmark_id": "global-mmlu-lite",
"name": "Global-MMLU-Lite",
"parent_benchmark_id": null,
"categories": ["general", "language", "reasoning"],
"modality": "text",
"multilingual": true,
"max_score": 1.0,
"language": "en",
"description": "A lightweight version of Global MMLU benchmark that evaluates language models across multiple languages while addressing cultural and linguistic biases in multilingual evaluation.",
"paper_link": "https://arxiv.org/abs/2412.03304",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.534515+00:00",
"updated_at": "2025-07-19T19:56:13.534515+00:00"
}
================================================
FILE: data/benchmarks/global-mmlu.json
================================================
{
"benchmark_id": "global-mmlu",
"name": "Global-MMLU",
"parent_benchmark_id": null,
"categories": ["general", "language", "reasoning"],
"modality": "text",
"multilingual": true,
"max_score": 1.0,
"language": "en",
"description": "A comprehensive multilingual benchmark covering 42 languages that addresses cultural and linguistic biases in evaluation, with improved translation quality and culturally sensitive question subsets.",
"paper_link": "https://arxiv.org/abs/2412.03304",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.747524+00:00",
"updated_at": "2025-07-19T19:56:13.747524+00:00"
}
================================================
FILE: data/benchmarks/gorilla-benchmark-api-bench.json
================================================
{
"benchmark_id": "gorilla-benchmark-api-bench",
"name": "Gorilla Benchmark API Bench",
"parent_benchmark_id": null,
"categories": ["reasoning", "code"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "APIBench, a comprehensive dataset of over 11,000 instruction-API pairs from HuggingFace, TorchHub, and TensorHub APIs for evaluating language models' ability to generate accurate API calls.",
"paper_link": "https://arxiv.org/abs/2305.15334",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.383584+00:00",
"updated_at": "2025-07-19T19:56:14.383584+00:00"
}
================================================
FILE: data/benchmarks/govreport.json
================================================
{
"benchmark_id": "govreport",
"name": "GovReport",
"parent_benchmark_id": null,
"categories": ["summarization", "long_context"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A long document summarization dataset consisting of reports from government research agencies including Congressional Research Service and U.S. Government Accountability Office, with significantly longer documents and summaries than other datasets.",
"paper_link": "https://arxiv.org/abs/2104.02112",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.218809+00:00",
"updated_at": "2025-07-19T19:56:14.218809+00:00"
}
================================================
FILE: data/benchmarks/gpqa-biology.json
================================================
{
"benchmark_id": "gpqa-biology",
"name": "GPQA Biology",
"parent_benchmark_id": null,
"categories": ["reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Biology subset of GPQA, containing challenging multiple-choice questions written by domain experts in biology. These Google-proof questions require graduate-level knowledge and reasoning.",
"paper_link": "https://arxiv.org/abs/2311.12022",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.391187+00:00",
"updated_at": "2025-07-19T19:56:15.391187+00:00"
}
================================================
FILE: data/benchmarks/gpqa-chemistry.json
================================================
{
"benchmark_id": "gpqa-chemistry",
"name": "GPQA Chemistry",
"parent_benchmark_id": null,
"categories": ["reasoning", "chemistry"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Chemistry subset of GPQA, containing challenging multiple-choice questions written by domain experts in chemistry. These Google-proof questions require graduate-level knowledge and reasoning.",
"paper_link": "https://arxiv.org/abs/2311.12022",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.395806+00:00",
"updated_at": "2025-07-19T19:56:15.395806+00:00"
}
================================================
FILE: data/benchmarks/gpqa-physics.json
================================================
{
"benchmark_id": "gpqa-physics",
"name": "GPQA Physics",
"parent_benchmark_id": null,
"categories": ["reasoning", "physics"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Physics subset of GPQA, containing challenging multiple-choice questions written by domain experts in physics. These Google-proof questions require graduate-level knowledge and reasoning.",
"paper_link": "https://arxiv.org/abs/2311.12022",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.400663+00:00",
"updated_at": "2025-07-19T19:56:15.400663+00:00"
}
================================================
FILE: data/benchmarks/gpqa.json
================================================
{
"benchmark_id": "gpqa",
"name": "GPQA",
"parent_benchmark_id": null,
"categories": ["reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry. Questions are Google-proof and extremely difficult, with PhD experts reaching 65% accuracy.",
"paper_link": "https://arxiv.org/abs/2311.12022",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:11.588605+00:00",
"updated_at": "2025-07-19T19:56:11.588605+00:00"
}
================================================
FILE: data/benchmarks/graphwalks-bfs-%3C128k.json
================================================
{
"benchmark_id": "graphwalks-bfs-<128k",
"name": "Graphwalks BFS <128k",
"parent_benchmark_id": null,
"categories": ["reasoning", "spatial_reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A graph reasoning benchmark that evaluates language models' ability to perform breadth-first search (BFS) operations on graphs with context length under 128k tokens, returning nodes reachable at specified depths.",
"paper_link": null,
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.287324+00:00",
"updated_at": "2025-07-19T19:56:15.287324+00:00"
}
================================================
FILE: data/benchmarks/graphwalks-bfs-%3E128k.json
================================================
{
"benchmark_id": "graphwalks-bfs->128k",
"name": "Graphwalks BFS >128k",
"parent_benchmark_id": null,
"categories": ["reasoning", "spatial_reasoning", "long_context"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A graph reasoning benchmark that evaluates language models' ability to perform breadth-first search (BFS) operations on graphs with context length over 128k tokens, testing long-context reasoning capabilities.",
"paper_link": null,
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.295876+00:00",
"updated_at": "2025-07-19T19:56:15.295876+00:00"
}
================================================
FILE: data/benchmarks/graphwalks-parents-%3C128k.json
================================================
{
"benchmark_id": "graphwalks-parents-<128k",
"name": "Graphwalks parents <128k",
"parent_benchmark_id": null,
"categories": ["reasoning", "spatial_reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A graph reasoning benchmark that evaluates language models' ability to find parent nodes in graphs with context length under 128k tokens, requiring understanding of graph structure and edge relationships.",
"paper_link": null,
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.303643+00:00",
"updated_at": "2025-07-19T19:56:15.303643+00:00"
}
================================================
FILE: data/benchmarks/graphwalks-parents-%3E128k.json
================================================
{
"benchmark_id": "graphwalks-parents->128k",
"name": "Graphwalks parents >128k",
"parent_benchmark_id": null,
"categories": ["reasoning", "spatial_reasoning", "long_context"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A graph reasoning benchmark that evaluates language models' ability to find parent nodes in graphs with context length over 128k tokens, testing long-context reasoning and graph structure understanding.",
"paper_link": null,
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.316836+00:00",
"updated_at": "2025-07-19T19:56:15.316836+00:00"
}
================================================
FILE: data/benchmarks/groundui-1k.json
================================================
{
"benchmark_id": "groundui-1k",
"name": "GroundUI-1K",
"parent_benchmark_id": null,
"categories": ["multimodal", "vision"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A subset of GroundUI-18K for UI grounding evaluation, where models must predict action coordinates on screenshots based on single-step instructions across web, desktop, and mobile platforms.",
"paper_link": "https://arxiv.org/abs/2403.17918",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.758595+00:00",
"updated_at": "2025-07-19T19:56:12.758595+00:00"
}
================================================
FILE: data/benchmarks/gsm-8k-(cot).json
================================================
{
"benchmark_id": "gsm-8k-(cot)",
"name": "GSM-8K (CoT)",
"parent_benchmark_id": null,
"categories": ["math", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Grade School Math 8K with Chain-of-Thought prompting, featuring 8.5K high-quality linguistically diverse grade school math word problems requiring multi-step reasoning and elementary arithmetic operations.",
"paper_link": "https://arxiv.org/abs/2110.14168",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.360381+00:00",
"updated_at": "2025-07-19T19:56:14.360381+00:00"
}
================================================
FILE: data/benchmarks/gsm8k-chat.json
================================================
{
"benchmark_id": "gsm8k-chat",
"name": "GSM8K Chat",
"parent_benchmark_id": null,
"categories": ["math", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Grade School Math 8K adapted for chat format evaluation, featuring 8.5K high-quality linguistically diverse grade school math word problems requiring multi-step reasoning and elementary arithmetic operations.",
"paper_link": "https://arxiv.org/abs/2110.14168",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.101578+00:00",
"updated_at": "2025-07-19T19:56:15.101578+00:00"
}
================================================
FILE: data/benchmarks/gsm8k.json
================================================
{
"benchmark_id": "gsm8k",
"name": "GSM8k",
"parent_benchmark_id": null,
"categories": ["math", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Grade School Math 8K, a dataset of 8.5K high-quality linguistically diverse grade school math word problems requiring multi-step reasoning and elementary arithmetic operations.",
"paper_link": "https://arxiv.org/abs/2110.14168",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:11.397385+00:00",
"updated_at": "2025-07-19T19:56:11.397385+00:00"
}
================================================
FILE: data/benchmarks/hallusion-bench.json
================================================
{
"benchmark_id": "hallusion-bench",
"name": "Hallusion Bench",
"parent_benchmark_id": null,
"categories": ["vision", "reasoning"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A comprehensive benchmark designed to evaluate image-context reasoning in large visual-language models (LVLMs) by challenging models with 346 images and 1,129 carefully crafted questions to assess language hallucination and visual illusion",
"paper_link": "https://arxiv.org/abs/2310.14566",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.689507+00:00",
"updated_at": "2025-07-19T19:56:14.689507+00:00"
}
================================================
FILE: data/benchmarks/healthbench-hard.json
================================================
{
"benchmark_id": "healthbench-hard",
"name": "HealthBench Hard",
"parent_benchmark_id": null,
"categories": ["healthcare"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A challenging variation of HealthBench that evaluates large language models' performance and safety in healthcare through 5,000 multi-turn conversations with particularly rigorous evaluation criteria validated by 262 physicians from 60 countries",
"paper_link": "https://arxiv.org/abs/2505.08775",
"implementation_link": null,
"verified": false,
"created_at": "2025-08-05T19:56:13.424873+00:00",
"updated_at": "2025-08-05T19:56:13.424873+00:00"
}
================================================
FILE: data/benchmarks/healthbench.json
================================================
{
"benchmark_id": "healthbench",
"name": "HealthBench",
"parent_benchmark_id": null,
"categories": ["healthcare"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "An open-source benchmark for measuring performance and safety of large language models in healthcare, consisting of 5,000 multi-turn conversations evaluated by 262 physicians using 48,562 unique rubric criteria across health contexts and behavioral dimensions",
"paper_link": "https://arxiv.org/abs/2505.08775",
"implementation_link": null,
"verified": false,
"created_at": "2025-08-05T19:56:13.424873+00:00",
"updated_at": "2025-08-05T19:56:13.424873+00:00"
}
================================================
FILE: data/benchmarks/hellaswag.json
================================================
{
"benchmark_id": "hellaswag",
"name": "HellaSwag",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A challenging commonsense natural language inference dataset that uses Adversarial Filtering to create questions trivial for humans (>95% accuracy) but difficult for state-of-the-art models, requiring completion of sentence endings based on physical situations and everyday activities",
"paper_link": "https://arxiv.org/abs/1905.07830",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:11.145630+00:00",
"updated_at": "2025-07-19T19:56:11.145630+00:00"
}
================================================
FILE: data/benchmarks/hiddenmath.json
================================================
{
"benchmark_id": "hiddenmath",
"name": "HiddenMath",
"parent_benchmark_id": null,
"categories": ["math", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Google DeepMind's internal mathematical reasoning benchmark that introduces novel problems not encountered during model training to evaluate true mathematical reasoning capabilities rather than memorization",
"paper_link": "https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.424873+00:00",
"updated_at": "2025-07-19T19:56:13.424873+00:00"
}
================================================
FILE: data/benchmarks/hle.json
================================================
{
"benchmark_id": "hle",
"name": "HLE",
"parent_benchmark_id": null,
"categories": ["reasoning", "math"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Humanity's Last Exam (HLE) is a multi-modal academic benchmark with 2,500 questions across mathematics, humanities, and natural sciences, designed to test LLM capabilities at the frontier of human knowledge with unambiguous, verifiable solutions",
"paper_link": "https://arxiv.org/abs/2501.14249",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-28T00:00:00.000000+00:00",
"updated_at": "2025-07-28T00:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/hmmt-2025.json
================================================
{
"benchmark_id": "hmmt-2025",
"name": "HMMT 2025",
"parent_benchmark_id": null,
"categories": ["math"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Harvard-MIT Mathematics Tournament 2025 - A prestigious student-organized mathematics competition for high school students featuring two tournaments (November 2025 at MIT and February 2026 at Harvard) with individual tests, team rounds, and guts rounds",
"paper_link": "http://web.mit.edu/HMMT/www/",
"implementation_link": null,
"verified": false,
"created_at": "2025-09-05T00:00:00.000000+00:00",
"updated_at": "2025-09-05T00:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/hmmt25.json
================================================
{
"benchmark_id": "hmmt25",
"name": "HMMT25",
"parent_benchmark_id": null,
"categories": ["math"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Harvard-MIT Mathematics Tournament 2025 - A prestigious student-organized mathematics competition for high school students featuring two tournaments (November 2025 at MIT and February 2026 at Harvard) with individual tests, team rounds, and guts rounds",
"paper_link": "http://web.mit.edu/HMMT/www/",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.061281+00:00",
"updated_at": "2025-07-19T19:56:15.061281+00:00"
}
================================================
FILE: data/benchmarks/humaneval+.json
================================================
{
"benchmark_id": "humaneval+",
"name": "HumanEval+",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Enhanced version of HumanEval that extends the original test cases by 80x using EvalPlus framework for rigorous evaluation of LLM-synthesized code functional correctness, detecting previously undetected wrong code",
"paper_link": "https://arxiv.org/abs/2305.01210",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.062352+00:00",
"updated_at": "2025-07-19T19:56:14.062352+00:00"
}
================================================
FILE: data/benchmarks/humaneval-average.json
================================================
{
"benchmark_id": "humaneval-average",
"name": "HumanEval-Average",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A variant of the HumanEval benchmark that measures functional correctness for synthesizing programs from docstrings, consisting of 164 original programming problems assessing language comprehension, algorithms, and simple mathematics",
"paper_link": "https://arxiv.org/abs/2107.03374",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.171175+00:00",
"updated_at": "2025-07-19T19:56:15.171175+00:00"
}
================================================
FILE: data/benchmarks/humaneval-er.json
================================================
{
"benchmark_id": "humaneval-er",
"name": "HumanEval-ER",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A variant of the HumanEval benchmark that measures functional correctness for synthesizing programs from docstrings, consisting of 164 original programming problems assessing language comprehension, algorithms, and simple mathematics",
"paper_link": "https://arxiv.org/abs/2107.03374",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.704744+00:00",
"updated_at": "2025-07-19T19:56:12.704744+00:00"
}
================================================
FILE: data/benchmarks/humaneval-mul.json
================================================
{
"benchmark_id": "humaneval-mul",
"name": "HumanEval-Mul",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": true,
"max_score": 1.0,
"language": "en",
"description": "A multilingual variant of the HumanEval benchmark that measures functional correctness for synthesizing programs from docstrings, consisting of 164 original programming problems assessing language comprehension, algorithms, and simple mathematics",
"paper_link": "https://arxiv.org/abs/2107.03374",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.032472+00:00",
"updated_at": "2025-07-19T19:56:15.032472+00:00"
}
================================================
FILE: data/benchmarks/humaneval-plus.json
================================================
{
"benchmark_id": "humaneval-plus",
"name": "HumanEval Plus",
"parent_benchmark_id": null,
"categories": ["reasoning", "code"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Enhanced version of HumanEval that extends the original test cases by 80x using EvalPlus framework for rigorous evaluation of LLM-synthesized code functional correctness, detecting previously undetected wrong code",
"paper_link": "https://arxiv.org/abs/2305.01210",
"implementation_link": null,
"verified": false,
"created_at": "2025-08-03T22:06:10.921756+00:00",
"updated_at": "2025-08-03T22:06:10.921756+00:00"
}
================================================
FILE: data/benchmarks/humaneval.json
================================================
{
"benchmark_id": "humaneval",
"name": "HumanEval",
"parent_benchmark_id": null,
"categories": ["reasoning", "code"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A benchmark that measures functional correctness for synthesizing programs from docstrings, consisting of 164 original programming problems assessing language comprehension, algorithms, and simple mathematics",
"paper_link": "https://arxiv.org/abs/2107.03374",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.595263+00:00",
"updated_at": "2025-07-19T19:56:12.595263+00:00"
}
================================================
FILE: data/benchmarks/humanevalfim-average.json
================================================
{
"benchmark_id": "humanevalfim-average",
"name": "HumanEvalFIM-Average",
"parent_benchmark_id": null,
"categories": ["general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Average evaluation of HumanEval Fill-in-the-Middle benchmark variants (single-line, multi-line, random-span) for assessing code infilling capabilities of language models",
"paper_link": "https://arxiv.org/abs/2207.14255",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.160562+00:00",
"updated_at": "2025-07-19T19:56:15.160562+00:00"
}
================================================
FILE: data/benchmarks/humanity's-last-exam.json
================================================
{
"benchmark_id": "humanity's-last-exam",
"name": "Humanity's Last Exam",
"parent_benchmark_id": null,
"categories": ["general"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A multi-modal benchmark at the frontier of human knowledge with 2,500 questions across dozens of subjects including mathematics, humanities, and natural sciences, created by nearly 1000 subject expert contributors from over 500 institutions",
"paper_link": "https://arxiv.org/abs/2501.14249",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.507693+00:00",
"updated_at": "2025-07-19T19:56:12.507693+00:00"
}
================================================
FILE: data/benchmarks/if.json
================================================
{
"benchmark_id": "if",
"name": "IF",
"parent_benchmark_id": null,
"categories": ["general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Instruction-Following Evaluation (IFEval) benchmark for large language models, focusing on verifiable instructions with 25 types of instructions and around 500 prompts containing one or more verifiable constraints",
"paper_link": "https://arxiv.org/abs/2311.07911",
"implementation_link": null,
"verified": false,
"created_at": "2025-08-03T22:06:11.089394+00:00",
"updated_at": "2025-08-03T22:06:11.089394+00:00"
}
================================================
FILE: data/benchmarks/ifeval.json
================================================
{
"benchmark_id": "ifeval",
"name": "IFEval",
"parent_benchmark_id": null,
"categories": ["general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Instruction-Following Evaluation (IFEval) benchmark for large language models, focusing on verifiable instructions with 25 types of instructions and around 500 prompts containing one or more verifiable constraints",
"paper_link": "https://arxiv.org/abs/2311.07911",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.241350+00:00",
"updated_at": "2025-07-19T19:56:12.241350+00:00"
}
================================================
FILE: data/benchmarks/include.json
================================================
{
"benchmark_id": "include",
"name": "Include",
"parent_benchmark_id": null,
"categories": ["general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Include benchmark - specific documentation not found in official sources",
"paper_link": null,
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.724387+00:00",
"updated_at": "2025-07-19T19:56:13.724387+00:00"
}
================================================
FILE: data/benchmarks/infinitebench-en.mc.json
================================================
{
"benchmark_id": "infinitebench-en.mc",
"name": "InfiniteBench/En.MC",
"parent_benchmark_id": null,
"categories": ["long_context"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "InfiniteBench English Multiple Choice variant - first LLM benchmark featuring average data length surpassing 100K tokens for evaluating long-context capabilities with 12 tasks spanning diverse domains",
"paper_link": "https://arxiv.org/abs/2402.13718",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.461508+00:00",
"updated_at": "2025-07-19T19:56:14.461508+00:00"
}
================================================
FILE: data/benchmarks/infinitebench-en.qa.json
================================================
{
"benchmark_id": "infinitebench-en.qa",
"name": "InfiniteBench/En.QA",
"parent_benchmark_id": null,
"categories": ["long_context"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "InfiniteBench English Question Answering variant - first LLM benchmark featuring average data length surpassing 100K tokens for evaluating long-context capabilities with 12 tasks spanning diverse domains",
"paper_link": "https://arxiv.org/abs/2402.13718",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.457927+00:00",
"updated_at": "2025-07-19T19:56:14.457927+00:00"
}
================================================
FILE: data/benchmarks/infographicsqa.json
================================================
{
"benchmark_id": "infographicsqa",
"name": "InfographicsQA",
"parent_benchmark_id": null,
"categories": ["vision", "multimodal"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "InfographicVQA dataset with 5,485 infographic images and over 30,000 questions requiring joint reasoning over document layout, textual content, graphical elements, and data visualizations with elementary reasoning and arithmetic skills",
"paper_link": "https://arxiv.org/abs/2104.12756",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.417669+00:00",
"updated_at": "2025-07-19T19:56:14.417669+00:00"
}
================================================
FILE: data/benchmarks/infovqa.json
================================================
{
"benchmark_id": "infovqa",
"name": "InfoVQA",
"parent_benchmark_id": null,
"categories": ["vision", "multimodal"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "InfoVQA dataset with 30,000 questions and 5,000 infographic images requiring joint reasoning over document layout, textual content, graphical elements, and data visualizations with elementary reasoning and arithmetic skills",
"paper_link": "https://arxiv.org/abs/2104.12756",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.601294+00:00",
"updated_at": "2025-07-19T19:56:13.601294+00:00"
}
================================================
FILE: data/benchmarks/infovqatest.json
================================================
{
"benchmark_id": "infovqatest",
"name": "InfoVQAtest",
"parent_benchmark_id": null,
"categories": ["vision", "multimodal"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "InfoVQA test set with infographic images requiring joint reasoning over document layout, textual content, graphical elements, and data visualizations with elementary reasoning and arithmetic skills",
"paper_link": "https://arxiv.org/abs/2104.12756",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.583939+00:00",
"updated_at": "2025-07-19T19:56:14.583939+00:00"
}
================================================
FILE: data/benchmarks/instruct-humaneval.json
================================================
{
"benchmark_id": "instruct-humaneval",
"name": "Instruct HumanEval",
"parent_benchmark_id": null,
"categories": ["general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Instruction-based variant of HumanEval benchmark for evaluating large language models' code generation capabilities with functional correctness using pass@k metric on programming problems",
"paper_link": "https://arxiv.org/abs/2107.03374",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.105488+00:00",
"updated_at": "2025-07-19T19:56:15.105488+00:00"
}
================================================
FILE: data/benchmarks/intergps.json
================================================
{
"benchmark_id": "intergps",
"name": "InterGPS",
"parent_benchmark_id": null,
"categories": ["math", "spatial_reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Interpretable Geometry Problem Solver (Inter-GPS) with Geometry3K dataset of 3,002 geometry problems with dense annotation in formal language using theorem knowledge and symbolic reasoning",
"paper_link": "https://arxiv.org/abs/2105.04165",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.259321+00:00",
"updated_at": "2025-07-19T19:56:14.259321+00:00"
}
================================================
FILE: data/benchmarks/internal-api-instruction-following-(hard).json
================================================
{
"benchmark_id": "internal-api-instruction-following-(hard)",
"name": "Internal API instruction following (hard)",
"parent_benchmark_id": null,
"categories": ["general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "Internal API instruction following (hard) benchmark - specific documentation not found in official sources",
"paper_link": null,
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.222560+00:00",
"updated_at": "2025-07-19T19:56:15.222560+00:00"
}
================================================
FILE: data/benchmarks/lbpp-(v2).json
================================================
{
"benchmark_id": "lbpp-(v2)",
"name": "LBPP (v2)",
"parent_benchmark_id": null,
"categories": ["reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "LBPP (v2) benchmark - specific documentation not found in official sources, possibly related to language-based planning problems",
"paper_link": "https://arxiv.org/abs/2206.10498",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.053535+00:00",
"updated_at": "2025-07-19T19:56:14.053535+00:00"
}
================================================
FILE: data/benchmarks/livebench-20241125.json
================================================
{
"benchmark_id": "livebench-20241125",
"name": "LiveBench 20241125",
"parent_benchmark_id": null,
"categories": ["math", "reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "LiveBench is a challenging, contamination-limited LLM benchmark that addresses test set contamination by releasing new questions monthly based on recently-released datasets, arXiv papers, news articles, and IMDb movie synopses. It comprises tasks across math, coding, reasoning, language, instruction following, and data analysis with verifiable, objective ground-truth answers.",
"paper_link": "https://arxiv.org/abs/2406.19314",
"implementation_link": null,
"verified": false,
"created_at": "2025-08-03T22:06:11.046321+00:00",
"updated_at": "2025-08-03T22:06:11.046321+00:00"
}
================================================
FILE: data/benchmarks/livebench.json
================================================
{
"benchmark_id": "livebench",
"name": "LiveBench",
"parent_benchmark_id": null,
"categories": ["math", "reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "LiveBench is a challenging, contamination-limited LLM benchmark that addresses test set contamination by releasing new questions monthly based on recently-released datasets, arXiv papers, news articles, and IMDb movie synopses. It comprises tasks across math, coding, reasoning, language, instruction following, and data analysis with verifiable, objective ground-truth answers.",
"paper_link": "https://arxiv.org/abs/2406.19314",
"implementation_link": null,
"verified": false,
"created_at": "2025-09-05T00:00:00.000000+00:00",
"updated_at": "2025-09-05T00:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/livecodebench(01-09).json
================================================
{
"benchmark_id": "livecodebench(01-09)",
"name": "LiveCodeBench(01-09)",
"parent_benchmark_id": null,
"categories": ["reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.",
"paper_link": "https://arxiv.org/abs/2403.07974",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.049594+00:00",
"updated_at": "2025-07-19T19:56:15.049594+00:00"
}
================================================
FILE: data/benchmarks/livecodebench-v5-24.12-25.2.json
================================================
{
"benchmark_id": "livecodebench-v5-24.12-25.2",
"name": "LiveCodeBench v5 24.12-25.2",
"parent_benchmark_id": null,
"categories": ["reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.",
"paper_link": "https://arxiv.org/abs/2403.07974",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.066180+00:00",
"updated_at": "2025-07-19T19:56:12.066180+00:00"
}
================================================
FILE: data/benchmarks/livecodebench-v5.json
================================================
{
"benchmark_id": "livecodebench-v5",
"name": "LiveCodeBench v5",
"parent_benchmark_id": null,
"categories": ["reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.",
"paper_link": "https://arxiv.org/abs/2403.07974",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.759330+00:00",
"updated_at": "2025-07-19T19:56:13.759330+00:00"
}
================================================
FILE: data/benchmarks/livecodebench-v6.json
================================================
{
"benchmark_id": "livecodebench-v6",
"name": "LiveCodeBench v6",
"parent_benchmark_id": null,
"categories": ["reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.",
"paper_link": "https://arxiv.org/abs/2403.07974",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:11.785682+00:00",
"updated_at": "2025-07-19T19:56:11.785682+00:00"
}
================================================
FILE: data/benchmarks/livecodebench.json
================================================
{
"benchmark_id": "livecodebench",
"name": "LiveCodeBench",
"parent_benchmark_id": null,
"categories": ["reasoning", "general", "code"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.",
"paper_link": "https://arxiv.org/abs/2403.07974",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.292229+00:00",
"updated_at": "2025-07-19T19:56:13.292229+00:00"
}
================================================
FILE: data/benchmarks/longbench-v2.json
================================================
{
"benchmark_id": "longbench-v2",
"name": "LongBench v2",
"parent_benchmark_id": null,
"categories": ["long_context", "reasoning", "general"],
"modality": "text",
"multilingual": true,
"max_score": 1.0,
"language": "en",
"description": "LongBench v2 is a benchmark designed to assess the ability of LLMs to handle long-context problems requiring deep understanding and reasoning across real-world multitasks. It consists of 503 challenging multiple-choice questions with contexts ranging from 8k to 2M words across six major task categories: single-document QA, multi-document QA, long in-context learning, long-dialogue history understanding, code repository understanding, and long structured data understanding.",
"paper_link": "https://arxiv.org/abs/2412.15204",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.029281+00:00",
"updated_at": "2025-07-19T19:56:15.029281+00:00"
}
================================================
FILE: data/benchmarks/longfact-concepts.json
================================================
{
"benchmark_id": "longfact-concepts",
"name": "LongFact Concepts",
"parent_benchmark_id": null,
"categories": ["general", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "LongFact is a benchmark for evaluating long-form factuality in large language models. It comprises 2,280 fact-seeking prompts spanning 38 topics, designed to test a model's ability to generate accurate, long-form responses. The benchmark uses SAFE (Search-Augmented Factuality Evaluator) to evaluate factual accuracy.",
"paper_link": "https://arxiv.org/abs/2403.18802",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-24T12:00:00.000000+00:00",
"updated_at": "2025-07-24T12:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/longfact-objects.json
================================================
{
"benchmark_id": "longfact-objects",
"name": "LongFact Objects",
"parent_benchmark_id": null,
"categories": ["general", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "LongFact is a benchmark for evaluating long-form factuality in large language models. It comprises 2,280 fact-seeking prompts spanning 38 topics, designed to test a model's ability to generate accurate, long-form responses. The benchmark uses SAFE (Search-Augmented Factuality Evaluator) to evaluate factual accuracy.",
"paper_link": "https://arxiv.org/abs/2403.18802",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-24T12:00:00.000000+00:00",
"updated_at": "2025-07-24T12:00:00.000000+00:00"
}
================================================
FILE: data/benchmarks/longvideobench.json
================================================
{
"benchmark_id": "longvideobench",
"name": "LongVideoBench",
"parent_benchmark_id": null,
"categories": ["vision", "long_context", "multimodal"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "LongVideoBench is a question-answering benchmark featuring video-language interleaved inputs up to an hour long. It includes 3,763 varying-length web-collected videos with subtitles across diverse themes and 6,678 human-annotated multiple-choice questions in 17 fine-grained categories for comprehensive evaluation of long-term multimodal understanding.",
"paper_link": "https://arxiv.org/abs/2407.15754",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.730349+00:00",
"updated_at": "2025-07-19T19:56:14.730349+00:00"
}
================================================
FILE: data/benchmarks/lsat.json
================================================
{
"benchmark_id": "lsat",
"name": "LSAT",
"parent_benchmark_id": null,
"categories": ["reasoning", "legal", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "LSAT (Law School Admission Test) benchmark evaluating complex reasoning capabilities across three challenging tasks: analytical reasoning, logical reasoning, and reading comprehension. The LSAT measures skills considered essential for success in law school including critical thinking, reading comprehension of complex texts, and analysis of arguments.",
"paper_link": "https://arxiv.org/abs/2108.00648",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.409871+00:00",
"updated_at": "2025-07-19T19:56:15.409871+00:00"
}
================================================
FILE: data/benchmarks/lvbench.json
================================================
{
"benchmark_id": "lvbench",
"name": "LVBench",
"parent_benchmark_id": null,
"categories": ["vision", "multimodal", "long_context"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "LVBench is an extreme long video understanding benchmark designed to evaluate multimodal models on videos up to two hours in duration. It contains 6 major categories and 21 subcategories, with videos averaging five times longer than existing datasets. The benchmark addresses applications requiring comprehension of extremely long videos.",
"paper_link": "https://arxiv.org/abs/2406.08035",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.724041+00:00",
"updated_at": "2025-07-19T19:56:12.724041+00:00"
}
================================================
FILE: data/benchmarks/math-(cot).json
================================================
{
"benchmark_id": "math-(cot)",
"name": "MATH (CoT)",
"parent_benchmark_id": null,
"categories": ["math", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "MATH dataset contains 12,500 challenging competition mathematics problems from AMC 10, AMC 12, AIME, and other mathematics competitions. Each problem includes full step-by-step solutions and spans multiple difficulty levels (1-5) across seven mathematical subjects. This variant uses Chain-of-Thought prompting to encourage step-by-step reasoning.",
"paper_link": "https://arxiv.org/abs/2103.03874",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.366159+00:00",
"updated_at": "2025-07-19T19:56:14.366159+00:00"
}
================================================
FILE: data/benchmarks/math-500.json
================================================
{
"benchmark_id": "math-500",
"name": "MATH-500",
"parent_benchmark_id": null,
"categories": ["math", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "MATH-500 is a subset of the MATH dataset containing 500 challenging competition mathematics problems from AMC 10, AMC 12, AIME, and other mathematics competitions. Each problem includes full step-by-step solutions and spans multiple difficulty levels across seven mathematical subjects including Prealgebra, Algebra, Number Theory, Counting and Probability, Geometry, Intermediate Algebra, and Precalculus.",
"paper_link": "https://arxiv.org/abs/2103.03874",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.027850+00:00",
"updated_at": "2025-07-19T19:56:12.027850+00:00"
}
================================================
FILE: data/benchmarks/math.json
================================================
{
"benchmark_id": "math",
"name": "MATH",
"parent_benchmark_id": null,
"categories": ["math", "reasoning"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "MATH dataset contains 12,500 challenging competition mathematics problems from AMC 10, AMC 12, AIME, and other mathematics competitions. Each problem includes full step-by-step solutions and spans multiple difficulty levels (1-5) across seven mathematical subjects including Prealgebra, Algebra, Number Theory, Counting and Probability, Geometry, Intermediate Algebra, and Precalculus.",
"paper_link": "https://arxiv.org/abs/2103.03874",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:11.804258+00:00",
"updated_at": "2025-07-19T19:56:11.804258+00:00"
}
================================================
FILE: data/benchmarks/mathvision.json
================================================
{
"benchmark_id": "mathvision",
"name": "MathVision",
"parent_benchmark_id": null,
"categories": ["math", "vision", "multimodal"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "MATH-Vision is a dataset designed to measure multimodal mathematical reasoning capabilities. It focuses on evaluating how well models can solve mathematical problems that require both visual understanding and mathematical reasoning, bridging the gap between visual and mathematical domains.",
"paper_link": "https://arxiv.org/abs/2402.14804",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.695583+00:00",
"updated_at": "2025-07-19T19:56:14.695583+00:00"
}
================================================
FILE: data/benchmarks/mathvista-mini.json
================================================
{
"benchmark_id": "mathvista-mini",
"name": "MathVista-Mini",
"parent_benchmark_id": null,
"categories": ["math", "vision", "multimodal"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "MathVista-Mini is a smaller version of the MathVista benchmark that evaluates mathematical reasoning in visual contexts. It consists of examples derived from multimodal datasets involving mathematics, combining challenges from diverse mathematical and visual tasks to assess foundation models' ability to solve problems requiring both visual understanding and mathematical reasoning.",
"paper_link": "https://arxiv.org/abs/2310.02255",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.654470+00:00",
"updated_at": "2025-07-19T19:56:13.654470+00:00"
}
================================================
FILE: data/benchmarks/mathvista.json
================================================
{
"benchmark_id": "mathvista",
"name": "MathVista",
"parent_benchmark_id": null,
"categories": ["math", "vision", "multimodal"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "MathVista evaluates mathematical reasoning of foundation models in visual contexts. It consists of 6,141 examples derived from 28 existing multimodal datasets and 3 newly created datasets (IQTest, FunctionQA, and PaperQA), combining challenges from diverse mathematical and visual tasks to assess models' ability to understand complex figures and perform rigorous reasoning.",
"paper_link": "https://arxiv.org/abs/2310.02255",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:12.069611+00:00",
"updated_at": "2025-07-19T19:56:12.069611+00:00"
}
================================================
FILE: data/benchmarks/mbpp+.json
================================================
{
"benchmark_id": "mbpp+",
"name": "MBPP+",
"parent_benchmark_id": null,
"categories": ["reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "MBPP+ is an enhanced version of MBPP (Mostly Basic Python Problems) with significantly more test cases (35x) for more rigorous evaluation. MBPP is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers, covering programming fundamentals and standard library functionality.",
"paper_link": "https://arxiv.org/abs/2108.07732",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.501855+00:00",
"updated_at": "2025-07-19T19:56:14.501855+00:00"
}
================================================
FILE: data/benchmarks/mbpp-++-base-version.json
================================================
{
"benchmark_id": "mbpp-++-base-version",
"name": "MBPP ++ base version",
"parent_benchmark_id": null,
"categories": ["reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. Each problem consists of a task description, code solution, and 3 automated test cases covering programming fundamentals and standard library functionality. This is an enhanced version with additional test cases.",
"paper_link": "https://arxiv.org/abs/2108.07732",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.341560+00:00",
"updated_at": "2025-07-19T19:56:14.341560+00:00"
}
================================================
FILE: data/benchmarks/mbpp-evalplus-(base).json
================================================
{
"benchmark_id": "mbpp-evalplus-(base)",
"name": "MBPP EvalPlus (base)",
"parent_benchmark_id": null,
"categories": ["reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. EvalPlus extends MBPP with significantly more test cases (35x) for more rigorous evaluation of LLM-synthesized code, providing high-quality and precise evaluation.",
"paper_link": "https://arxiv.org/abs/2108.07732",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.421722+00:00",
"updated_at": "2025-07-19T19:56:14.421722+00:00"
}
================================================
FILE: data/benchmarks/mbpp-evalplus.json
================================================
{
"benchmark_id": "mbpp-evalplus",
"name": "MBPP EvalPlus",
"parent_benchmark_id": null,
"categories": ["reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. EvalPlus extends MBPP with significantly more test cases (35x) for more rigorous evaluation of LLM-synthesized code, providing high-quality and precise evaluation.",
"paper_link": "https://arxiv.org/abs/2108.07732",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:14.425667+00:00",
"updated_at": "2025-07-19T19:56:14.425667+00:00"
}
================================================
FILE: data/benchmarks/mbpp-pass@1.json
================================================
{
"benchmark_id": "mbpp-pass@1",
"name": "MBPP pass@1",
"parent_benchmark_id": null,
"categories": ["reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. Each problem consists of a task description, code solution, and 3 automated test cases. This variant uses pass@1 evaluation metric measuring the percentage of problems solved correctly on the first attempt.",
"paper_link": "https://arxiv.org/abs/2108.07732",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:15.138778+00:00",
"updated_at": "2025-07-19T19:56:15.138778+00:00"
}
================================================
FILE: data/benchmarks/mbpp-plus.json
================================================
{
"benchmark_id": "mbpp-plus",
"name": "MBPP Plus",
"parent_benchmark_id": null,
"categories": ["reasoning", "code"],
"modality": "text",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. Each problem consists of a task description, code solution, and 3 automated test cases covering programming fundamentals and standard library functionality. This is an enhanced version with additional test cases for more rigorous evaluation.",
"paper_link": "https://arxiv.org/abs/2108.07732",
"implementation_link": null,
"verified": false,
"created_at": "2025-08-03T22:06:11.143382+00:00",
"updated_at": "2025-08-03T22:06:11.143382+00:00"
}
================================================
FILE: data/benchmarks/mbpp.json
================================================
{
"benchmark_id": "mbpp",
"name": "MBPP",
"parent_benchmark_id": null,
"categories": ["reasoning", "general"],
"modality": "text",
"multilingual": false,
"max_score": 100.0,
"language": "en",
"description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. Each problem consists of a task description, code solution, and 3 automated test cases covering programming fundamentals and standard library functionality.",
"paper_link": "https://arxiv.org/abs/2108.07732",
"implementation_link": null,
"verified": false,
"created_at": "2025-07-19T19:56:13.453370+00:00",
"updated_at": "2025-07-19T19:56:13.453370+00:00"
}
================================================
FILE: data/benchmarks/medxpertqa.json
================================================
{
"benchmark_id": "medxpertqa",
"name": "MedXpertQA",
"parent_benchmark_id": null,
"categories": ["healthcare", "reasoning", "multimodal"],
"modality": "multimodal",
"multilingual": false,
"max_score": 1.0,
"language": "en",
"description": "A comprehensive benchmark to evaluate expert-level medical knowledg
gitextract_261_qksq/
├── .github/
│ ├── pull_request_template.md
│ └── workflows/
│ └── schema-validation.yml
├── .gitignore
├── .vscode/
│ └── settings.json
├── CONTRIBUTING.md
├── LICENSE.md
├── README.md
├── data/
│ ├── .github/
│ │ └── CODEOWNERS
│ ├── benchmarks/
│ │ ├── aa-index.json
│ │ ├── acebench.json
│ │ ├── activitynet.json
│ │ ├── agieval.json
│ │ ├── ai2-reasoning-challenge-(arc).json
│ │ ├── ai2d.json
│ │ ├── aider-polyglot-edit.json
│ │ ├── aider-polyglot.json
│ │ ├── aider.json
│ │ ├── aime-2024.json
│ │ ├── aime-2025.json
│ │ ├── aime.json
│ │ ├── aitz-em.json
│ │ ├── alignbench.json
│ │ ├── alpacaeval-2.0.json
│ │ ├── amc-2022-23.json
│ │ ├── android-control-high-em.json
│ │ ├── android-control-low-em.json
│ │ ├── androidworld-sr.json
│ │ ├── api-bank.json
│ │ ├── arc-agi-v2.json
│ │ ├── arc-agi.json
│ │ ├── arc-c.json
│ │ ├── arc-e.json
│ │ ├── arc.json
│ │ ├── arena-hard-v2.json
│ │ ├── arena-hard.json
│ │ ├── attaq.json
│ │ ├── autologi.json
│ │ ├── bbh.json
│ │ ├── bfcl-v2.json
│ │ ├── bfcl-v3-multiturn.json
│ │ ├── bfcl-v3.json
│ │ ├── bfcl.json
│ │ ├── big-bench-extra-hard.json
│ │ ├── big-bench-hard.json
│ │ ├── big-bench.json
│ │ ├── bigcodebench-full.json
│ │ ├── bigcodebench-hard.json
│ │ ├── bigcodebench.json
│ │ ├── bird-sql-(dev).json
│ │ ├── blink.json
│ │ ├── boolq.json
│ │ ├── browsecomp-long-128k.json
│ │ ├── browsecomp-long-256k.json
│ │ ├── browsecomp-zh.json
│ │ ├── browsecomp.json
│ │ ├── c-eval.json
│ │ ├── cbnsl.json
│ │ ├── cc-ocr.json
│ │ ├── cfeval.json
│ │ ├── charadessta.json
│ │ ├── chartqa.json
│ │ ├── charxiv-d.json
│ │ ├── charxiv-r.json
│ │ ├── chexpert-cxr.json
│ │ ├── cluewsc.json
│ │ ├── cmmlu.json
│ │ ├── cnmo-2024.json
│ │ ├── codeforces.json
│ │ ├── codegolf-v2.2.json
│ │ ├── collie.json
│ │ ├── common-voice-15.json
│ │ ├── commonsenseqa.json
│ │ ├── complexfuncbench.json
│ │ ├── covost2-en-zh.json
│ │ ├── covost2.json
│ │ ├── crag.json
│ │ ├── creative-writing-v3.json
│ │ ├── crperelation.json
│ │ ├── crux-o.json
│ │ ├── cruxeval-input-cot.json
│ │ ├── cruxeval-o.json
│ │ ├── cruxeval-output-cot.json
│ │ ├── csimpleqa.json
│ │ ├── cybersecurity-ctfs.json
│ │ ├── dermmcqa.json
│ │ ├── docvqa.json
│ │ ├── docvqatest.json
│ │ ├── drop.json
│ │ ├── ds-arena-code.json
│ │ ├── ds-fim-eval.json
│ │ ├── eclektic.json
│ │ ├── egoschema.json
│ │ ├── erqa.json
│ │ ├── evalplus.json
│ │ ├── facts-grounding.json
│ │ ├── factscore.json
│ │ ├── finqa.json
│ │ ├── flenqa.json
│ │ ├── fleurs.json
│ │ ├── frames.json
│ │ ├── french-mmlu.json
│ │ ├── frontiermath.json
│ │ ├── functionalmath.json
│ │ ├── giantsteps-tempo.json
│ │ ├── global-mmlu-lite.json
│ │ ├── global-mmlu.json
│ │ ├── gorilla-benchmark-api-bench.json
│ │ ├── govreport.json
│ │ ├── gpqa-biology.json
│ │ ├── gpqa-chemistry.json
│ │ ├── gpqa-physics.json
│ │ ├── gpqa.json
│ │ ├── graphwalks-bfs-%3C128k.json
│ │ ├── graphwalks-bfs-%3E128k.json
│ │ ├── graphwalks-parents-%3C128k.json
│ │ ├── graphwalks-parents-%3E128k.json
│ │ ├── groundui-1k.json
│ │ ├── gsm-8k-(cot).json
│ │ ├── gsm8k-chat.json
│ │ ├── gsm8k.json
│ │ ├── hallusion-bench.json
│ │ ├── healthbench-hard.json
│ │ ├── healthbench.json
│ │ ├── hellaswag.json
│ │ ├── hiddenmath.json
│ │ ├── hle.json
│ │ ├── hmmt-2025.json
│ │ ├── hmmt25.json
│ │ ├── humaneval+.json
│ │ ├── humaneval-average.json
│ │ ├── humaneval-er.json
│ │ ├── humaneval-mul.json
│ │ ├── humaneval-plus.json
│ │ ├── humaneval.json
│ │ ├── humanevalfim-average.json
│ │ ├── humanity's-last-exam.json
│ │ ├── if.json
│ │ ├── ifeval.json
│ │ ├── include.json
│ │ ├── infinitebench-en.mc.json
│ │ ├── infinitebench-en.qa.json
│ │ ├── infographicsqa.json
│ │ ├── infovqa.json
│ │ ├── infovqatest.json
│ │ ├── instruct-humaneval.json
│ │ ├── intergps.json
│ │ ├── internal-api-instruction-following-(hard).json
│ │ ├── lbpp-(v2).json
│ │ ├── livebench-20241125.json
│ │ ├── livebench.json
│ │ ├── livecodebench(01-09).json
│ │ ├── livecodebench-v5-24.12-25.2.json
│ │ ├── livecodebench-v5.json
│ │ ├── livecodebench-v6.json
│ │ ├── livecodebench.json
│ │ ├── longbench-v2.json
│ │ ├── longfact-concepts.json
│ │ ├── longfact-objects.json
│ │ ├── longvideobench.json
│ │ ├── lsat.json
│ │ ├── lvbench.json
│ │ ├── math-(cot).json
│ │ ├── math-500.json
│ │ ├── math.json
│ │ ├── mathvision.json
│ │ ├── mathvista-mini.json
│ │ ├── mathvista.json
│ │ ├── mbpp+.json
│ │ ├── mbpp-++-base-version.json
│ │ ├── mbpp-evalplus-(base).json
│ │ ├── mbpp-evalplus.json
│ │ ├── mbpp-pass@1.json
│ │ ├── mbpp-plus.json
│ │ ├── mbpp.json
│ │ ├── medxpertqa.json
│ │ ├── mega-mlqa.json
│ │ ├── mega-tydi-qa.json
│ │ ├── mega-udpos.json
│ │ ├── mega-xcopa.json
│ │ ├── mega-xstorycloze.json
│ │ ├── meld.json
│ │ ├── mgsm.json
│ │ ├── mimic-cxr.json
│ │ ├── mlvu-m.json
│ │ ├── mlvu.json
│ │ ├── mm-if-eval.json
│ │ ├── mm-mind2web.json
│ │ ├── mm-mt-bench.json
│ │ ├── mmau-music.json
│ │ ├── mmau-sound.json
│ │ ├── mmau-speech.json
│ │ ├── mmau.json
│ │ ├── mmbench-test.json
│ │ ├── mmbench-v1.1.json
│ │ ├── mmbench-video.json
│ │ ├── mmbench.json
│ │ ├── mme-realworld.json
│ │ ├── mme.json
│ │ ├── mmlu-(cot).json
│ │ ├── mmlu-base.json
│ │ ├── mmlu-chat.json
│ │ ├── mmlu-french.json
│ │ ├── mmlu-pro.json
│ │ ├── mmlu-prox.json
│ │ ├── mmlu-redux-2.0.json
│ │ ├── mmlu-redux.json
│ │ ├── mmlu-stem.json
│ │ ├── mmlu.json
│ │ ├── mmmlu.json
│ │ ├── mmmu-(val).json
│ │ ├── mmmu-(validation).json
│ │ ├── mmmu-pro.json
│ │ ├── mmmu.json
│ │ ├── mmmuval.json
│ │ ├── mmstar.json
│ │ ├── mmt-bench.json
│ │ ├── mmvet.json
│ │ ├── mmvetgpt4turbo.json
│ │ ├── mobileminiwob++-sr.json
│ │ ├── mrcr-1m-(pointwise).json
│ │ ├── mrcr-1m.json
│ │ ├── mrcr-v2-(8-needle).json
│ │ ├── mrcr-v2.json
│ │ ├── mrcr.json
│ │ ├── mt-bench.json
│ │ ├── mtvqa.json
│ │ ├── muirbench.json
│ │ ├── multi-if.json
│ │ ├── multi-swe-bench.json
│ │ ├── multichallenge-(o3-mini-grader).json
│ │ ├── multichallenge.json
│ │ ├── multilf.json
│ │ ├── multilingual-mgsm-(cot).json
│ │ ├── multilingual-mmlu.json
│ │ ├── multipl-e-humaneval.json
│ │ ├── multipl-e-mbpp.json
│ │ ├── multipl-e.json
│ │ ├── musiccaps.json
│ │ ├── musr.json
│ │ ├── mvbench.json
│ │ ├── natural-questions.json
│ │ ├── natural2code.json
│ │ ├── nexus.json
│ │ ├── nih-multi-needle.json
│ │ ├── nmos.json
│ │ ├── nq.json
│ │ ├── ocrbench-v2-(en).json
│ │ ├── ocrbench-v2-(zh).json
│ │ ├── ocrbench-v2.json
│ │ ├── ocrbench.json
│ │ ├── odinw.json
│ │ ├── ojbench.json
│ │ ├── olympiadbench.json
│ │ ├── omnibench-music.json
│ │ ├── omnibench.json
│ │ ├── omnimath.json
│ │ ├── open-rewrite.json
│ │ ├── openai-mmlu.json
│ │ ├── openai-mrcr%3A-2-needle-128k.json
│ │ ├── openai-mrcr%3A-2-needle-1m.json
│ │ ├── openai-mrcr%3A-2-needle-256k.json
│ │ ├── openbookqa.json
│ │ ├── osworld-extended.json
│ │ ├── osworld-screenshot-only.json
│ │ ├── osworld.json
│ │ ├── pathmcqa.json
│ │ ├── perceptiontest.json
│ │ ├── phibench.json
│ │ ├── physicsfinals.json
│ │ ├── piqa.json
│ │ ├── pointgrounding.json
│ │ ├── polymath-en.json
│ │ ├── polymath.json
│ │ ├── pope.json
│ │ ├── popqa.json
│ │ ├── qasper.json
│ │ ├── qmsum.json
│ │ ├── realworldqa.json
│ │ ├── repobench.json
│ │ ├── repoqa.json
│ │ ├── ruler.json
│ │ ├── sat-math.json
│ │ ├── scale-multichallenge.json
│ │ ├── scicode.json
│ │ ├── scienceqa-visual.json
│ │ ├── scienceqa.json
│ │ ├── screenspot-pro.json
│ │ ├── screenspot.json
│ │ ├── simpleqa.json
│ │ ├── slakevqa.json
│ │ ├── social-iqa.json
│ │ ├── spider.json
│ │ ├── squality.json
│ │ ├── stem.json
│ │ ├── summscreenfd.json
│ │ ├── superglue.json
│ │ ├── supergpqa.json
│ │ ├── swe-bench-multilingual.json
│ │ ├── swe-bench-verified-(agentic-coding).json
│ │ ├── swe-bench-verified-(agentless).json
│ │ ├── swe-bench-verified-(multiple-attempts).json
│ │ ├── swe-bench-verified.json
│ │ ├── swe-dev.json
│ │ ├── swe-lancer-(ic-diamond-subset).json
│ │ ├── swe-lancer.json
│ │ ├── tau-bench-airline.json
│ │ ├── tau-bench-retail.json
│ │ ├── tau-bench.json
│ │ ├── tau2-airline.json
│ │ ├── tau2-retail.json
│ │ ├── tau2-telecom.json
│ │ ├── tempcompass.json
│ │ ├── terminal-bench.json
│ │ ├── terminus.json
│ │ ├── textvqa.json
│ │ ├── theoremqa.json
│ │ ├── tldr9+-(test).json
│ │ ├── translation-en-to-set1-comet22.json
│ │ ├── translation-en-to-set1-spbleu.json
│ │ ├── translation-set1-to-en-comet22.json
│ │ ├── translation-set1-to-en-spbleu.json
│ │ ├── triviaqa.json
│ │ ├── truthfulqa.json
│ │ ├── tydiqa.json
│ │ ├── uniform-bar-exam.json
│ │ ├── usamo25.json
│ │ ├── vatex.json
│ │ ├── vcr-en-easy.json
│ │ ├── vibe-eval.json
│ │ ├── video-mme-(long,-no-subtitles).json
│ │ ├── video-mme.json
│ │ ├── video-mmew-sub.json
│ │ ├── videomme-w-o-sub..json
│ │ ├── videomme-w-sub..json
│ │ ├── videommmu.json
│ │ ├── visualwebbench.json
│ │ ├── vocalsound.json
│ │ ├── voicebench-avg.json
│ │ ├── vqa-rad.json
│ │ ├── vqav2-(test).json
│ │ ├── vqav2-(val).json
│ │ ├── vqav2.json
│ │ ├── wild-bench.json
│ │ ├── winogrande.json
│ │ ├── wmt23.json
│ │ ├── wmt24++.json
│ │ ├── writingbench.json
│ │ ├── xlsum-english.json
│ │ ├── xstest.json
│ │ └── zebralogic.json
│ ├── licenses/
│ │ ├── apache_2_0.json
│ │ ├── cc_by_nc.json
│ │ ├── creative_commons_attribution_4_0_license.json
│ │ ├── deepseek.json
│ │ ├── gemma.json
│ │ ├── health_ai_developer_foundations_terms_of_use.json
│ │ ├── jamba_open_model_license.json
│ │ ├── llama3_2.json
│ │ ├── llama_3_1_community_license.json
│ │ ├── llama_3_2_community_license.json
│ │ ├── llama_3_3_community_license_agreement.json
│ │ ├── llama_4_community_license_agreement.json
│ │ ├── mistral_research_license.json
│ │ ├── mistral_research_license_(mrl)_for_research;_mistral_commercial_license_for_commercial_use.json
│ │ ├── mit.json
│ │ ├── mit_+_model_license_(commercial_use_allowed).json
│ │ ├── mit_license.json
│ │ ├── mnpl_0_1.json
│ │ ├── modified_mit_license.json
│ │ ├── nvidia_open_model_license_agreement.json
│ │ ├── proprietary.json
│ │ ├── qwen.json
│ │ ├── tongyi_qianwen.json
│ │ └── unknown.json
│ ├── organizations/
│ │ ├── ai21/
│ │ │ ├── models/
│ │ │ │ ├── jamba-1.5-large/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── jamba-1.5-mini/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── amazon/
│ │ │ ├── models/
│ │ │ │ ├── nova-lite/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── nova-micro/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── nova-pro/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── anthropic/
│ │ │ ├── models/
│ │ │ │ ├── claude-3-5-haiku-20241022/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-3-5-sonnet-20240620/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-3-5-sonnet-20241022/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-3-7-sonnet-20250219/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-3-haiku-20240307/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-3-opus-20240229/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-3-sonnet-20240229/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-haiku-4-5-20251015/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-opus-4-1-20250805/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-opus-4-20250514/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── claude-sonnet-4-20250514/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── claude-sonnet-4-5-20250929/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── cohere/
│ │ │ ├── models/
│ │ │ │ └── command-r-plus-04-2024/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── deepseek/
│ │ │ ├── models/
│ │ │ │ ├── deepseek-r1/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-r1-0528/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-r1-distill-llama-70b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-r1-distill-llama-8b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-r1-distill-qwen-1.5b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-r1-distill-qwen-14b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-r1-distill-qwen-32b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-r1-distill-qwen-7b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-r1-zero/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-v2.5/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-v3/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-v3-0324/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-v3.1/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-v3.2-exp/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-vl2/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── deepseek-vl2-small/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── deepseek-vl2-tiny/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── google/
│ │ │ ├── models/
│ │ │ │ ├── gemini-1.0-pro/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-1.5-flash/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-1.5-flash-8b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-1.5-pro/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-2.0-flash/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-2.0-flash-lite/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-2.0-flash-thinking/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-2.5-flash/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-2.5-flash-lite/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-2.5-pro/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-2.5-pro-preview-06-05/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemini-diffusion/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-2-27b-it/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-2-9b-it/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3-12b-it/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3-1b-it/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3-27b-it/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3-4b-it/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3n-e2b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3n-e2b-it/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3n-e2b-it-litert-preview/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3n-e4b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3n-e4b-it/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gemma-3n-e4b-it-litert-preview/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── medgemma-4b-it/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── ibm/
│ │ │ ├── models/
│ │ │ │ ├── granite-3.3-8b-base/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── granite-3.3-8b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── granite-4.0-tiny-preview/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── meta/
│ │ │ ├── models/
│ │ │ │ ├── llama-3.1-405b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.1-70b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.1-8b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.2-11b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.2-3b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.2-90b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.3-70b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-4-maverick/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── llama-4-scout/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── microsoft/
│ │ │ ├── models/
│ │ │ │ ├── phi-3.5-mini-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── phi-3.5-moe-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── phi-3.5-vision-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── phi-4/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── phi-4-mini/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── phi-4-mini-reasoning/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── phi-4-multimodal-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── phi-4-reasoning/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── phi-4-reasoning-plus/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── mistral/
│ │ │ ├── models/
│ │ │ │ ├── codestral-22b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── devstral-medium-2507/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── devstral-small-2507/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── magistral-medium/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── magistral-small-2506/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── ministral-8b-instruct-2410/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── mistral-large-2-2407/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── mistral-nemo-instruct-2407/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── mistral-small-2409/
│ │ │ │ │ └── model.json
│ │ │ │ ├── mistral-small-24b-base-2501/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── mistral-small-24b-instruct-2501/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── mistral-small-3.1-24b-base-2503/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── mistral-small-3.1-24b-instruct-2503/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── mistral-small-3.2-24b-instruct-2506/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── pixtral-12b-2409/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── pixtral-large/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── moonshotai/
│ │ │ ├── models/
│ │ │ │ ├── kimi-k1.5/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── kimi-k2-0905/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── kimi-k2-base/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── kimi-k2-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── kimi-k2-instruct-0905/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── nvidia/
│ │ │ ├── models/
│ │ │ │ ├── llama-3.1-nemotron-70b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.1-nemotron-nano-8b-v1/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.1-nemotron-ultra-253b-v1/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── llama-3.3-nemotron-super-49b-v1/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── nemotron-nano-9b-v2/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── openai/
│ │ │ ├── models/
│ │ │ │ ├── gpt-3.5-turbo-0125/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4-0613/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4-turbo-2024-04-09/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4.1-2025-04-14/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4.1-mini-2025-04-14/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4.1-nano-2025-04-14/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4.5/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4o-2024-05-13/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4o-2024-08-06/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-4o-mini-2024-07-18/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-5-2025-08-07/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-5-codex-2025-09-15/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-5-mini-2025-08-07/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-5-nano-2025-08-07/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-oss-120b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── gpt-oss-20b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── o1-2024-12-17/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── o1-mini/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── o1-preview/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── o1-pro/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── o3-2025-04-16/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── o3-mini/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── o3-pro-2025-06-10/
│ │ │ │ │ └── model.json
│ │ │ │ └── o4-mini/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── qwen/
│ │ │ ├── models/
│ │ │ │ ├── qvq-72b-preview/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen-2.5-14b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen-2.5-32b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen-2.5-72b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen-2.5-7b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen-2.5-coder-32b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen-2.5-coder-7b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen2-72b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen2-7b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen2-vl-72b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen2.5-omni-7b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen2.5-vl-32b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen2.5-vl-72b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen2.5-vl-7b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen3-235b-a22b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen3-235b-a22b-instruct-2507/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen3-235b-a22b-thinking-2507/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen3-30b-a3b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen3-32b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen3-next-80b-a3b-base/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen3-next-80b-a3b-instruct/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwen3-next-80b-a3b-thinking/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── qwq-32b/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── qwq-32b-preview/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ ├── unknown/
│ │ │ └── organization.json
│ │ ├── xai/
│ │ │ ├── models/
│ │ │ │ ├── grok-1.5/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── grok-1.5v/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── grok-2/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── grok-2-mini/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── grok-3/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── grok-3-mini/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── grok-4/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── grok-4-fast/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ ├── grok-4-heavy/
│ │ │ │ │ ├── benchmarks.json
│ │ │ │ │ └── model.json
│ │ │ │ └── grok-code-fast-1/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── organization.json
│ │ └── zai-org/
│ │ ├── models/
│ │ │ ├── glm-4.5/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ ├── glm-4.5-air/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ ├── glm-4.5v/
│ │ │ │ ├── benchmarks.json
│ │ │ │ └── model.json
│ │ │ └── glm-4.6/
│ │ │ ├── benchmarks.json
│ │ │ └── model.json
│ │ └── organization.json
│ └── providers/
│ ├── anthropic/
│ │ ├── models.json
│ │ └── provider.json
│ ├── azure/
│ │ ├── models.json
│ │ └── provider.json
│ ├── bedrock/
│ │ ├── models.json
│ │ └── provider.json
│ ├── cerebras/
│ │ ├── models.json
│ │ └── provider.json
│ ├── cohere/
│ │ ├── models.json
│ │ └── provider.json
│ ├── deepinfra/
│ │ ├── models.json
│ │ └── provider.json
│ ├── deepseek/
│ │ ├── models.json
│ │ └── provider.json
│ ├── fireworks/
│ │ ├── models.json
│ │ └── provider.json
│ ├── google/
│ │ ├── models.json
│ │ └── provider.json
│ ├── groq/
│ │ ├── models.json
│ │ └── provider.json
│ ├── hyperbolic/
│ │ ├── models.json
│ │ └── provider.json
│ ├── lambda/
│ │ ├── models.json
│ │ └── provider.json
│ ├── mistral/
│ │ ├── models.json
│ │ └── provider.json
│ ├── novita/
│ │ ├── models.json
│ │ └── provider.json
│ ├── openai/
│ │ ├── models.json
│ │ └── provider.json
│ ├── replicate/
│ │ ├── models.json
│ │ └── provider.json
│ ├── sambanova/
│ │ ├── models.json
│ │ └── provider.json
│ ├── together/
│ │ ├── models.json
│ │ └── provider.json
│ ├── xai/
│ │ ├── models.json
│ │ └── provider.json
│ └── zeroeval/
│ ├── models.json
│ └── provider.json
├── package.json
└── schemas/
├── README.md
├── benchmark-results.schema.json
├── benchmark.schema.json
├── integrity-validator.js
├── license.schema.json
├── model.schema.json
├── organization.schema.json
├── provider-models.schema.json
├── provider.schema.json
└── validator.js
SYMBOL INDEX (9 symbols across 2 files)
FILE: schemas/integrity-validator.js
class IntegrityValidator (line 5) | class IntegrityValidator {
method constructor (line 6) | constructor(dataDir) {
method loadJSON (line 23) | loadJSON(filePath) {
method loadAllData (line 34) | async loadAllData() {
method checkDuplicates (line 124) | checkDuplicates() {
method checkReferences (line 153) | checkReferences() {
method checkOrphans (line 303) | checkOrphans() {
method validate (line 377) | async validate() {
FILE: schemas/validator.js
function validateSchema (line 6) | function validateSchema(schemaName, filePattern, isArray = false) {
Condensed preview — 778 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (2,539K chars).
[
{
"path": ".github/pull_request_template.md",
"chars": 512,
"preview": "## Description\n\n<!-- Briefly describe your changes and add links to the relevant resources -->\n\nReferences:\n\n<!-- Add li"
},
{
"path": ".github/workflows/schema-validation.yml",
"chars": 475,
"preview": "name: Schema Validation\n\non:\n pull_request:\n branches: [main]\n\njobs:\n validate:\n name: Validate Schema\n runs-"
},
{
"path": ".gitignore",
"chars": 14,
"preview": "/node_modules\n"
},
{
"path": ".vscode/settings.json",
"chars": 492,
"preview": "{\n \"json.schemas\": [\n {\n \"fileMatch\": [\"/models/*/model.json\"],\n \"url\": \"../schemas/models-schema.json\"\n "
},
{
"path": "CONTRIBUTING.md",
"chars": 11019,
"preview": "# Contributing to LLM Stats\n\nThank you for your interest in contributing. This guide outlines the process for updating a"
},
{
"path": "LICENSE.md",
"chars": 1329,
"preview": "Creative Commons Attribution 4.0 International License\n\nCopyright (c) 2024 jc\n\nThis work is licensed under the Creative "
},
{
"path": "README.md",
"chars": 30567,
"preview": "# DEPRECATED - Updates and contributions\n\nThis repository is now depracated and won't be getting any new updates. For co"
},
{
"path": "data/.github/CODEOWNERS",
"chars": 44,
"preview": "* @JonathanChavezTamales\n* @sebastiancrossa\n"
},
{
"path": "data/benchmarks/aa-index.json",
"chars": 677,
"preview": "{\n \"benchmark_id\": \"aa-index\",\n \"name\": \"AA-Index\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\"],\n \"mod"
},
{
"path": "data/benchmarks/acebench.json",
"chars": 997,
"preview": "{\n \"benchmark_id\": \"acebench\",\n \"name\": \"ACEBench\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\", \"reason"
},
{
"path": "data/benchmarks/activitynet.json",
"chars": 1052,
"preview": "{\n \"benchmark_id\": \"activitynet\",\n \"name\": \"ActivityNet\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vision\", \"v"
},
{
"path": "data/benchmarks/agieval.json",
"chars": 901,
"preview": "{\n \"benchmark_id\": \"agieval\",\n \"name\": \"AGIEval\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"genera"
},
{
"path": "data/benchmarks/ai2-reasoning-challenge-(arc).json",
"chars": 1125,
"preview": "{\n \"benchmark_id\": \"ai2-reasoning-challenge-(arc)\",\n \"name\": \"AI2 Reasoning Challenge (ARC)\",\n \"parent_benchmark_id\":"
},
{
"path": "data/benchmarks/ai2d.json",
"chars": 917,
"preview": "{\n \"benchmark_id\": \"ai2d\",\n \"name\": \"AI2D\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vision\", \"reasoning\", \"mu"
},
{
"path": "data/benchmarks/aider-polyglot-edit.json",
"chars": 957,
"preview": "{\n \"benchmark_id\": \"aider-polyglot-edit\",\n \"name\": \"Aider-Polyglot Edit\",\n \"parent_benchmark_id\": null,\n \"categories"
},
{
"path": "data/benchmarks/aider-polyglot.json",
"chars": 956,
"preview": "{\n \"benchmark_id\": \"aider-polyglot\",\n \"name\": \"Aider-Polyglot\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"gener"
},
{
"path": "data/benchmarks/aider.json",
"chars": 1086,
"preview": "{\n \"benchmark_id\": \"aider\",\n \"name\": \"Aider\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"code\"],\n "
},
{
"path": "data/benchmarks/aime-2024.json",
"chars": 878,
"preview": "{\n \"benchmark_id\": \"aime-2024\",\n \"name\": \"AIME 2024\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"reasoni"
},
{
"path": "data/benchmarks/aime-2025.json",
"chars": 802,
"preview": "{\n \"benchmark_id\": \"aime-2025\",\n \"name\": \"AIME 2025\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"reasoni"
},
{
"path": "data/benchmarks/aime.json",
"chars": 768,
"preview": "{\n \"benchmark_id\": \"aime\",\n \"name\": \"AIME\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"reasoning\"],\n \"m"
},
{
"path": "data/benchmarks/aitz-em.json",
"chars": 809,
"preview": "{\n \"benchmark_id\": \"aitz-em\",\n \"name\": \"AITZ_EM\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"multimodal\", \"reaso"
},
{
"path": "data/benchmarks/alignbench.json",
"chars": 1003,
"preview": "{\n \"benchmark_id\": \"alignbench\",\n \"name\": \"AlignBench\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\", \"la"
},
{
"path": "data/benchmarks/alpacaeval-2.0.json",
"chars": 1068,
"preview": "{\n \"benchmark_id\": \"alpacaeval-2.0\",\n \"name\": \"AlpacaEval 2.0\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"gener"
},
{
"path": "data/benchmarks/amc-2022-23.json",
"chars": 916,
"preview": "{\n \"benchmark_id\": \"amc-2022-23\",\n \"name\": \"AMC_2022_23\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"rea"
},
{
"path": "data/benchmarks/android-control-high-em.json",
"chars": 583,
"preview": "{\n \"benchmark_id\": \"android-control-high-em\",\n \"name\": \"Android Control High_EM\",\n \"parent_benchmark_id\": null,\n \"ca"
},
{
"path": "data/benchmarks/android-control-low-em.json",
"chars": 577,
"preview": "{\n \"benchmark_id\": \"android-control-low-em\",\n \"name\": \"Android Control Low_EM\",\n \"parent_benchmark_id\": null,\n \"cate"
},
{
"path": "data/benchmarks/androidworld-sr.json",
"chars": 1026,
"preview": "{\n \"benchmark_id\": \"androidworld-sr\",\n \"name\": \"AndroidWorld_SR\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"gen"
},
{
"path": "data/benchmarks/api-bank.json",
"chars": 729,
"preview": "{\n \"benchmark_id\": \"api-bank\",\n \"name\": \"API-Bank\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\"],\n \"m"
},
{
"path": "data/benchmarks/arc-agi-v2.json",
"chars": 1018,
"preview": "{\n \"benchmark_id\": \"arc-agi-v2\",\n \"name\": \"ARC-AGI v2\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \""
},
{
"path": "data/benchmarks/arc-agi.json",
"chars": 1093,
"preview": "{\n \"benchmark_id\": \"arc-agi\",\n \"name\": \"ARC-AGI\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"vision"
},
{
"path": "data/benchmarks/arc-c.json",
"chars": 864,
"preview": "{\n \"benchmark_id\": \"arc-c\",\n \"name\": \"ARC-C\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"general\"],"
},
{
"path": "data/benchmarks/arc-e.json",
"chars": 883,
"preview": "{\n \"benchmark_id\": \"arc-e\",\n \"name\": \"ARC-E\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"general\"],"
},
{
"path": "data/benchmarks/arc.json",
"chars": 1054,
"preview": "{\n \"benchmark_id\": \"arc\",\n \"name\": \"Arc\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"general\"],\n \""
},
{
"path": "data/benchmarks/arena-hard-v2.json",
"chars": 1139,
"preview": "{\n \"benchmark_id\": \"arena-hard-v2\",\n \"name\": \"Arena-Hard v2\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general"
},
{
"path": "data/benchmarks/arena-hard.json",
"chars": 1075,
"preview": "{\n \"benchmark_id\": \"arena-hard\",\n \"name\": \"Arena Hard\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\", \"re"
},
{
"path": "data/benchmarks/attaq.json",
"chars": 840,
"preview": "{\n \"benchmark_id\": \"attaq\",\n \"name\": \"AttaQ\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"safety\"],\n \"modality\":"
},
{
"path": "data/benchmarks/autologi.json",
"chars": 885,
"preview": "{\n \"benchmark_id\": \"autologi\",\n \"name\": \"AutoLogi\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\"],\n \"m"
},
{
"path": "data/benchmarks/bbh.json",
"chars": 1016,
"preview": "{\n \"benchmark_id\": \"bbh\",\n \"name\": \"BBH\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"math\", \"langua"
},
{
"path": "data/benchmarks/bfcl-v2.json",
"chars": 1020,
"preview": "{\n \"benchmark_id\": \"bfcl-v2\",\n \"name\": \"BFCL v2\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\", \"reasonin"
},
{
"path": "data/benchmarks/bfcl-v3-multiturn.json",
"chars": 1076,
"preview": "{\n \"benchmark_id\": \"bfcl-v3-multiturn\",\n \"name\": \"BFCL_v3_MultiTurn\",\n \"parent_benchmark_id\": null,\n \"categories\": ["
},
{
"path": "data/benchmarks/bfcl-v3.json",
"chars": 1054,
"preview": "{\n \"benchmark_id\": \"bfcl-v3\",\n \"name\": \"BFCL-v3\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\", \"reasonin"
},
{
"path": "data/benchmarks/bfcl.json",
"chars": 1032,
"preview": "{\n \"benchmark_id\": \"bfcl\",\n \"name\": \"BFCL\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\", \"reasoning\"],\n "
},
{
"path": "data/benchmarks/big-bench-extra-hard.json",
"chars": 1142,
"preview": "{\n \"benchmark_id\": \"big-bench-extra-hard\",\n \"name\": \"BIG-Bench Extra Hard\",\n \"parent_benchmark_id\": null,\n \"categori"
},
{
"path": "data/benchmarks/big-bench-hard.json",
"chars": 1009,
"preview": "{\n \"benchmark_id\": \"big-bench-hard\",\n \"name\": \"BIG-Bench Hard\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reaso"
},
{
"path": "data/benchmarks/big-bench.json",
"chars": 950,
"preview": "{\n \"benchmark_id\": \"big-bench\",\n \"name\": \"BIG-Bench\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"ma"
},
{
"path": "data/benchmarks/bigcodebench-full.json",
"chars": 854,
"preview": "{\n \"benchmark_id\": \"bigcodebench-full\",\n \"name\": \"BigCodeBench-Full\",\n \"parent_benchmark_id\": null,\n \"categories\": ["
},
{
"path": "data/benchmarks/bigcodebench-hard.json",
"chars": 963,
"preview": "{\n \"benchmark_id\": \"bigcodebench-hard\",\n \"name\": \"BigCodeBench-Hard\",\n \"parent_benchmark_id\": null,\n \"categories\": ["
},
{
"path": "data/benchmarks/bigcodebench.json",
"chars": 833,
"preview": "{\n \"benchmark_id\": \"bigcodebench\",\n \"name\": \"BigCodeBench\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\","
},
{
"path": "data/benchmarks/bird-sql-(dev).json",
"chars": 824,
"preview": "{\n \"benchmark_id\": \"bird-sql-(dev)\",\n \"name\": \"Bird-SQL (dev)\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reaso"
},
{
"path": "data/benchmarks/blink.json",
"chars": 950,
"preview": "{\n \"benchmark_id\": \"blink\",\n \"name\": \"BLINK\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vision\", \"multimodal\", "
},
{
"path": "data/benchmarks/boolq.json",
"chars": 799,
"preview": "{\n \"benchmark_id\": \"boolq\",\n \"name\": \"BoolQ\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"language\", \"reasoning\"]"
},
{
"path": "data/benchmarks/browsecomp-long-128k.json",
"chars": 793,
"preview": "{\n \"benchmark_id\": \"browsecomp-long-128k\",\n \"name\": \"BrowseComp Long Context 128k\",\n \"parent_benchmark_id\": \"browseco"
},
{
"path": "data/benchmarks/browsecomp-long-256k.json",
"chars": 989,
"preview": "{\n \"benchmark_id\": \"browsecomp-long-256k\",\n \"name\": \"BrowseComp Long Context 256k\",\n \"parent_benchmark_id\": \"browseco"
},
{
"path": "data/benchmarks/browsecomp-zh.json",
"chars": 976,
"preview": "{\n \"benchmark_id\": \"browsecomp-zh\",\n \"name\": \"BrowseComp-zh\",\n \"parent_benchmark_id\": \"browsecomp\",\n \"categories\": ["
},
{
"path": "data/benchmarks/browsecomp.json",
"chars": 940,
"preview": "{\n \"benchmark_id\": \"browsecomp\",\n \"name\": \"BrowseComp\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \""
},
{
"path": "data/benchmarks/c-eval.json",
"chars": 914,
"preview": "{\n \"benchmark_id\": \"c-eval\",\n \"name\": \"C-Eval\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\", \"reasoning\""
},
{
"path": "data/benchmarks/cbnsl.json",
"chars": 756,
"preview": "{\n \"benchmark_id\": \"cbnsl\",\n \"name\": \"CBNSL\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"reasoning\"],\n "
},
{
"path": "data/benchmarks/cc-ocr.json",
"chars": 990,
"preview": "{\n \"benchmark_id\": \"cc-ocr\",\n \"name\": \"CC-OCR\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vision\", \"multimodal\""
},
{
"path": "data/benchmarks/cfeval.json",
"chars": 478,
"preview": "{\n \"benchmark_id\": \"cfeval\",\n \"name\": \"CFEval\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"code\"],\n \"modality\":"
},
{
"path": "data/benchmarks/charadessta.json",
"chars": 827,
"preview": "{\n \"benchmark_id\": \"charadessta\",\n \"name\": \"CharadesSTA\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"video\", \"la"
},
{
"path": "data/benchmarks/chartqa.json",
"chars": 685,
"preview": "{\n \"benchmark_id\": \"chartqa\",\n \"name\": \"ChartQA\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"vision"
},
{
"path": "data/benchmarks/charxiv-d.json",
"chars": 854,
"preview": "{\n \"benchmark_id\": \"charxiv-d\",\n \"name\": \"CharXiv-D\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"vi"
},
{
"path": "data/benchmarks/charxiv-r.json",
"chars": 798,
"preview": "{\n \"benchmark_id\": \"charxiv-r\",\n \"name\": \"CharXiv-R\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"vi"
},
{
"path": "data/benchmarks/chexpert-cxr.json",
"chars": 779,
"preview": "{\n \"benchmark_id\": \"chexpert-cxr\",\n \"name\": \"CheXpert CXR\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"healthcar"
},
{
"path": "data/benchmarks/cluewsc.json",
"chars": 799,
"preview": "{\n \"benchmark_id\": \"cluewsc\",\n \"name\": \"CLUEWSC\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"language\", \"reasoni"
},
{
"path": "data/benchmarks/cmmlu.json",
"chars": 826,
"preview": "{\n \"benchmark_id\": \"cmmlu\",\n \"name\": \"CMMLU\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"language\", \"reasoning\","
},
{
"path": "data/benchmarks/cnmo-2024.json",
"chars": 473,
"preview": "{\n \"benchmark_id\": \"cnmo-2024\",\n \"name\": \"CNMO 2024\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\"],\n \"moda"
},
{
"path": "data/benchmarks/codeforces.json",
"chars": 866,
"preview": "{\n \"benchmark_id\": \"codeforces\",\n \"name\": \"CodeForces\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"reaso"
},
{
"path": "data/benchmarks/codegolf-v2.2.json",
"chars": 430,
"preview": "{\n \"benchmark_id\": \"codegolf-v2.2\",\n \"name\": \"Codegolf v2.2\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"code\"],"
},
{
"path": "data/benchmarks/collie.json",
"chars": 825,
"preview": "{\n \"benchmark_id\": \"collie\",\n \"name\": \"COLLIE\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"language\", \"reasoning"
},
{
"path": "data/benchmarks/common-voice-15.json",
"chars": 750,
"preview": "{\n \"benchmark_id\": \"common-voice-15\",\n \"name\": \"Common Voice 15\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"aud"
},
{
"path": "data/benchmarks/commonsenseqa.json",
"chars": 849,
"preview": "{\n \"benchmark_id\": \"commonsenseqa\",\n \"name\": \"CommonSenseQA\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoni"
},
{
"path": "data/benchmarks/complexfuncbench.json",
"chars": 842,
"preview": "{\n \"benchmark_id\": \"complexfuncbench\",\n \"name\": \"ComplexFuncBench\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"l"
},
{
"path": "data/benchmarks/covost2-en-zh.json",
"chars": 715,
"preview": "{\n \"benchmark_id\": \"covost2-en-zh\",\n \"name\": \"CoVoST2 en-zh\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"audio\","
},
{
"path": "data/benchmarks/covost2.json",
"chars": 731,
"preview": "{\n \"benchmark_id\": \"covost2\",\n \"name\": \"CoVoST2\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"audio\", \"speech-to-"
},
{
"path": "data/benchmarks/crag.json",
"chars": 936,
"preview": "{\n \"benchmark_id\": \"crag\",\n \"name\": \"CRAG\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"search\"],\n "
},
{
"path": "data/benchmarks/creative-writing-v3.json",
"chars": 878,
"preview": "{\n \"benchmark_id\": \"creative-writing-v3\",\n \"name\": \"Creative Writing v3\",\n \"parent_benchmark_id\": null,\n \"categories"
},
{
"path": "data/benchmarks/crperelation.json",
"chars": 555,
"preview": "{\n \"benchmark_id\": \"crperelation\",\n \"name\": \"CRPErelation\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"healthcar"
},
{
"path": "data/benchmarks/crux-o.json",
"chars": 833,
"preview": "{\n \"benchmark_id\": \"crux-o\",\n \"name\": \"CRUX-O\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\"],\n \"modal"
},
{
"path": "data/benchmarks/cruxeval-input-cot.json",
"chars": 855,
"preview": "{\n \"benchmark_id\": \"cruxeval-input-cot\",\n \"name\": \"CRUXEval-Input-CoT\",\n \"parent_benchmark_id\": null,\n \"categories\":"
},
{
"path": "data/benchmarks/cruxeval-o.json",
"chars": 855,
"preview": "{\n \"benchmark_id\": \"cruxeval-o\",\n \"name\": \"CruxEval-O\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\"],\n"
},
{
"path": "data/benchmarks/cruxeval-output-cot.json",
"chars": 846,
"preview": "{\n \"benchmark_id\": \"cruxeval-output-cot\",\n \"name\": \"CRUXEval-Output-CoT\",\n \"parent_benchmark_id\": null,\n \"categories"
},
{
"path": "data/benchmarks/csimpleqa.json",
"chars": 785,
"preview": "{\n \"benchmark_id\": \"csimpleqa\",\n \"name\": \"CSimpleQA\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\", \"lang"
},
{
"path": "data/benchmarks/cybersecurity-ctfs.json",
"chars": 719,
"preview": "{\n \"benchmark_id\": \"cybersecurity-ctfs\",\n \"name\": \"Cybersecurity CTFs\",\n \"parent_benchmark_id\": null,\n \"categories\":"
},
{
"path": "data/benchmarks/dermmcqa.json",
"chars": 593,
"preview": "{\n \"benchmark_id\": \"dermmcqa\",\n \"name\": \"DermMCQA\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"healthcare\"],\n \""
},
{
"path": "data/benchmarks/docvqa.json",
"chars": 770,
"preview": "{\n \"benchmark_id\": \"docvqa\",\n \"name\": \"DocVQA\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vision\", \"multimodal\""
},
{
"path": "data/benchmarks/docvqatest.json",
"chars": 789,
"preview": "{\n \"benchmark_id\": \"docvqatest\",\n \"name\": \"DocVQAtest\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vision\", \"mul"
},
{
"path": "data/benchmarks/drop.json",
"chars": 822,
"preview": "{\n \"benchmark_id\": \"drop\",\n \"name\": \"DROP\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"math\"],\n \"m"
},
{
"path": "data/benchmarks/ds-arena-code.json",
"chars": 681,
"preview": "{\n \"benchmark_id\": \"ds-arena-code\",\n \"name\": \"DS-Arena-Code\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoni"
},
{
"path": "data/benchmarks/ds-fim-eval.json",
"chars": 573,
"preview": "{\n \"benchmark_id\": \"ds-fim-eval\",\n \"name\": \"DS-FIM-Eval\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\"],\n"
},
{
"path": "data/benchmarks/eclektic.json",
"chars": 684,
"preview": "{\n \"benchmark_id\": \"eclektic\",\n \"name\": \"ECLeKTic\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"language\", \"reaso"
},
{
"path": "data/benchmarks/egoschema.json",
"chars": 702,
"preview": "{\n \"benchmark_id\": \"egoschema\",\n \"name\": \"EgoSchema\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vision\", \"reaso"
},
{
"path": "data/benchmarks/erqa.json",
"chars": 770,
"preview": "{\n \"benchmark_id\": \"erqa\",\n \"name\": \"ERQA\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vision\", \"reasoning\", \"sp"
},
{
"path": "data/benchmarks/evalplus.json",
"chars": 699,
"preview": "{\n \"benchmark_id\": \"evalplus\",\n \"name\": \"EvalPlus\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"code"
},
{
"path": "data/benchmarks/facts-grounding.json",
"chars": 719,
"preview": "{\n \"benchmark_id\": \"facts-grounding\",\n \"name\": \"FACTS Grounding\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"rea"
},
{
"path": "data/benchmarks/factscore.json",
"chars": 701,
"preview": "{\n \"benchmark_id\": \"factscore\",\n \"name\": \"FActScore\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\"],\n "
},
{
"path": "data/benchmarks/finqa.json",
"chars": 722,
"preview": "{\n \"benchmark_id\": \"finqa\",\n \"name\": \"FinQA\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"finance\", \"math\", \"reas"
},
{
"path": "data/benchmarks/flenqa.json",
"chars": 754,
"preview": "{\n \"benchmark_id\": \"flenqa\",\n \"name\": \"FlenQA\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"long_con"
},
{
"path": "data/benchmarks/fleurs.json",
"chars": 725,
"preview": "{\n \"benchmark_id\": \"fleurs\",\n \"name\": \"FLEURS\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"language\", \"speech-to"
},
{
"path": "data/benchmarks/frames.json",
"chars": 740,
"preview": "{\n \"benchmark_id\": \"frames\",\n \"name\": \"FRAMES\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"search\"]"
},
{
"path": "data/benchmarks/french-mmlu.json",
"chars": 692,
"preview": "{\n \"benchmark_id\": \"french-mmlu\",\n \"name\": \"French MMLU\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\", \""
},
{
"path": "data/benchmarks/frontiermath.json",
"chars": 707,
"preview": "{\n \"benchmark_id\": \"frontiermath\",\n \"name\": \"FrontierMath\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"r"
},
{
"path": "data/benchmarks/functionalmath.json",
"chars": 670,
"preview": "{\n \"benchmark_id\": \"functionalmath\",\n \"name\": \"FunctionalMATH\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\""
},
{
"path": "data/benchmarks/giantsteps-tempo.json",
"chars": 662,
"preview": "{\n \"benchmark_id\": \"giantsteps-tempo\",\n \"name\": \"GiantSteps Tempo\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"a"
},
{
"path": "data/benchmarks/global-mmlu-lite.json",
"chars": 649,
"preview": "{\n \"benchmark_id\": \"global-mmlu-lite\",\n \"name\": \"Global-MMLU-Lite\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"g"
},
{
"path": "data/benchmarks/global-mmlu.json",
"chars": 658,
"preview": "{\n \"benchmark_id\": \"global-mmlu\",\n \"name\": \"Global-MMLU\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\", \""
},
{
"path": "data/benchmarks/gorilla-benchmark-api-bench.json",
"chars": 668,
"preview": "{\n \"benchmark_id\": \"gorilla-benchmark-api-bench\",\n \"name\": \"Gorilla Benchmark API Bench\",\n \"parent_benchmark_id\": nul"
},
{
"path": "data/benchmarks/govreport.json",
"chars": 702,
"preview": "{\n \"benchmark_id\": \"govreport\",\n \"name\": \"GovReport\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"summarization\","
},
{
"path": "data/benchmarks/gpqa-biology.json",
"chars": 638,
"preview": "{\n \"benchmark_id\": \"gpqa-biology\",\n \"name\": \"GPQA Biology\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning"
},
{
"path": "data/benchmarks/gpqa-chemistry.json",
"chars": 648,
"preview": "{\n \"benchmark_id\": \"gpqa-chemistry\",\n \"name\": \"GPQA Chemistry\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reaso"
},
{
"path": "data/benchmarks/gpqa-physics.json",
"chars": 638,
"preview": "{\n \"benchmark_id\": \"gpqa-physics\",\n \"name\": \"GPQA Physics\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning"
},
{
"path": "data/benchmarks/gpqa.json",
"chars": 643,
"preview": "{\n \"benchmark_id\": \"gpqa\",\n \"name\": \"GPQA\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"general\"],\n "
},
{
"path": "data/benchmarks/graphwalks-bfs-%3C128k.json",
"chars": 659,
"preview": "{\n \"benchmark_id\": \"graphwalks-bfs-<128k\",\n \"name\": \"Graphwalks BFS <128k\",\n \"parent_benchmark_id\": null,\n \"categori"
},
{
"path": "data/benchmarks/graphwalks-bfs-%3E128k.json",
"chars": 672,
"preview": "{\n \"benchmark_id\": \"graphwalks-bfs->128k\",\n \"name\": \"Graphwalks BFS >128k\",\n \"parent_benchmark_id\": null,\n \"categori"
},
{
"path": "data/benchmarks/graphwalks-parents-%3C128k.json",
"chars": 659,
"preview": "{\n \"benchmark_id\": \"graphwalks-parents-<128k\",\n \"name\": \"Graphwalks parents <128k\",\n \"parent_benchmark_id\": null,\n \""
},
{
"path": "data/benchmarks/graphwalks-parents-%3E128k.json",
"chars": 673,
"preview": "{\n \"benchmark_id\": \"graphwalks-parents->128k\",\n \"name\": \"Graphwalks parents >128k\",\n \"parent_benchmark_id\": null,\n \""
},
{
"path": "data/benchmarks/groundui-1k.json",
"chars": 645,
"preview": "{\n \"benchmark_id\": \"groundui-1k\",\n \"name\": \"GroundUI-1K\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"multimodal\""
},
{
"path": "data/benchmarks/gsm-8k-(cot).json",
"chars": 653,
"preview": "{\n \"benchmark_id\": \"gsm-8k-(cot)\",\n \"name\": \"GSM-8K (CoT)\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"r"
},
{
"path": "data/benchmarks/gsm8k-chat.json",
"chars": 652,
"preview": "{\n \"benchmark_id\": \"gsm8k-chat\",\n \"name\": \"GSM8K Chat\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"reaso"
},
{
"path": "data/benchmarks/gsm8k.json",
"chars": 610,
"preview": "{\n \"benchmark_id\": \"gsm8k\",\n \"name\": \"GSM8k\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"reasoning\"],\n "
},
{
"path": "data/benchmarks/hallusion-bench.json",
"chars": 701,
"preview": "{\n \"benchmark_id\": \"hallusion-bench\",\n \"name\": \"Hallusion Bench\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vis"
},
{
"path": "data/benchmarks/healthbench-hard.json",
"chars": 695,
"preview": "{\n \"benchmark_id\": \"healthbench-hard\",\n \"name\": \"HealthBench Hard\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"h"
},
{
"path": "data/benchmarks/healthbench.json",
"chars": 699,
"preview": "{\n \"benchmark_id\": \"healthbench\",\n \"name\": \"HealthBench\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"healthcare\""
},
{
"path": "data/benchmarks/hellaswag.json",
"chars": 718,
"preview": "{\n \"benchmark_id\": \"hellaswag\",\n \"name\": \"HellaSwag\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\"],\n "
},
{
"path": "data/benchmarks/hiddenmath.json",
"chars": 693,
"preview": "{\n \"benchmark_id\": \"hiddenmath\",\n \"name\": \"HiddenMath\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"reaso"
},
{
"path": "data/benchmarks/hle.json",
"chars": 682,
"preview": "{\n \"benchmark_id\": \"hle\",\n \"name\": \"HLE\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"math\"],\n \"mod"
},
{
"path": "data/benchmarks/hmmt-2025.json",
"chars": 678,
"preview": "{\n \"benchmark_id\": \"hmmt-2025\",\n \"name\": \"HMMT 2025\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\"],\n \"moda"
},
{
"path": "data/benchmarks/hmmt25.json",
"chars": 671,
"preview": "{\n \"benchmark_id\": \"hmmt25\",\n \"name\": \"HMMT25\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\"],\n \"modality\":"
},
{
"path": "data/benchmarks/humaneval+.json",
"chars": 649,
"preview": "{\n \"benchmark_id\": \"humaneval+\",\n \"name\": \"HumanEval+\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\"],\n"
},
{
"path": "data/benchmarks/humaneval-average.json",
"chars": 683,
"preview": "{\n \"benchmark_id\": \"humaneval-average\",\n \"name\": \"HumanEval-Average\",\n \"parent_benchmark_id\": null,\n \"categories\": ["
},
{
"path": "data/benchmarks/humaneval-er.json",
"chars": 673,
"preview": "{\n \"benchmark_id\": \"humaneval-er\",\n \"name\": \"HumanEval-ER\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning"
},
{
"path": "data/benchmarks/humaneval-mul.json",
"chars": 687,
"preview": "{\n \"benchmark_id\": \"humaneval-mul\",\n \"name\": \"HumanEval-Mul\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoni"
},
{
"path": "data/benchmarks/humaneval-plus.json",
"chars": 666,
"preview": "{\n \"benchmark_id\": \"humaneval-plus\",\n \"name\": \"HumanEval Plus\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reaso"
},
{
"path": "data/benchmarks/humaneval.json",
"chars": 651,
"preview": "{\n \"benchmark_id\": \"humaneval\",\n \"name\": \"HumanEval\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"co"
},
{
"path": "data/benchmarks/humanevalfim-average.json",
"chars": 623,
"preview": "{\n \"benchmark_id\": \"humanevalfim-average\",\n \"name\": \"HumanEvalFIM-Average\",\n \"parent_benchmark_id\": null,\n \"categori"
},
{
"path": "data/benchmarks/humanity's-last-exam.json",
"chars": 700,
"preview": "{\n \"benchmark_id\": \"humanity's-last-exam\",\n \"name\": \"Humanity's Last Exam\",\n \"parent_benchmark_id\": null,\n \"categori"
},
{
"path": "data/benchmarks/if.json",
"chars": 631,
"preview": "{\n \"benchmark_id\": \"if\",\n \"name\": \"IF\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\"],\n \"modality\": \"tex"
},
{
"path": "data/benchmarks/ifeval.json",
"chars": 639,
"preview": "{\n \"benchmark_id\": \"ifeval\",\n \"name\": \"IFEval\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\"],\n \"modalit"
},
{
"path": "data/benchmarks/include.json",
"chars": 470,
"preview": "{\n \"benchmark_id\": \"include\",\n \"name\": \"Include\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\"],\n \"modal"
},
{
"path": "data/benchmarks/infinitebench-en.mc.json",
"chars": 657,
"preview": "{\n \"benchmark_id\": \"infinitebench-en.mc\",\n \"name\": \"InfiniteBench/En.MC\",\n \"parent_benchmark_id\": null,\n \"categories"
},
{
"path": "data/benchmarks/infinitebench-en.qa.json",
"chars": 660,
"preview": "{\n \"benchmark_id\": \"infinitebench-en.qa\",\n \"name\": \"InfiniteBench/En.QA\",\n \"parent_benchmark_id\": null,\n \"categories"
},
{
"path": "data/benchmarks/infographicsqa.json",
"chars": 696,
"preview": "{\n \"benchmark_id\": \"infographicsqa\",\n \"name\": \"InfographicsQA\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"visio"
},
{
"path": "data/benchmarks/infovqa.json",
"chars": 670,
"preview": "{\n \"benchmark_id\": \"infovqa\",\n \"name\": \"InfoVQA\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vision\", \"multimoda"
},
{
"path": "data/benchmarks/infovqatest.json",
"chars": 652,
"preview": "{\n \"benchmark_id\": \"infovqatest\",\n \"name\": \"InfoVQAtest\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vision\", \"m"
},
{
"path": "data/benchmarks/instruct-humaneval.json",
"chars": 637,
"preview": "{\n \"benchmark_id\": \"instruct-humaneval\",\n \"name\": \"Instruct HumanEval\",\n \"parent_benchmark_id\": null,\n \"categories\":"
},
{
"path": "data/benchmarks/intergps.json",
"chars": 636,
"preview": "{\n \"benchmark_id\": \"intergps\",\n \"name\": \"InterGPS\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"spatial_r"
},
{
"path": "data/benchmarks/internal-api-instruction-following-(hard).json",
"chars": 572,
"preview": "{\n \"benchmark_id\": \"internal-api-instruction-following-(hard)\",\n \"name\": \"Internal API instruction following (hard)\",\n"
},
{
"path": "data/benchmarks/lbpp-(v2).json",
"chars": 562,
"preview": "{\n \"benchmark_id\": \"lbpp-(v2)\",\n \"name\": \"LBPP (v2)\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\"],\n "
},
{
"path": "data/benchmarks/livebench-20241125.json",
"chars": 849,
"preview": "{\n \"benchmark_id\": \"livebench-20241125\",\n \"name\": \"LiveBench 20241125\",\n \"parent_benchmark_id\": null,\n \"categories\":"
},
{
"path": "data/benchmarks/livebench.json",
"chars": 832,
"preview": "{\n \"benchmark_id\": \"livebench\",\n \"name\": \"LiveBench\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"reasoni"
},
{
"path": "data/benchmarks/livecodebench(01-09).json",
"chars": 908,
"preview": "{\n \"benchmark_id\": \"livecodebench(01-09)\",\n \"name\": \"LiveCodeBench(01-09)\",\n \"parent_benchmark_id\": null,\n \"categori"
},
{
"path": "data/benchmarks/livecodebench-v5-24.12-25.2.json",
"chars": 922,
"preview": "{\n \"benchmark_id\": \"livecodebench-v5-24.12-25.2\",\n \"name\": \"LiveCodeBench v5 24.12-25.2\",\n \"parent_benchmark_id\": nul"
},
{
"path": "data/benchmarks/livecodebench-v5.json",
"chars": 900,
"preview": "{\n \"benchmark_id\": \"livecodebench-v5\",\n \"name\": \"LiveCodeBench v5\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"r"
},
{
"path": "data/benchmarks/livecodebench-v6.json",
"chars": 900,
"preview": "{\n \"benchmark_id\": \"livecodebench-v6\",\n \"name\": \"LiveCodeBench v6\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"r"
},
{
"path": "data/benchmarks/livecodebench.json",
"chars": 903,
"preview": "{\n \"benchmark_id\": \"livecodebench\",\n \"name\": \"LiveCodeBench\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoni"
},
{
"path": "data/benchmarks/longbench-v2.json",
"chars": 943,
"preview": "{\n \"benchmark_id\": \"longbench-v2\",\n \"name\": \"LongBench v2\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"long_cont"
},
{
"path": "data/benchmarks/longfact-concepts.json",
"chars": 779,
"preview": "{\n \"benchmark_id\": \"longfact-concepts\",\n \"name\": \"LongFact Concepts\",\n \"parent_benchmark_id\": null,\n \"categories\": ["
},
{
"path": "data/benchmarks/longfact-objects.json",
"chars": 777,
"preview": "{\n \"benchmark_id\": \"longfact-objects\",\n \"name\": \"LongFact Objects\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"g"
},
{
"path": "data/benchmarks/longvideobench.json",
"chars": 830,
"preview": "{\n \"benchmark_id\": \"longvideobench\",\n \"name\": \"LongVideoBench\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"visio"
},
{
"path": "data/benchmarks/lsat.json",
"chars": 796,
"preview": "{\n \"benchmark_id\": \"lsat\",\n \"name\": \"LSAT\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"legal\", \"gen"
},
{
"path": "data/benchmarks/lvbench.json",
"chars": 801,
"preview": "{\n \"benchmark_id\": \"lvbench\",\n \"name\": \"LVBench\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vision\", \"multimoda"
},
{
"path": "data/benchmarks/math-(cot).json",
"chars": 791,
"preview": "{\n \"benchmark_id\": \"math-(cot)\",\n \"name\": \"MATH (CoT)\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"reaso"
},
{
"path": "data/benchmarks/math-500.json",
"chars": 846,
"preview": "{\n \"benchmark_id\": \"math-500\",\n \"name\": \"MATH-500\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"reasoning"
},
{
"path": "data/benchmarks/math.json",
"chars": 817,
"preview": "{\n \"benchmark_id\": \"math\",\n \"name\": \"MATH\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"reasoning\"],\n \"m"
},
{
"path": "data/benchmarks/mathvision.json",
"chars": 751,
"preview": "{\n \"benchmark_id\": \"mathvision\",\n \"name\": \"MathVision\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"visio"
},
{
"path": "data/benchmarks/mathvista-mini.json",
"chars": 852,
"preview": "{\n \"benchmark_id\": \"mathvista-mini\",\n \"name\": \"MathVista-Mini\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\""
},
{
"path": "data/benchmarks/mathvista.json",
"chars": 833,
"preview": "{\n \"benchmark_id\": \"mathvista\",\n \"name\": \"MathVista\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"vision\""
},
{
"path": "data/benchmarks/mbpp+.json",
"chars": 766,
"preview": "{\n \"benchmark_id\": \"mbpp+\",\n \"name\": \"MBPP+\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"general\"],"
},
{
"path": "data/benchmarks/mbpp-++-base-version.json",
"chars": 831,
"preview": "{\n \"benchmark_id\": \"mbpp-++-base-version\",\n \"name\": \"MBPP ++ base version\",\n \"parent_benchmark_id\": null,\n \"categori"
},
{
"path": "data/benchmarks/mbpp-evalplus-(base).json",
"chars": 782,
"preview": "{\n \"benchmark_id\": \"mbpp-evalplus-(base)\",\n \"name\": \"MBPP EvalPlus (base)\",\n \"parent_benchmark_id\": null,\n \"categori"
},
{
"path": "data/benchmarks/mbpp-evalplus.json",
"chars": 768,
"preview": "{\n \"benchmark_id\": \"mbpp-evalplus\",\n \"name\": \"MBPP EvalPlus\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoni"
},
{
"path": "data/benchmarks/mbpp-pass@1.json",
"chars": 807,
"preview": "{\n \"benchmark_id\": \"mbpp-pass@1\",\n \"name\": \"MBPP pass@1\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\","
},
{
"path": "data/benchmarks/mbpp-plus.json",
"chars": 836,
"preview": "{\n \"benchmark_id\": \"mbpp-plus\",\n \"name\": \"MBPP Plus\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"co"
},
{
"path": "data/benchmarks/mbpp.json",
"chars": 745,
"preview": "{\n \"benchmark_id\": \"mbpp\",\n \"name\": \"MBPP\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \"general\"],\n "
},
{
"path": "data/benchmarks/medxpertqa.json",
"chars": 781,
"preview": "{\n \"benchmark_id\": \"medxpertqa\",\n \"name\": \"MedXpertQA\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"healthcare\", "
},
{
"path": "data/benchmarks/mega-mlqa.json",
"chars": 787,
"preview": "{\n \"benchmark_id\": \"mega-mlqa\",\n \"name\": \"MEGA MLQA\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"language\", \"rea"
},
{
"path": "data/benchmarks/mega-tydi-qa.json",
"chars": 821,
"preview": "{\n \"benchmark_id\": \"mega-tydi-qa\",\n \"name\": \"MEGA TyDi QA\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"language\""
},
{
"path": "data/benchmarks/mega-udpos.json",
"chars": 757,
"preview": "{\n \"benchmark_id\": \"mega-udpos\",\n \"name\": \"MEGA UDPOS\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"language\"],\n "
},
{
"path": "data/benchmarks/mega-xcopa.json",
"chars": 799,
"preview": "{\n \"benchmark_id\": \"mega-xcopa\",\n \"name\": \"MEGA XCOPA\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"reasoning\", \""
},
{
"path": "data/benchmarks/mega-xstorycloze.json",
"chars": 801,
"preview": "{\n \"benchmark_id\": \"mega-xstorycloze\",\n \"name\": \"MEGA XStoryCloze\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"r"
},
{
"path": "data/benchmarks/meld.json",
"chars": 825,
"preview": "{\n \"benchmark_id\": \"meld\",\n \"name\": \"Meld\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"multimodal\", \"psychology\""
},
{
"path": "data/benchmarks/mgsm.json",
"chars": 788,
"preview": "{\n \"benchmark_id\": \"mgsm\",\n \"name\": \"MGSM\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"math\", \"reasoning\"],\n \"m"
},
{
"path": "data/benchmarks/mimic-cxr.json",
"chars": 831,
"preview": "{\n \"benchmark_id\": \"mimic-cxr\",\n \"name\": \"MIMIC CXR\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"healthcare\", \"v"
},
{
"path": "data/benchmarks/mlvu-m.json",
"chars": 412,
"preview": "{\n \"benchmark_id\": \"mlvu-m\",\n \"name\": \"MLVU-M\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"general\"],\n \"modalit"
},
{
"path": "data/benchmarks/mlvu.json",
"chars": 702,
"preview": "{\n \"benchmark_id\": \"mlvu\",\n \"name\": \"MLVU\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"video\", \"multimodal\", \"lo"
},
{
"path": "data/benchmarks/mm-if-eval.json",
"chars": 673,
"preview": "{\n \"benchmark_id\": \"mm-if-eval\",\n \"name\": \"MM IF-Eval\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"multimodal\", "
},
{
"path": "data/benchmarks/mm-mind2web.json",
"chars": 709,
"preview": "{\n \"benchmark_id\": \"mm-mind2web\",\n \"name\": \"MM-Mind2Web\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"multimodal\""
},
{
"path": "data/benchmarks/mm-mt-bench.json",
"chars": 646,
"preview": "{\n \"benchmark_id\": \"mm-mt-bench\",\n \"name\": \"MM-MT-Bench\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"multimodal\""
},
{
"path": "data/benchmarks/mmau-music.json",
"chars": 721,
"preview": "{\n \"benchmark_id\": \"mmau-music\",\n \"name\": \"MMAU Music\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"audio\", \"mult"
},
{
"path": "data/benchmarks/mmau-sound.json",
"chars": 743,
"preview": "{\n \"benchmark_id\": \"mmau-sound\",\n \"name\": \"MMAU Sound\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"audio\", \"mult"
},
{
"path": "data/benchmarks/mmau-speech.json",
"chars": 743,
"preview": "{\n \"benchmark_id\": \"mmau-speech\",\n \"name\": \"MMAU Speech\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"audio\", \"mu"
},
{
"path": "data/benchmarks/mmau.json",
"chars": 751,
"preview": "{\n \"benchmark_id\": \"mmau\",\n \"name\": \"MMAU\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"audio\", \"multimodal\", \"re"
},
{
"path": "data/benchmarks/mmbench-test.json",
"chars": 710,
"preview": "{\n \"benchmark_id\": \"mmbench-test\",\n \"name\": \"MMBench_test\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vision\", "
},
{
"path": "data/benchmarks/mmbench-v1.1.json",
"chars": 722,
"preview": "{\n \"benchmark_id\": \"mmbench-v1.1\",\n \"name\": \"MMBench-V1.1\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vision\", "
},
{
"path": "data/benchmarks/mmbench-video.json",
"chars": 773,
"preview": "{\n \"benchmark_id\": \"mmbench-video\",\n \"name\": \"MMBench-Video\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"video\","
},
{
"path": "data/benchmarks/mmbench.json",
"chars": 698,
"preview": "{\n \"benchmark_id\": \"mmbench\",\n \"name\": \"MMBench\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vision\", \"multimoda"
},
{
"path": "data/benchmarks/mme-realworld.json",
"chars": 805,
"preview": "{\n \"benchmark_id\": \"mme-realworld\",\n \"name\": \"MME-RealWorld\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vision\""
},
{
"path": "data/benchmarks/mme.json",
"chars": 739,
"preview": "{\n \"benchmark_id\": \"mme\",\n \"name\": \"MME\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"vision\", \"multimodal\", \"rea"
},
{
"path": "data/benchmarks/mmlu-(cot).json",
"chars": 787,
"preview": "{\n \"benchmark_id\": \"mmlu-(cot)\",\n \"name\": \"MMLU (CoT)\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"language\", \"r"
},
{
"path": "data/benchmarks/mmlu-base.json",
"chars": 806,
"preview": "{\n \"benchmark_id\": \"mmlu-base\",\n \"name\": \"MMLU-Base\",\n \"parent_benchmark_id\": null,\n \"categories\": [\"language\", \"rea"
}
]
// ... and 578 more files (download for full content)
About this extraction
This page contains the full source code of the JonathanChavezTamales/LLMStats GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 778 files (2.2 MB), approximately 611.7k tokens, and a symbol index with 9 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.