gitextract_261_qksq/

├── .github/
│   ├── pull_request_template.md
│   └── workflows/
│       └── schema-validation.yml
├── .gitignore
├── .vscode/
│   └── settings.json
├── CONTRIBUTING.md
├── LICENSE.md
├── README.md
├── data/
│   ├── .github/
│   │   └── CODEOWNERS
│   ├── benchmarks/
│   │   ├── aa-index.json
│   │   ├── acebench.json
│   │   ├── activitynet.json
│   │   ├── agieval.json
│   │   ├── ai2-reasoning-challenge-(arc).json
│   │   ├── ai2d.json
│   │   ├── aider-polyglot-edit.json
│   │   ├── aider-polyglot.json
│   │   ├── aider.json
│   │   ├── aime-2024.json
│   │   ├── aime-2025.json
│   │   ├── aime.json
│   │   ├── aitz-em.json
│   │   ├── alignbench.json
│   │   ├── alpacaeval-2.0.json
│   │   ├── amc-2022-23.json
│   │   ├── android-control-high-em.json
│   │   ├── android-control-low-em.json
│   │   ├── androidworld-sr.json
│   │   ├── api-bank.json
│   │   ├── arc-agi-v2.json
│   │   ├── arc-agi.json
│   │   ├── arc-c.json
│   │   ├── arc-e.json
│   │   ├── arc.json
│   │   ├── arena-hard-v2.json
│   │   ├── arena-hard.json
│   │   ├── attaq.json
│   │   ├── autologi.json
│   │   ├── bbh.json
│   │   ├── bfcl-v2.json
│   │   ├── bfcl-v3-multiturn.json
│   │   ├── bfcl-v3.json
│   │   ├── bfcl.json
│   │   ├── big-bench-extra-hard.json
│   │   ├── big-bench-hard.json
│   │   ├── big-bench.json
│   │   ├── bigcodebench-full.json
│   │   ├── bigcodebench-hard.json
│   │   ├── bigcodebench.json
│   │   ├── bird-sql-(dev).json
│   │   ├── blink.json
│   │   ├── boolq.json
│   │   ├── browsecomp-long-128k.json
│   │   ├── browsecomp-long-256k.json
│   │   ├── browsecomp-zh.json
│   │   ├── browsecomp.json
│   │   ├── c-eval.json
│   │   ├── cbnsl.json
│   │   ├── cc-ocr.json
│   │   ├── cfeval.json
│   │   ├── charadessta.json
│   │   ├── chartqa.json
│   │   ├── charxiv-d.json
│   │   ├── charxiv-r.json
│   │   ├── chexpert-cxr.json
│   │   ├── cluewsc.json
│   │   ├── cmmlu.json
│   │   ├── cnmo-2024.json
│   │   ├── codeforces.json
│   │   ├── codegolf-v2.2.json
│   │   ├── collie.json
│   │   ├── common-voice-15.json
│   │   ├── commonsenseqa.json
│   │   ├── complexfuncbench.json
│   │   ├── covost2-en-zh.json
│   │   ├── covost2.json
│   │   ├── crag.json
│   │   ├── creative-writing-v3.json
│   │   ├── crperelation.json
│   │   ├── crux-o.json
│   │   ├── cruxeval-input-cot.json
│   │   ├── cruxeval-o.json
│   │   ├── cruxeval-output-cot.json
│   │   ├── csimpleqa.json
│   │   ├── cybersecurity-ctfs.json
│   │   ├── dermmcqa.json
│   │   ├── docvqa.json
│   │   ├── docvqatest.json
│   │   ├── drop.json
│   │   ├── ds-arena-code.json
│   │   ├── ds-fim-eval.json
│   │   ├── eclektic.json
│   │   ├── egoschema.json
│   │   ├── erqa.json
│   │   ├── evalplus.json
│   │   ├── facts-grounding.json
│   │   ├── factscore.json
│   │   ├── finqa.json
│   │   ├── flenqa.json
│   │   ├── fleurs.json
│   │   ├── frames.json
│   │   ├── french-mmlu.json
│   │   ├── frontiermath.json
│   │   ├── functionalmath.json
│   │   ├── giantsteps-tempo.json
│   │   ├── global-mmlu-lite.json
│   │   ├── global-mmlu.json
│   │   ├── gorilla-benchmark-api-bench.json
│   │   ├── govreport.json
│   │   ├── gpqa-biology.json
│   │   ├── gpqa-chemistry.json
│   │   ├── gpqa-physics.json
│   │   ├── gpqa.json
│   │   ├── graphwalks-bfs-%3C128k.json
│   │   ├── graphwalks-bfs-%3E128k.json
│   │   ├── graphwalks-parents-%3C128k.json
│   │   ├── graphwalks-parents-%3E128k.json
│   │   ├── groundui-1k.json
│   │   ├── gsm-8k-(cot).json
│   │   ├── gsm8k-chat.json
│   │   ├── gsm8k.json
│   │   ├── hallusion-bench.json
│   │   ├── healthbench-hard.json
│   │   ├── healthbench.json
│   │   ├── hellaswag.json
│   │   ├── hiddenmath.json
│   │   ├── hle.json
│   │   ├── hmmt-2025.json
│   │   ├── hmmt25.json
│   │   ├── humaneval+.json
│   │   ├── humaneval-average.json
│   │   ├── humaneval-er.json
│   │   ├── humaneval-mul.json
│   │   ├── humaneval-plus.json
│   │   ├── humaneval.json
│   │   ├── humanevalfim-average.json
│   │   ├── humanity's-last-exam.json
│   │   ├── if.json
│   │   ├── ifeval.json
│   │   ├── include.json
│   │   ├── infinitebench-en.mc.json
│   │   ├── infinitebench-en.qa.json
│   │   ├── infographicsqa.json
│   │   ├── infovqa.json
│   │   ├── infovqatest.json
│   │   ├── instruct-humaneval.json
│   │   ├── intergps.json
│   │   ├── internal-api-instruction-following-(hard).json
│   │   ├── lbpp-(v2).json
│   │   ├── livebench-20241125.json
│   │   ├── livebench.json
│   │   ├── livecodebench(01-09).json
│   │   ├── livecodebench-v5-24.12-25.2.json
│   │   ├── livecodebench-v5.json
│   │   ├── livecodebench-v6.json
│   │   ├── livecodebench.json
│   │   ├── longbench-v2.json
│   │   ├── longfact-concepts.json
│   │   ├── longfact-objects.json
│   │   ├── longvideobench.json
│   │   ├── lsat.json
│   │   ├── lvbench.json
│   │   ├── math-(cot).json
│   │   ├── math-500.json
│   │   ├── math.json
│   │   ├── mathvision.json
│   │   ├── mathvista-mini.json
│   │   ├── mathvista.json
│   │   ├── mbpp+.json
│   │   ├── mbpp-++-base-version.json
│   │   ├── mbpp-evalplus-(base).json
│   │   ├── mbpp-evalplus.json
│   │   ├── mbpp-pass@1.json
│   │   ├── mbpp-plus.json
│   │   ├── mbpp.json
│   │   ├── medxpertqa.json
│   │   ├── mega-mlqa.json
│   │   ├── mega-tydi-qa.json
│   │   ├── mega-udpos.json
│   │   ├── mega-xcopa.json
│   │   ├── mega-xstorycloze.json
│   │   ├── meld.json
│   │   ├── mgsm.json
│   │   ├── mimic-cxr.json
│   │   ├── mlvu-m.json
│   │   ├── mlvu.json
│   │   ├── mm-if-eval.json
│   │   ├── mm-mind2web.json
│   │   ├── mm-mt-bench.json
│   │   ├── mmau-music.json
│   │   ├── mmau-sound.json
│   │   ├── mmau-speech.json
│   │   ├── mmau.json
│   │   ├── mmbench-test.json
│   │   ├── mmbench-v1.1.json
│   │   ├── mmbench-video.json
│   │   ├── mmbench.json
│   │   ├── mme-realworld.json
│   │   ├── mme.json
│   │   ├── mmlu-(cot).json
│   │   ├── mmlu-base.json
│   │   ├── mmlu-chat.json
│   │   ├── mmlu-french.json
│   │   ├── mmlu-pro.json
│   │   ├── mmlu-prox.json
│   │   ├── mmlu-redux-2.0.json
│   │   ├── mmlu-redux.json
│   │   ├── mmlu-stem.json
│   │   ├── mmlu.json
│   │   ├── mmmlu.json
│   │   ├── mmmu-(val).json
│   │   ├── mmmu-(validation).json
│   │   ├── mmmu-pro.json
│   │   ├── mmmu.json
│   │   ├── mmmuval.json
│   │   ├── mmstar.json
│   │   ├── mmt-bench.json
│   │   ├── mmvet.json
│   │   ├── mmvetgpt4turbo.json
│   │   ├── mobileminiwob++-sr.json
│   │   ├── mrcr-1m-(pointwise).json
│   │   ├── mrcr-1m.json
│   │   ├── mrcr-v2-(8-needle).json
│   │   ├── mrcr-v2.json
│   │   ├── mrcr.json
│   │   ├── mt-bench.json
│   │   ├── mtvqa.json
│   │   ├── muirbench.json
│   │   ├── multi-if.json
│   │   ├── multi-swe-bench.json
│   │   ├── multichallenge-(o3-mini-grader).json
│   │   ├── multichallenge.json
│   │   ├── multilf.json
│   │   ├── multilingual-mgsm-(cot).json
│   │   ├── multilingual-mmlu.json
│   │   ├── multipl-e-humaneval.json
│   │   ├── multipl-e-mbpp.json
│   │   ├── multipl-e.json
│   │   ├── musiccaps.json
│   │   ├── musr.json
│   │   ├── mvbench.json
│   │   ├── natural-questions.json
│   │   ├── natural2code.json
│   │   ├── nexus.json
│   │   ├── nih-multi-needle.json
│   │   ├── nmos.json
│   │   ├── nq.json
│   │   ├── ocrbench-v2-(en).json
│   │   ├── ocrbench-v2-(zh).json
│   │   ├── ocrbench-v2.json
│   │   ├── ocrbench.json
│   │   ├── odinw.json
│   │   ├── ojbench.json
│   │   ├── olympiadbench.json
│   │   ├── omnibench-music.json
│   │   ├── omnibench.json
│   │   ├── omnimath.json
│   │   ├── open-rewrite.json
│   │   ├── openai-mmlu.json
│   │   ├── openai-mrcr%3A-2-needle-128k.json
│   │   ├── openai-mrcr%3A-2-needle-1m.json
│   │   ├── openai-mrcr%3A-2-needle-256k.json
│   │   ├── openbookqa.json
│   │   ├── osworld-extended.json
│   │   ├── osworld-screenshot-only.json
│   │   ├── osworld.json
│   │   ├── pathmcqa.json
│   │   ├── perceptiontest.json
│   │   ├── phibench.json
│   │   ├── physicsfinals.json
│   │   ├── piqa.json
│   │   ├── pointgrounding.json
│   │   ├── polymath-en.json
│   │   ├── polymath.json
│   │   ├── pope.json
│   │   ├── popqa.json
│   │   ├── qasper.json
│   │   ├── qmsum.json
│   │   ├── realworldqa.json
│   │   ├── repobench.json
│   │   ├── repoqa.json
│   │   ├── ruler.json
│   │   ├── sat-math.json
│   │   ├── scale-multichallenge.json
│   │   ├── scicode.json
│   │   ├── scienceqa-visual.json
│   │   ├── scienceqa.json
│   │   ├── screenspot-pro.json
│   │   ├── screenspot.json
│   │   ├── simpleqa.json
│   │   ├── slakevqa.json
│   │   ├── social-iqa.json
│   │   ├── spider.json
│   │   ├── squality.json
│   │   ├── stem.json
│   │   ├── summscreenfd.json
│   │   ├── superglue.json
│   │   ├── supergpqa.json
│   │   ├── swe-bench-multilingual.json
│   │   ├── swe-bench-verified-(agentic-coding).json
│   │   ├── swe-bench-verified-(agentless).json
│   │   ├── swe-bench-verified-(multiple-attempts).json
│   │   ├── swe-bench-verified.json
│   │   ├── swe-dev.json
│   │   ├── swe-lancer-(ic-diamond-subset).json
│   │   ├── swe-lancer.json
│   │   ├── tau-bench-airline.json
│   │   ├── tau-bench-retail.json
│   │   ├── tau-bench.json
│   │   ├── tau2-airline.json
│   │   ├── tau2-retail.json
│   │   ├── tau2-telecom.json
│   │   ├── tempcompass.json
│   │   ├── terminal-bench.json
│   │   ├── terminus.json
│   │   ├── textvqa.json
│   │   ├── theoremqa.json
│   │   ├── tldr9+-(test).json
│   │   ├── translation-en-to-set1-comet22.json
│   │   ├── translation-en-to-set1-spbleu.json
│   │   ├── translation-set1-to-en-comet22.json
│   │   ├── translation-set1-to-en-spbleu.json
│   │   ├── triviaqa.json
│   │   ├── truthfulqa.json
│   │   ├── tydiqa.json
│   │   ├── uniform-bar-exam.json
│   │   ├── usamo25.json
│   │   ├── vatex.json
│   │   ├── vcr-en-easy.json
│   │   ├── vibe-eval.json
│   │   ├── video-mme-(long,-no-subtitles).json
│   │   ├── video-mme.json
│   │   ├── video-mmew-sub.json
│   │   ├── videomme-w-o-sub..json
│   │   ├── videomme-w-sub..json
│   │   ├── videommmu.json
│   │   ├── visualwebbench.json
│   │   ├── vocalsound.json
│   │   ├── voicebench-avg.json
│   │   ├── vqa-rad.json
│   │   ├── vqav2-(test).json
│   │   ├── vqav2-(val).json
│   │   ├── vqav2.json
│   │   ├── wild-bench.json
│   │   ├── winogrande.json
│   │   ├── wmt23.json
│   │   ├── wmt24++.json
│   │   ├── writingbench.json
│   │   ├── xlsum-english.json
│   │   ├── xstest.json
│   │   └── zebralogic.json
│   ├── licenses/
│   │   ├── apache_2_0.json
│   │   ├── cc_by_nc.json
│   │   ├── creative_commons_attribution_4_0_license.json
│   │   ├── deepseek.json
│   │   ├── gemma.json
│   │   ├── health_ai_developer_foundations_terms_of_use.json
│   │   ├── jamba_open_model_license.json
│   │   ├── llama3_2.json
│   │   ├── llama_3_1_community_license.json
│   │   ├── llama_3_2_community_license.json
│   │   ├── llama_3_3_community_license_agreement.json
│   │   ├── llama_4_community_license_agreement.json
│   │   ├── mistral_research_license.json
│   │   ├── mistral_research_license_(mrl)_for_research;_mistral_commercial_license_for_commercial_use.json
│   │   ├── mit.json
│   │   ├── mit_+_model_license_(commercial_use_allowed).json
│   │   ├── mit_license.json
│   │   ├── mnpl_0_1.json
│   │   ├── modified_mit_license.json
│   │   ├── nvidia_open_model_license_agreement.json
│   │   ├── proprietary.json
│   │   ├── qwen.json
│   │   ├── tongyi_qianwen.json
│   │   └── unknown.json
│   ├── organizations/
│   │   ├── ai21/
│   │   │   ├── models/
│   │   │   │   ├── jamba-1.5-large/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── jamba-1.5-mini/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── amazon/
│   │   │   ├── models/
│   │   │   │   ├── nova-lite/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── nova-micro/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── nova-pro/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── anthropic/
│   │   │   ├── models/
│   │   │   │   ├── claude-3-5-haiku-20241022/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-3-5-sonnet-20240620/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-3-5-sonnet-20241022/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-3-7-sonnet-20250219/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-3-haiku-20240307/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-3-opus-20240229/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-3-sonnet-20240229/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-haiku-4-5-20251015/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-opus-4-1-20250805/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-opus-4-20250514/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-sonnet-4-20250514/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── claude-sonnet-4-5-20250929/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── cohere/
│   │   │   ├── models/
│   │   │   │   └── command-r-plus-04-2024/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── deepseek/
│   │   │   ├── models/
│   │   │   │   ├── deepseek-r1/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-r1-0528/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-r1-distill-llama-70b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-r1-distill-llama-8b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-r1-distill-qwen-1.5b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-r1-distill-qwen-14b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-r1-distill-qwen-32b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-r1-distill-qwen-7b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-r1-zero/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-v2.5/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-v3/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-v3-0324/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-v3.1/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-v3.2-exp/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-vl2/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-vl2-small/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── deepseek-vl2-tiny/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── google/
│   │   │   ├── models/
│   │   │   │   ├── gemini-1.0-pro/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-1.5-flash/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-1.5-flash-8b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-1.5-pro/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-2.0-flash/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-2.0-flash-lite/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-2.0-flash-thinking/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-2.5-flash/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-2.5-flash-lite/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-2.5-pro/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-2.5-pro-preview-06-05/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-diffusion/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-2-27b-it/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-2-9b-it/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3-12b-it/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3-1b-it/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3-27b-it/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3-4b-it/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3n-e2b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3n-e2b-it/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3n-e2b-it-litert-preview/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3n-e4b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3n-e4b-it/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3n-e4b-it-litert-preview/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── medgemma-4b-it/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── ibm/
│   │   │   ├── models/
│   │   │   │   ├── granite-3.3-8b-base/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── granite-3.3-8b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── granite-4.0-tiny-preview/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── meta/
│   │   │   ├── models/
│   │   │   │   ├── llama-3.1-405b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.1-70b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.1-8b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.2-11b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.2-3b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.2-90b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.3-70b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-4-maverick/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── llama-4-scout/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── microsoft/
│   │   │   ├── models/
│   │   │   │   ├── phi-3.5-mini-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── phi-3.5-moe-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── phi-3.5-vision-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── phi-4/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── phi-4-mini/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── phi-4-mini-reasoning/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── phi-4-multimodal-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── phi-4-reasoning/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── phi-4-reasoning-plus/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── mistral/
│   │   │   ├── models/
│   │   │   │   ├── codestral-22b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── devstral-medium-2507/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── devstral-small-2507/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── magistral-medium/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── magistral-small-2506/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── ministral-8b-instruct-2410/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── mistral-large-2-2407/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── mistral-nemo-instruct-2407/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── mistral-small-2409/
│   │   │   │   │   └── model.json
│   │   │   │   ├── mistral-small-24b-base-2501/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── mistral-small-24b-instruct-2501/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── mistral-small-3.1-24b-base-2503/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── mistral-small-3.1-24b-instruct-2503/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── mistral-small-3.2-24b-instruct-2506/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── pixtral-12b-2409/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── pixtral-large/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── moonshotai/
│   │   │   ├── models/
│   │   │   │   ├── kimi-k1.5/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── kimi-k2-0905/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── kimi-k2-base/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── kimi-k2-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── kimi-k2-instruct-0905/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── nvidia/
│   │   │   ├── models/
│   │   │   │   ├── llama-3.1-nemotron-70b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.1-nemotron-nano-8b-v1/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.1-nemotron-ultra-253b-v1/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.3-nemotron-super-49b-v1/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── nemotron-nano-9b-v2/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── openai/
│   │   │   ├── models/
│   │   │   │   ├── gpt-3.5-turbo-0125/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4-0613/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4-turbo-2024-04-09/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4.1-2025-04-14/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4.1-mini-2025-04-14/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4.1-nano-2025-04-14/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4.5/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4o-2024-05-13/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4o-2024-08-06/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4o-mini-2024-07-18/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-5-2025-08-07/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-5-codex-2025-09-15/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-5-mini-2025-08-07/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-5-nano-2025-08-07/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-oss-120b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-oss-20b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── o1-2024-12-17/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── o1-mini/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── o1-preview/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── o1-pro/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── o3-2025-04-16/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── o3-mini/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── o3-pro-2025-06-10/
│   │   │   │   │   └── model.json
│   │   │   │   └── o4-mini/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── qwen/
│   │   │   ├── models/
│   │   │   │   ├── qvq-72b-preview/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen-2.5-14b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen-2.5-32b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen-2.5-72b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen-2.5-7b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen-2.5-coder-32b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen-2.5-coder-7b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen2-72b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen2-7b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen2-vl-72b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen2.5-omni-7b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen2.5-vl-32b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen2.5-vl-72b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen2.5-vl-7b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen3-235b-a22b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen3-235b-a22b-instruct-2507/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen3-235b-a22b-thinking-2507/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen3-30b-a3b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen3-32b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen3-next-80b-a3b-base/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen3-next-80b-a3b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen3-next-80b-a3b-thinking/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwq-32b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── qwq-32b-preview/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── unknown/
│   │   │   └── organization.json
│   │   ├── xai/
│   │   │   ├── models/
│   │   │   │   ├── grok-1.5/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── grok-1.5v/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── grok-2/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── grok-2-mini/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── grok-3/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── grok-3-mini/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── grok-4/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── grok-4-fast/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── grok-4-heavy/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── grok-code-fast-1/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   └── zai-org/
│   │       ├── models/
│   │       │   ├── glm-4.5/
│   │       │   │   ├── benchmarks.json
│   │       │   │   └── model.json
│   │       │   ├── glm-4.5-air/
│   │       │   │   ├── benchmarks.json
│   │       │   │   └── model.json
│   │       │   ├── glm-4.5v/
│   │       │   │   ├── benchmarks.json
│   │       │   │   └── model.json
│   │       │   └── glm-4.6/
│   │       │       ├── benchmarks.json
│   │       │       └── model.json
│   │       └── organization.json
│   └── providers/
│       ├── anthropic/
│       │   ├── models.json
│       │   └── provider.json
│       ├── azure/
│       │   ├── models.json
│       │   └── provider.json
│       ├── bedrock/
│       │   ├── models.json
│       │   └── provider.json
│       ├── cerebras/
│       │   ├── models.json
│       │   └── provider.json
│       ├── cohere/
│       │   ├── models.json
│       │   └── provider.json
│       ├── deepinfra/
│       │   ├── models.json
│       │   └── provider.json
│       ├── deepseek/
│       │   ├── models.json
│       │   └── provider.json
│       ├── fireworks/
│       │   ├── models.json
│       │   └── provider.json
│       ├── google/
│       │   ├── models.json
│       │   └── provider.json
│       ├── groq/
│       │   ├── models.json
│       │   └── provider.json
│       ├── hyperbolic/
│       │   ├── models.json
│       │   └── provider.json
│       ├── lambda/
│       │   ├── models.json
│       │   └── provider.json
│       ├── mistral/
│       │   ├── models.json
│       │   └── provider.json
│       ├── novita/
│       │   ├── models.json
│       │   └── provider.json
│       ├── openai/
│       │   ├── models.json
│       │   └── provider.json
│       ├── replicate/
│       │   ├── models.json
│       │   └── provider.json
│       ├── sambanova/
│       │   ├── models.json
│       │   └── provider.json
│       ├── together/
│       │   ├── models.json
│       │   └── provider.json
│       ├── xai/
│       │   ├── models.json
│       │   └── provider.json
│       └── zeroeval/
│           ├── models.json
│           └── provider.json
├── package.json
└── schemas/
    ├── README.md
    ├── benchmark-results.schema.json
    ├── benchmark.schema.json
    ├── integrity-validator.js
    ├── license.schema.json
    ├── model.schema.json
    ├── organization.schema.json
    ├── provider-models.schema.json
    ├── provider.schema.json
    └── validator.js