Repository: JonathanChavezTamales/LLMStats Branch: main Commit: 872b75f63b8d Files: 778 Total size: 2.2 MB Directory structure: gitextract_261_qksq/ ├── .github/ │ ├── pull_request_template.md │ └── workflows/ │ └── schema-validation.yml ├── .gitignore ├── .vscode/ │ └── settings.json ├── CONTRIBUTING.md ├── LICENSE.md ├── README.md ├── data/ │ ├── .github/ │ │ └── CODEOWNERS │ ├── benchmarks/ │ │ ├── aa-index.json │ │ ├── acebench.json │ │ ├── activitynet.json │ │ ├── agieval.json │ │ ├── ai2-reasoning-challenge-(arc).json │ │ ├── ai2d.json │ │ ├── aider-polyglot-edit.json │ │ ├── aider-polyglot.json │ │ ├── aider.json │ │ ├── aime-2024.json │ │ ├── aime-2025.json │ │ ├── aime.json │ │ ├── aitz-em.json │ │ ├── alignbench.json │ │ ├── alpacaeval-2.0.json │ │ ├── amc-2022-23.json │ │ ├── android-control-high-em.json │ │ ├── android-control-low-em.json │ │ ├── androidworld-sr.json │ │ ├── api-bank.json │ │ ├── arc-agi-v2.json │ │ ├── arc-agi.json │ │ ├── arc-c.json │ │ ├── arc-e.json │ │ ├── arc.json │ │ ├── arena-hard-v2.json │ │ ├── arena-hard.json │ │ ├── attaq.json │ │ ├── autologi.json │ │ ├── bbh.json │ │ ├── bfcl-v2.json │ │ ├── bfcl-v3-multiturn.json │ │ ├── bfcl-v3.json │ │ ├── bfcl.json │ │ ├── big-bench-extra-hard.json │ │ ├── big-bench-hard.json │ │ ├── big-bench.json │ │ ├── bigcodebench-full.json │ │ ├── bigcodebench-hard.json │ │ ├── bigcodebench.json │ │ ├── bird-sql-(dev).json │ │ ├── blink.json │ │ ├── boolq.json │ │ ├── browsecomp-long-128k.json │ │ ├── browsecomp-long-256k.json │ │ ├── browsecomp-zh.json │ │ ├── browsecomp.json │ │ ├── c-eval.json │ │ ├── cbnsl.json │ │ ├── cc-ocr.json │ │ ├── cfeval.json │ │ ├── charadessta.json │ │ ├── chartqa.json │ │ ├── charxiv-d.json │ │ ├── charxiv-r.json │ │ ├── chexpert-cxr.json │ │ ├── cluewsc.json │ │ ├── cmmlu.json │ │ ├── cnmo-2024.json │ │ ├── codeforces.json │ │ ├── codegolf-v2.2.json │ │ ├── collie.json │ │ ├── common-voice-15.json │ │ ├── commonsenseqa.json │ │ ├── complexfuncbench.json │ │ ├── 
covost2-en-zh.json │ │ ├── covost2.json │ │ ├── crag.json │ │ ├── creative-writing-v3.json │ │ ├── crperelation.json │ │ ├── crux-o.json │ │ ├── cruxeval-input-cot.json │ │ ├── cruxeval-o.json │ │ ├── cruxeval-output-cot.json │ │ ├── csimpleqa.json │ │ ├── cybersecurity-ctfs.json │ │ ├── dermmcqa.json │ │ ├── docvqa.json │ │ ├── docvqatest.json │ │ ├── drop.json │ │ ├── ds-arena-code.json │ │ ├── ds-fim-eval.json │ │ ├── eclektic.json │ │ ├── egoschema.json │ │ ├── erqa.json │ │ ├── evalplus.json │ │ ├── facts-grounding.json │ │ ├── factscore.json │ │ ├── finqa.json │ │ ├── flenqa.json │ │ ├── fleurs.json │ │ ├── frames.json │ │ ├── french-mmlu.json │ │ ├── frontiermath.json │ │ ├── functionalmath.json │ │ ├── giantsteps-tempo.json │ │ ├── global-mmlu-lite.json │ │ ├── global-mmlu.json │ │ ├── gorilla-benchmark-api-bench.json │ │ ├── govreport.json │ │ ├── gpqa-biology.json │ │ ├── gpqa-chemistry.json │ │ ├── gpqa-physics.json │ │ ├── gpqa.json │ │ ├── graphwalks-bfs-%3C128k.json │ │ ├── graphwalks-bfs-%3E128k.json │ │ ├── graphwalks-parents-%3C128k.json │ │ ├── graphwalks-parents-%3E128k.json │ │ ├── groundui-1k.json │ │ ├── gsm-8k-(cot).json │ │ ├── gsm8k-chat.json │ │ ├── gsm8k.json │ │ ├── hallusion-bench.json │ │ ├── healthbench-hard.json │ │ ├── healthbench.json │ │ ├── hellaswag.json │ │ ├── hiddenmath.json │ │ ├── hle.json │ │ ├── hmmt-2025.json │ │ ├── hmmt25.json │ │ ├── humaneval+.json │ │ ├── humaneval-average.json │ │ ├── humaneval-er.json │ │ ├── humaneval-mul.json │ │ ├── humaneval-plus.json │ │ ├── humaneval.json │ │ ├── humanevalfim-average.json │ │ ├── humanity's-last-exam.json │ │ ├── if.json │ │ ├── ifeval.json │ │ ├── include.json │ │ ├── infinitebench-en.mc.json │ │ ├── infinitebench-en.qa.json │ │ ├── infographicsqa.json │ │ ├── infovqa.json │ │ ├── infovqatest.json │ │ ├── instruct-humaneval.json │ │ ├── intergps.json │ │ ├── internal-api-instruction-following-(hard).json │ │ ├── lbpp-(v2).json │ │ ├── livebench-20241125.json │ │ ├── 
livebench.json │ │ ├── livecodebench(01-09).json │ │ ├── livecodebench-v5-24.12-25.2.json │ │ ├── livecodebench-v5.json │ │ ├── livecodebench-v6.json │ │ ├── livecodebench.json │ │ ├── longbench-v2.json │ │ ├── longfact-concepts.json │ │ ├── longfact-objects.json │ │ ├── longvideobench.json │ │ ├── lsat.json │ │ ├── lvbench.json │ │ ├── math-(cot).json │ │ ├── math-500.json │ │ ├── math.json │ │ ├── mathvision.json │ │ ├── mathvista-mini.json │ │ ├── mathvista.json │ │ ├── mbpp+.json │ │ ├── mbpp-++-base-version.json │ │ ├── mbpp-evalplus-(base).json │ │ ├── mbpp-evalplus.json │ │ ├── mbpp-pass@1.json │ │ ├── mbpp-plus.json │ │ ├── mbpp.json │ │ ├── medxpertqa.json │ │ ├── mega-mlqa.json │ │ ├── mega-tydi-qa.json │ │ ├── mega-udpos.json │ │ ├── mega-xcopa.json │ │ ├── mega-xstorycloze.json │ │ ├── meld.json │ │ ├── mgsm.json │ │ ├── mimic-cxr.json │ │ ├── mlvu-m.json │ │ ├── mlvu.json │ │ ├── mm-if-eval.json │ │ ├── mm-mind2web.json │ │ ├── mm-mt-bench.json │ │ ├── mmau-music.json │ │ ├── mmau-sound.json │ │ ├── mmau-speech.json │ │ ├── mmau.json │ │ ├── mmbench-test.json │ │ ├── mmbench-v1.1.json │ │ ├── mmbench-video.json │ │ ├── mmbench.json │ │ ├── mme-realworld.json │ │ ├── mme.json │ │ ├── mmlu-(cot).json │ │ ├── mmlu-base.json │ │ ├── mmlu-chat.json │ │ ├── mmlu-french.json │ │ ├── mmlu-pro.json │ │ ├── mmlu-prox.json │ │ ├── mmlu-redux-2.0.json │ │ ├── mmlu-redux.json │ │ ├── mmlu-stem.json │ │ ├── mmlu.json │ │ ├── mmmlu.json │ │ ├── mmmu-(val).json │ │ ├── mmmu-(validation).json │ │ ├── mmmu-pro.json │ │ ├── mmmu.json │ │ ├── mmmuval.json │ │ ├── mmstar.json │ │ ├── mmt-bench.json │ │ ├── mmvet.json │ │ ├── mmvetgpt4turbo.json │ │ ├── mobileminiwob++-sr.json │ │ ├── mrcr-1m-(pointwise).json │ │ ├── mrcr-1m.json │ │ ├── mrcr-v2-(8-needle).json │ │ ├── mrcr-v2.json │ │ ├── mrcr.json │ │ ├── mt-bench.json │ │ ├── mtvqa.json │ │ ├── muirbench.json │ │ ├── multi-if.json │ │ ├── multi-swe-bench.json │ │ ├── multichallenge-(o3-mini-grader).json │ │ ├── 
multichallenge.json │ │ ├── multilf.json │ │ ├── multilingual-mgsm-(cot).json │ │ ├── multilingual-mmlu.json │ │ ├── multipl-e-humaneval.json │ │ ├── multipl-e-mbpp.json │ │ ├── multipl-e.json │ │ ├── musiccaps.json │ │ ├── musr.json │ │ ├── mvbench.json │ │ ├── natural-questions.json │ │ ├── natural2code.json │ │ ├── nexus.json │ │ ├── nih-multi-needle.json │ │ ├── nmos.json │ │ ├── nq.json │ │ ├── ocrbench-v2-(en).json │ │ ├── ocrbench-v2-(zh).json │ │ ├── ocrbench-v2.json │ │ ├── ocrbench.json │ │ ├── odinw.json │ │ ├── ojbench.json │ │ ├── olympiadbench.json │ │ ├── omnibench-music.json │ │ ├── omnibench.json │ │ ├── omnimath.json │ │ ├── open-rewrite.json │ │ ├── openai-mmlu.json │ │ ├── openai-mrcr%3A-2-needle-128k.json │ │ ├── openai-mrcr%3A-2-needle-1m.json │ │ ├── openai-mrcr%3A-2-needle-256k.json │ │ ├── openbookqa.json │ │ ├── osworld-extended.json │ │ ├── osworld-screenshot-only.json │ │ ├── osworld.json │ │ ├── pathmcqa.json │ │ ├── perceptiontest.json │ │ ├── phibench.json │ │ ├── physicsfinals.json │ │ ├── piqa.json │ │ ├── pointgrounding.json │ │ ├── polymath-en.json │ │ ├── polymath.json │ │ ├── pope.json │ │ ├── popqa.json │ │ ├── qasper.json │ │ ├── qmsum.json │ │ ├── realworldqa.json │ │ ├── repobench.json │ │ ├── repoqa.json │ │ ├── ruler.json │ │ ├── sat-math.json │ │ ├── scale-multichallenge.json │ │ ├── scicode.json │ │ ├── scienceqa-visual.json │ │ ├── scienceqa.json │ │ ├── screenspot-pro.json │ │ ├── screenspot.json │ │ ├── simpleqa.json │ │ ├── slakevqa.json │ │ ├── social-iqa.json │ │ ├── spider.json │ │ ├── squality.json │ │ ├── stem.json │ │ ├── summscreenfd.json │ │ ├── superglue.json │ │ ├── supergpqa.json │ │ ├── swe-bench-multilingual.json │ │ ├── swe-bench-verified-(agentic-coding).json │ │ ├── swe-bench-verified-(agentless).json │ │ ├── swe-bench-verified-(multiple-attempts).json │ │ ├── swe-bench-verified.json │ │ ├── swe-dev.json │ │ ├── swe-lancer-(ic-diamond-subset).json │ │ ├── swe-lancer.json │ │ ├── tau-bench-airline.json 
│ │ ├── tau-bench-retail.json │ │ ├── tau-bench.json │ │ ├── tau2-airline.json │ │ ├── tau2-retail.json │ │ ├── tau2-telecom.json │ │ ├── tempcompass.json │ │ ├── terminal-bench.json │ │ ├── terminus.json │ │ ├── textvqa.json │ │ ├── theoremqa.json │ │ ├── tldr9+-(test).json │ │ ├── translation-en-to-set1-comet22.json │ │ ├── translation-en-to-set1-spbleu.json │ │ ├── translation-set1-to-en-comet22.json │ │ ├── translation-set1-to-en-spbleu.json │ │ ├── triviaqa.json │ │ ├── truthfulqa.json │ │ ├── tydiqa.json │ │ ├── uniform-bar-exam.json │ │ ├── usamo25.json │ │ ├── vatex.json │ │ ├── vcr-en-easy.json │ │ ├── vibe-eval.json │ │ ├── video-mme-(long,-no-subtitles).json │ │ ├── video-mme.json │ │ ├── video-mmew-sub.json │ │ ├── videomme-w-o-sub..json │ │ ├── videomme-w-sub..json │ │ ├── videommmu.json │ │ ├── visualwebbench.json │ │ ├── vocalsound.json │ │ ├── voicebench-avg.json │ │ ├── vqa-rad.json │ │ ├── vqav2-(test).json │ │ ├── vqav2-(val).json │ │ ├── vqav2.json │ │ ├── wild-bench.json │ │ ├── winogrande.json │ │ ├── wmt23.json │ │ ├── wmt24++.json │ │ ├── writingbench.json │ │ ├── xlsum-english.json │ │ ├── xstest.json │ │ └── zebralogic.json │ ├── licenses/ │ │ ├── apache_2_0.json │ │ ├── cc_by_nc.json │ │ ├── creative_commons_attribution_4_0_license.json │ │ ├── deepseek.json │ │ ├── gemma.json │ │ ├── health_ai_developer_foundations_terms_of_use.json │ │ ├── jamba_open_model_license.json │ │ ├── llama3_2.json │ │ ├── llama_3_1_community_license.json │ │ ├── llama_3_2_community_license.json │ │ ├── llama_3_3_community_license_agreement.json │ │ ├── llama_4_community_license_agreement.json │ │ ├── mistral_research_license.json │ │ ├── mistral_research_license_(mrl)_for_research;_mistral_commercial_license_for_commercial_use.json │ │ ├── mit.json │ │ ├── mit_+_model_license_(commercial_use_allowed).json │ │ ├── mit_license.json │ │ ├── mnpl_0_1.json │ │ ├── modified_mit_license.json │ │ ├── nvidia_open_model_license_agreement.json │ │ ├── proprietary.json │ 
│ ├── qwen.json │ │ ├── tongyi_qianwen.json │ │ └── unknown.json │ ├── organizations/ │ │ ├── ai21/ │ │ │ ├── models/ │ │ │ │ ├── jamba-1.5-large/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ └── jamba-1.5-mini/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ └── organization.json │ │ ├── amazon/ │ │ │ ├── models/ │ │ │ │ ├── nova-lite/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── nova-micro/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ └── nova-pro/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ └── organization.json │ │ ├── anthropic/ │ │ │ ├── models/ │ │ │ │ ├── claude-3-5-haiku-20241022/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── claude-3-5-sonnet-20240620/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── claude-3-5-sonnet-20241022/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── claude-3-7-sonnet-20250219/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── claude-3-haiku-20240307/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── claude-3-opus-20240229/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── claude-3-sonnet-20240229/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── claude-haiku-4-5-20251015/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── claude-opus-4-1-20250805/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── claude-opus-4-20250514/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── claude-sonnet-4-20250514/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ └── claude-sonnet-4-5-20250929/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ └── organization.json │ │ ├── cohere/ │ │ │ ├── models/ │ │ │ │ └── command-r-plus-04-2024/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ └── organization.json │ │ ├── deepseek/ │ │ │ ├── models/ │ │ │ │ ├── deepseek-r1/ │ │ │ 
│ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── deepseek-r1-0528/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── deepseek-r1-distill-llama-70b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── deepseek-r1-distill-llama-8b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── deepseek-r1-distill-qwen-1.5b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── deepseek-r1-distill-qwen-14b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── deepseek-r1-distill-qwen-32b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── deepseek-r1-distill-qwen-7b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── deepseek-r1-zero/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── deepseek-v2.5/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── deepseek-v3/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── deepseek-v3-0324/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── deepseek-v3.1/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── deepseek-v3.2-exp/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── deepseek-vl2/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── deepseek-vl2-small/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ └── deepseek-vl2-tiny/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ └── organization.json │ │ ├── google/ │ │ │ ├── models/ │ │ │ │ ├── gemini-1.0-pro/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemini-1.5-flash/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemini-1.5-flash-8b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemini-1.5-pro/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemini-2.0-flash/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemini-2.0-flash-lite/ │ │ │ │ │ 
├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemini-2.0-flash-thinking/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemini-2.5-flash/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemini-2.5-flash-lite/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemini-2.5-pro/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemini-2.5-pro-preview-06-05/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemini-diffusion/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemma-2-27b-it/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemma-2-9b-it/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemma-3-12b-it/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemma-3-1b-it/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemma-3-27b-it/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemma-3-4b-it/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemma-3n-e2b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemma-3n-e2b-it/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemma-3n-e2b-it-litert-preview/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemma-3n-e4b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemma-3n-e4b-it/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gemma-3n-e4b-it-litert-preview/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ └── medgemma-4b-it/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ └── organization.json │ │ ├── ibm/ │ │ │ ├── models/ │ │ │ │ ├── granite-3.3-8b-base/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── granite-3.3-8b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ └── granite-4.0-tiny-preview/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── 
model.json │ │ │ └── organization.json │ │ ├── meta/ │ │ │ ├── models/ │ │ │ │ ├── llama-3.1-405b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── llama-3.1-70b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── llama-3.1-8b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── llama-3.2-11b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── llama-3.2-3b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── llama-3.2-90b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── llama-3.3-70b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── llama-4-maverick/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ └── llama-4-scout/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ └── organization.json │ │ ├── microsoft/ │ │ │ ├── models/ │ │ │ │ ├── phi-3.5-mini-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── phi-3.5-moe-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── phi-3.5-vision-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── phi-4/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── phi-4-mini/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── phi-4-mini-reasoning/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── phi-4-multimodal-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── phi-4-reasoning/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ └── phi-4-reasoning-plus/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ └── organization.json │ │ ├── mistral/ │ │ │ ├── models/ │ │ │ │ ├── codestral-22b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── devstral-medium-2507/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── devstral-small-2507/ │ │ │ │ │ ├── 
benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── magistral-medium/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── magistral-small-2506/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── ministral-8b-instruct-2410/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── mistral-large-2-2407/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── mistral-nemo-instruct-2407/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── mistral-small-2409/ │ │ │ │ │ └── model.json │ │ │ │ ├── mistral-small-24b-base-2501/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── mistral-small-24b-instruct-2501/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── mistral-small-3.1-24b-base-2503/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── mistral-small-3.1-24b-instruct-2503/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── mistral-small-3.2-24b-instruct-2506/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── pixtral-12b-2409/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ └── pixtral-large/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ └── organization.json │ │ ├── moonshotai/ │ │ │ ├── models/ │ │ │ │ ├── kimi-k1.5/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── kimi-k2-0905/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── kimi-k2-base/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── kimi-k2-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ └── kimi-k2-instruct-0905/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ └── organization.json │ │ ├── nvidia/ │ │ │ ├── models/ │ │ │ │ ├── llama-3.1-nemotron-70b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── llama-3.1-nemotron-nano-8b-v1/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── 
llama-3.1-nemotron-ultra-253b-v1/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── llama-3.3-nemotron-super-49b-v1/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ └── nemotron-nano-9b-v2/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ └── organization.json │ │ ├── openai/ │ │ │ ├── models/ │ │ │ │ ├── gpt-3.5-turbo-0125/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gpt-4-0613/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gpt-4-turbo-2024-04-09/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gpt-4.1-2025-04-14/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gpt-4.1-mini-2025-04-14/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gpt-4.1-nano-2025-04-14/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gpt-4.5/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gpt-4o-2024-05-13/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gpt-4o-2024-08-06/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gpt-4o-mini-2024-07-18/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gpt-5-2025-08-07/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gpt-5-codex-2025-09-15/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gpt-5-mini-2025-08-07/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gpt-5-nano-2025-08-07/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gpt-oss-120b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── gpt-oss-20b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── o1-2024-12-17/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── o1-mini/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── o1-preview/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── o1-pro/ │ │ │ │ │ ├── 
benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── o3-2025-04-16/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── o3-mini/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── o3-pro-2025-06-10/ │ │ │ │ │ └── model.json │ │ │ │ └── o4-mini/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ └── organization.json │ │ ├── qwen/ │ │ │ ├── models/ │ │ │ │ ├── qvq-72b-preview/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen-2.5-14b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen-2.5-32b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen-2.5-72b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen-2.5-7b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen-2.5-coder-32b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen-2.5-coder-7b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen2-72b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen2-7b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen2-vl-72b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen2.5-omni-7b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen2.5-vl-32b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen2.5-vl-72b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen2.5-vl-7b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen3-235b-a22b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen3-235b-a22b-instruct-2507/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen3-235b-a22b-thinking-2507/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen3-30b-a3b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen3-32b/ │ │ │ │ 
│ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen3-next-80b-a3b-base/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen3-next-80b-a3b-instruct/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwen3-next-80b-a3b-thinking/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── qwq-32b/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ └── qwq-32b-preview/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ └── organization.json │ │ ├── unknown/ │ │ │ └── organization.json │ │ ├── xai/ │ │ │ ├── models/ │ │ │ │ ├── grok-1.5/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── grok-1.5v/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── grok-2/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── grok-2-mini/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── grok-3/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── grok-3-mini/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── grok-4/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── grok-4-fast/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ ├── grok-4-heavy/ │ │ │ │ │ ├── benchmarks.json │ │ │ │ │ └── model.json │ │ │ │ └── grok-code-fast-1/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ └── organization.json │ │ └── zai-org/ │ │ ├── models/ │ │ │ ├── glm-4.5/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ ├── glm-4.5-air/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ ├── glm-4.5v/ │ │ │ │ ├── benchmarks.json │ │ │ │ └── model.json │ │ │ └── glm-4.6/ │ │ │ ├── benchmarks.json │ │ │ └── model.json │ │ └── organization.json │ └── providers/ │ ├── anthropic/ │ │ ├── models.json │ │ └── provider.json │ ├── azure/ │ │ ├── models.json │ │ └── provider.json │ ├── bedrock/ │ │ ├── models.json │ │ └── provider.json │ ├── cerebras/ │ │ ├── models.json │ │ └── provider.json │ ├── 
cohere/ │ │ ├── models.json │ │ └── provider.json │ ├── deepinfra/ │ │ ├── models.json │ │ └── provider.json │ ├── deepseek/ │ │ ├── models.json │ │ └── provider.json │ ├── fireworks/ │ │ ├── models.json │ │ └── provider.json │ ├── google/ │ │ ├── models.json │ │ └── provider.json │ ├── groq/ │ │ ├── models.json │ │ └── provider.json │ ├── hyperbolic/ │ │ ├── models.json │ │ └── provider.json │ ├── lambda/ │ │ ├── models.json │ │ └── provider.json │ ├── mistral/ │ │ ├── models.json │ │ └── provider.json │ ├── novita/ │ │ ├── models.json │ │ └── provider.json │ ├── openai/ │ │ ├── models.json │ │ └── provider.json │ ├── replicate/ │ │ ├── models.json │ │ └── provider.json │ ├── sambanova/ │ │ ├── models.json │ │ └── provider.json │ ├── together/ │ │ ├── models.json │ │ └── provider.json │ ├── xai/ │ │ ├── models.json │ │ └── provider.json │ └── zeroeval/ │ ├── models.json │ └── provider.json ├── package.json └── schemas/ ├── README.md ├── benchmark-results.schema.json ├── benchmark.schema.json ├── integrity-validator.js ├── license.schema.json ├── model.schema.json ├── organization.schema.json ├── provider-models.schema.json ├── provider.schema.json └── validator.js ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/pull_request_template.md ================================================ ## Description References: ## Type of Change - [ ] Model Update/Addition - [ ] Qualitative Metrics (Benchmark Results) Update/Addition - [ ] Provider Update/Addition - [ ] Other (please specify) ## Checklist - [ ] I've read the [CONTRIBUTING.md](../CONTRIBUTING.md) guidelines - [ ] My changes are accurate and properly referenced ================================================ FILE: .github/workflows/schema-validation.yml ================================================ name: Schema Validation on: pull_request: branches: [main] jobs: validate: name: Validate 
Schema runs-on: ubuntu-latest steps: - name: Checkout code uses: actions/checkout@v3 - name: Setup Node.js uses: actions/setup-node@v3 with: node-version: "16" cache: "npm" - name: Install dependencies run: npm ci - name: Run schema validation run: node schemas/validator.js ================================================ FILE: .gitignore ================================================ /node_modules ================================================ FILE: .vscode/settings.json ================================================ { "json.schemas": [ { "fileMatch": ["/models/*/model.json"], "url": "../schemas/models-schema.json" }, { "fileMatch": ["/models/*/qualitativemetrics.json"], "url": "../schemas/qualitativemetrics-schema.json" }, { "fileMatch": ["/providers/*/provider.json"], "url": "../schemas/providers-schema.json" }, { "fileMatch": ["/providers/*/providermodels.json"], "url": "../schemas/providermodels-schema.json" } ] } ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to LLM Stats Thank you for your interest in contributing. This guide outlines the process for updating and adding information to the LLM Stats database. ## Table of Contents - [Overview](#overview) - [Data Structure](#data-structure) - [General Guidelines](#general-guidelines) - [Organizations](#organizations) - [Models](#models) - [Benchmark Results](#benchmark-results) - [Benchmarks](#benchmarks) - [Providers](#providers) - [Licenses](#licenses) - [Validation](#validation) - [Submitting Your Contribution](#submitting-your-contribution) ## Overview All data is organized in the `data/data/` directory with a hierarchical structure. Each entity type has its own JSON schema definition in `schemas/` that validates the data structure. 
## Data Structure ``` data/ ├── data/ │ ├── organizations/ │ │ └── [organization_id]/ │ │ ├── organization.json │ │ └── models/ │ │ └── [model_id]/ │ │ ├── model.json │ │ └── benchmarks.json │ ├── providers/ │ │ └── [provider_id]/ │ │ ├── provider.json │ │ └── models.json │ ├── licenses/ │ │ └── [license_id].json │ └── benchmarks/ │ └── [benchmark_id].json └── schemas/ ├── organization.schema.json ├── model.schema.json ├── benchmark-results.schema.json ├── benchmark.schema.json ├── provider.schema.json ├── provider-models.schema.json └── license.schema.json ``` ## General Guidelines 1. **Accuracy First**: Ensure all data is accurate and sourced from authoritative references 2. **Follow Structure**: Adhere to the existing file structure and naming conventions 3. **Consistent Formatting**: Use consistent JSON formatting with 2-space indentation 4. **One Change per PR**: Submit one pull request per logical change (e.g., one model, one provider) 5. **Schema Validation**: All data files must validate against their respective JSON schemas 6. **Required Fields**: Pay attention to required vs optional fields in schemas 7. **Timestamps**: Use ISO 8601 format for dates (YYYY-MM-DD or full timestamp) ## Organizations Organizations represent the entities that create and release models (e.g., OpenAI, Anthropic, Meta). ### Location `data/data/organizations/[organization_id]/organization.json` ### Adding a New Organization 1. Create a new directory: `data/data/organizations/[organization_id]/` 2. Create `organization.json` with the following structure: ```json { "organization_id": "organization-name", "name": "Organization Display Name", "website": "https://organization.com", "description": "Brief description of the organization", "country": "US", "created_at": "2025-10-02T00:00:00.000000+00:00", "updated_at": "2025-10-02T00:00:00.000000+00:00" } ``` 3. Validate against `schemas/organization.schema.json` 4. 
Create a `models/` subdirectory for future models ### Updating an Existing Organization 1. Navigate to `data/data/organizations/[organization_id]/organization.json` 2. Update the relevant fields 3. Update the `updated_at` timestamp 4. Validate against the schema ## Models Models are stored within their respective organization directories. ### Location `data/data/organizations/[organization_id]/models/[model_id]/` ### Adding a New Model 1. Ensure the organization exists in `data/data/organizations/` 2. Ensure the license exists in `data/data/licenses/` 3. Create a new directory: `data/data/organizations/[organization_id]/models/[model_id]/` 4. Create two files in this directory: #### `model.json` ```json { "model_id": "model-name-version", "name": "Model Display Name", "organization_id": "organization-name", "fine_tuned_from_model_id": null, "description": "Detailed description of the model's capabilities", "release_date": "2024-10-22", "announcement_date": "2024-10-22", "license_id": "proprietary", "multimodal": false, "knowledge_cutoff": "2024-04-01", "param_count": 7000000000, "training_tokens": 15000000000000, "available_in_zeroeval": true, "source_api_ref": "https://...", "source_playground": "https://...", "source_paper": "https://...", "source_scorecard_blog_link": "https://...", "source_repo_link": "https://github.com/...", "source_weights_link": "https://huggingface.co/...", "created_at": "2025-10-02T00:00:00.000000+00:00", "updated_at": "2025-10-02T00:00:00.000000+00:00", "model_family_id": null } ``` **Required Fields**: `model_id`, `name`, `organization_id`, `release_date`, `license_id`, `multimodal` **Optional Fields**: Set to `null` if not applicable #### `benchmarks.json` Start with an empty array if no benchmark results are available yet: ```json [] ``` 5. Validate both files against their respective schemas ### Updating an Existing Model 1. Navigate to `data/data/organizations/[organization_id]/models/[model_id]/model.json` 2. 
Update the relevant fields 3. Update the `updated_at` timestamp 4. Validate against `schemas/model.schema.json` ## Benchmark Results Benchmark results are stored in the `benchmarks.json` file within each model directory. ### Location `data/data/organizations/[organization_id]/models/[model_id]/benchmarks.json` ### Adding Benchmark Results 1. Ensure the benchmark exists in `data/data/benchmarks/` 2. Ensure the model exists 3. Add a new entry to the `benchmarks.json` array: ```json [ { "benchmark_id": "mmlu", "score": 85.5, "score_unit": "percentage", "source_link": "https://example.com/results", "created_at": "2025-10-02T00:00:00.000000+00:00", "updated_at": "2025-10-02T00:00:00.000000+00:00" } ] ``` 4. Validate against `schemas/benchmark-results.schema.json` ### Updating Benchmark Results 1. Locate the specific result in the array 2. Update the `score` and/or `source_link` 3. Update the `updated_at` timestamp 4. Ensure the `source_link` is reliable and authoritative ## Benchmarks Benchmarks define the evaluation tests used to measure model performance. ### Location `data/data/benchmarks/[benchmark_id].json` ### Adding a New Benchmark 1. Create a new file: `data/data/benchmarks/[benchmark_id].json` 2. Follow this structure: ```json { "benchmark_id": "benchmark-name", "name": "Benchmark Display Name", "description": "Description of what this benchmark measures", "category": "reasoning", "source_link": "https://...", "created_at": "2025-10-02T00:00:00.000000+00:00", "updated_at": "2025-10-02T00:00:00.000000+00:00" } ``` 3. Validate against `schemas/benchmark.schema.json` ## Providers Providers are services that offer access to models (e.g., OpenAI API, AWS Bedrock, Google Vertex AI). ### Location `data/data/providers/[provider_id]/` ### Adding a New Provider 1. Create a new directory: `data/data/providers/[provider_id]/` 2. 
Create two files: #### `provider.json` ```json { "provider_id": "provider-name", "name": "Provider Display Name", "website": "https://provider.com", "created_at": "2025-10-02T00:00:00.000000+00:00", "updated_at": "2025-10-02T00:00:00.000000+00:00" } ``` #### `models.json` Start with an empty array: ```json [] ``` 3. Validate both files against their respective schemas ### Updating Provider Information 1. Navigate to `data/data/providers/[provider_id]/provider.json` 2. Update the relevant fields 3. Update the `updated_at` timestamp ### Adding Provider Models Provider models specify pricing and availability of models through specific providers. 1. Open `data/data/providers/[provider_id]/models.json` 2. Add a new entry to the array: ```json [ { "provider_model_id": "provider-specific-id", "model_id": "actual-model-id", "provider_id": "provider-name", "input_price_per_million": 3.0, "output_price_per_million": 15.0, "context_window": 200000, "max_output_tokens": 4096, "available": true, "created_at": "2025-10-02T00:00:00.000000+00:00", "updated_at": "2025-10-02T00:00:00.000000+00:00" } ] ``` 3. Ensure the model exists in `data/data/organizations/[org]/models/[model_id]/` 4. Validate against `schemas/provider-models.schema.json` ## Licenses Licenses define the terms under which models can be used. ### Location `data/data/licenses/[license_id].json` ### Adding a New License 1. Create a new file: `data/data/licenses/[license_id].json` 2. Follow this structure: ```json { "license_id": "license-name", "name": "License Display Name", "url": "https://...", "commercial_use": true, "created_at": "2025-10-02T00:00:00.000000+00:00", "updated_at": "2025-10-02T00:00:00.000000+00:00" } ``` 3. Validate against `schemas/license.schema.json` ## Validation Before submitting your contribution: ### Manual Validation Run the validator script from the `data/` directory: ```bash cd data node schemas/validator.js ``` This will check all JSON files against their respective schemas. 
### What the Validator Checks - JSON syntax correctness - Required fields are present - Field types match schema definitions - ID references exist (e.g., organization_id, license_id) - Date formats are valid - URLs are properly formatted ### Common Validation Errors 1. **Missing Required Fields**: Ensure all required fields are present 2. **Invalid Date Format**: Use ISO 8601 format (YYYY-MM-DD or full timestamp) 3. **Invalid References**: Ensure referenced IDs exist (organization_id, license_id, etc.) 4. **Type Mismatch**: Ensure numbers are numbers, strings are strings, etc. 5. **Trailing Commas**: Remove trailing commas in JSON arrays/objects ## Submitting Your Contribution 1. **Fork the Repository**: Create your own fork of the project 2. **Create a Branch**: Use a descriptive branch name (e.g., `add-gpt5-model`, `update-claude-pricing`) 3. **Make Changes**: Follow the guidelines above 4. **Validate Locally**: Run `node schemas/validator.js` to ensure your changes are valid 5. **Commit Changes**: Write clear, descriptive commit messages 6. **Submit a Pull Request**: - Provide a clear title and description - List what was added or changed - Include links to authoritative sources - Reference any related issues ### Pull Request Template ```markdown ## Description Brief description of what this PR adds or changes ## Changes - Added/Updated model: [Model Name] - Added/Updated organization: [Organization Name] - Added benchmark results for: [Benchmark Name] ## Sources - [Source 1]: https://... - [Source 2]: https://... ## Validation - [ ] Ran `node schemas/validator.js` successfully - [ ] All files follow the correct structure - [ ] All references (organization_id, license_id) are valid ``` ### Example Pull Request For reference, see this [example pull request](https://github.com/JonathanChavezTamales/llm-leaderboard/pull/1). ## Questions? If you have questions or need clarification, please: 1. Check the schema files in `schemas/` for detailed field definitions 2. 
Look at existing data files as examples 3. Open an issue on GitHub Thank you for contributing to LLM Stats! ================================================ FILE: LICENSE.md ================================================ Creative Commons Attribution 4.0 International License Copyright (c) 2024 jc This work is licensed under the Creative Commons Attribution 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. You are free to: - Share — copy and redistribute the material in any medium or format - Adapt — remix, transform, and build upon the material for any purpose, even commercially Under the following terms: - Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use. No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits. Notices: - You do not have to comply with the license for elements of the material in the public domain or where your use is permitted by an applicable exception or limitation. - No warranties are given. The license may not give you all of the permissions necessary for your intended use. For example, other rights such as publicity, privacy, or moral rights may limit how you use the material. ================================================ FILE: README.md ================================================ # DEPRECATED - Updates and contributions This repository is now deprecated and won't be getting any new updates. For contributions and corrections of the data seen in [LLM Stats](https://llm-stats.com/) please create a post with the tag "Issue" in the [official community section](https://llm-stats.com/posts) of the website.
For model- and/or benchmark-specific corrections, please create an Issue under the "Discussion" tab of the model/benchmark, as seen in the example below. Screenshot 2025-10-24 at 1 43 52 p m --- image # LLM-Stats.com [![GitHub stars](https://img.shields.io/github/stars/JonathanChavezTamales/llm-leaderboard?style=social)](https://github.com/JonathanChavezTamales/llm-leaderboard/stargazers) [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](CONTRIBUTING.md) [![Discord](https://img.shields.io/badge/Discord-Join%20Us-7289da?logo=discord&logoColor=white)](https://discord.com/invite/RxGUBvE42d) [![Issues](https://img.shields.io/github/issues/JonathanChavezTamales/llm-leaderboard)](https://github.com/JonathanChavezTamales/llm-leaderboard/issues) A community-driven repository of LLM data and benchmarks. Compare and explore language models through our interactive dashboard at [llm-stats.com](https://llm-stats.com). ## Found an issue or have a feature request? [Open an issue here](https://github.com/JonathanChavezTamales/llm-leaderboard/issues). Thank you! # Data ## 🔍 What's Inside Our repository contains detailed information on hundreds of LLMs: - Model parameters, context window sizes, licensing details, capabilities, and more - Provider pricing and configurations - Performance metrics (throughput, latency) - Standardized benchmark results - Organization and license information ## 📁 Data Structure All data is organized in the `data/` directory: - `data/models/` - Model metadata and configurations - `data/providers/` - Provider information - `data/provider_models/` - Provider-specific model pricing and features - `data/benchmarks/` - Benchmark definitions - `data/model_benchmarks/` - Model benchmark scores - `data/organizations/` - Organization information - `data/licenses/` - License definitions ## 🤝 How to Contribute We welcome community contributions to keep our data accurate and up-to-date: 1.
**Update Model Data** - Browse the [`data/`](data/) directory structure - Submit a PR following our [contribution guidelines](CONTRIBUTING.md) - Check [`schemas/`](schemas/) for JSON Schema validation ## 📈 Data Quality Accuracy is our priority. To ensure reliable information: - All benchmark data requires verifiable source links - Community review process for all changes - Multiple source citations encouraged - Regular validation of submitted data There's no guarantee that the data is 100% accurate, but we do our best to ensure it's as accurate as possible. ## 🌟 Community - Join our [Discord](https://discord.gg/RxGUBvE42d) for discussions ## Leaderboard | Name | Release Date | Input Context | Output Context | GPQA | MMLU | MMLU-Pro | MATH | HumanEval | MMMU | LiveCodeBench | | ---------------------------------------- | ------------ | ------------- | -------------- | ----- | ----- | -------- | ----- | --------- | ----- | ------------- | | GPT-5 | 2025-08-07 | N/A | N/A | 0.857 | 0.925 | N/A | 0.847 | 0.934 | 0.842 | N/A | | o1 | 2024-12-17 | N/A | N/A | 0.780 | 0.918 | N/A | 0.964 | 0.881 | 0.776 | N/A | | GPT-4.5 | 2025-02-27 | N/A | N/A | 0.695 | 0.908 | N/A | N/A | 0.880 | 0.752 | N/A | | o1-preview | 2024-09-12 | N/A | N/A | 0.733 | 0.908 | N/A | 0.855 | N/A | N/A | N/A | | Claude 3.5 Sonnet | 2024-10-22 | N/A | N/A | 0.672 | 0.904 | 0.776 | 0.783 | 0.937 | 0.683 | N/A | | Claude 3.5 Sonnet | 2024-06-21 | N/A | N/A | 0.594 | 0.904 | 0.761 | 0.711 | 0.920 | N/A | N/A | | Kimi K2 0905 | 2025-09-05 | N/A | N/A | 0.758 | 0.902 | 0.825 | 0.891 | 0.945 | N/A | N/A | | GPT-4.1 | 2025-04-14 | N/A | N/A | 0.663 | 0.902 | N/A | N/A | N/A | 0.748 | N/A | | Kimi K2 Instruct | 2025-07-11 | N/A | N/A | 0.751 | 0.895 | 0.811 | N/A | 0.933 | N/A | N/A | | GPT-4o | 2024-05-13 | N/A | N/A | 0.536 | 0.887 | 0.726 | 0.766 | 0.902 | N/A | N/A | | DeepSeek-V3 | 2024-12-25 | N/A | N/A | 0.591 | 0.885 | 0.759 | N/A | N/A | N/A | 0.376 | | Qwen3 235B A22B | 2025-04-29 | N/A | N/A | 
0.475 | 0.878 | 0.682 | 0.718 | N/A | N/A | 0.707 | | Kimi K2 Base | 2025-07-11 | N/A | N/A | 0.481 | 0.878 | 0.692 | 0.702 | N/A | N/A | N/A | | Grok-2 | 2024-08-13 | N/A | N/A | 0.560 | 0.875 | 0.755 | 0.761 | 0.884 | 0.661 | N/A | | GPT-4.1 mini | 2025-04-14 | N/A | N/A | 0.650 | 0.875 | N/A | N/A | N/A | 0.727 | N/A | | Kimi-k1.5 | 2025-01-20 | N/A | N/A | N/A | 0.874 | N/A | N/A | N/A | 0.700 | N/A | | Llama 3.1 405B Instruct | 2024-07-23 | N/A | N/A | 0.507 | 0.873 | 0.733 | 0.738 | 0.890 | N/A | N/A | | o3-mini | 2025-01-30 | N/A | N/A | 0.772 | 0.869 | N/A | 0.979 | N/A | N/A | N/A | | Claude 3 Opus | 2024-02-29 | N/A | N/A | 0.504 | 0.868 | 0.685 | 0.601 | 0.849 | N/A | N/A | | GPT-4 Turbo | 2024-04-09 | N/A | N/A | 0.480 | 0.865 | N/A | 0.726 | 0.871 | N/A | N/A | | GPT-4 | 2023-06-13 | N/A | N/A | 0.357 | 0.864 | N/A | 0.420 | 0.670 | N/A | N/A | | Grok-2 mini | 2024-08-13 | N/A | N/A | 0.510 | 0.862 | 0.720 | 0.730 | 0.857 | 0.632 | N/A | | Llama 3.2 90B Instruct | 2024-09-25 | N/A | N/A | 0.467 | 0.860 | N/A | 0.680 | N/A | 0.603 | N/A | | Llama 3.3 70B Instruct | 2024-12-06 | N/A | N/A | 0.505 | 0.860 | 0.689 | 0.770 | 0.884 | N/A | N/A | | Nova Pro | 2024-11-20 | N/A | N/A | 0.469 | 0.859 | N/A | 0.766 | 0.890 | 0.617 | N/A | | Gemini 1.5 Pro | 2024-05-01 | N/A | N/A | 0.591 | 0.859 | 0.758 | 0.865 | 0.841 | 0.659 | N/A | | GPT-4o | 2024-08-06 | N/A | N/A | 0.701 | 0.857 | 0.747 | N/A | N/A | 0.722 | N/A | | Llama 4 Maverick | 2025-04-05 | N/A | N/A | 0.698 | 0.855 | 0.805 | 0.612 | N/A | 0.734 | 0.434 | | o1-mini | 2024-09-12 | N/A | N/A | 0.600 | 0.852 | N/A | N/A | 0.924 | N/A | N/A | | Phi 4 | 2024-12-12 | N/A | N/A | 0.561 | 0.848 | 0.704 | 0.804 | 0.826 | N/A | N/A | | Mistral Large 2 | 2024-07-24 | N/A | N/A | N/A | 0.840 | N/A | N/A | 0.920 | N/A | N/A | | Llama 3.1 70B Instruct | 2024-07-23 | N/A | N/A | 0.417 | 0.836 | 0.664 | N/A | 0.805 | N/A | N/A | | Qwen2.5 32B Instruct | 2024-09-19 | N/A | N/A | 0.495 | 0.833 | 0.690 | 0.831 | 0.884 | 
N/A | N/A | | Qwen2 72B Instruct | 2024-07-23 | N/A | N/A | 0.424 | 0.823 | 0.644 | 0.597 | 0.860 | N/A | N/A | | GPT-4o mini | 2024-07-18 | N/A | N/A | 0.402 | 0.820 | N/A | 0.702 | 0.872 | 0.594 | N/A | | Grok-1.5 | 2024-03-28 | N/A | N/A | 0.359 | 0.813 | 0.510 | 0.506 | 0.741 | 0.536 | N/A | | Jamba 1.5 Large | 2024-08-22 | N/A | N/A | 0.369 | 0.812 | 0.535 | N/A | N/A | N/A | N/A | | Mistral Small 3.1 24B Base | 2025-03-17 | N/A | N/A | 0.375 | 0.810 | 0.560 | N/A | N/A | 0.593 | N/A | | Mistral Small 3 24B Base | 2025-01-30 | N/A | N/A | 0.344 | 0.807 | 0.544 | 0.460 | N/A | N/A | N/A | | Mistral Small 3.1 24B Instruct | 2025-03-17 | N/A | N/A | 0.460 | 0.806 | 0.668 | 0.693 | 0.884 | 0.593 | N/A | | Nova Lite | 2024-11-20 | N/A | N/A | 0.420 | 0.805 | N/A | 0.733 | 0.854 | 0.562 | N/A | | Mistral Small 3.2 24B Instruct | 2025-06-20 | N/A | N/A | 0.461 | 0.805 | 0.691 | 0.694 | N/A | 0.625 | N/A | | DeepSeek-V2.5 | 2024-05-08 | N/A | N/A | N/A | 0.804 | N/A | 0.747 | 0.890 | N/A | N/A | | Llama 3.1 Nemotron 70B Instruct | 2024-10-01 | N/A | N/A | N/A | 0.802 | N/A | N/A | N/A | N/A | N/A | | GPT-4.1 nano | 2025-04-14 | N/A | N/A | 0.503 | 0.801 | N/A | N/A | N/A | 0.554 | N/A | | Qwen2.5 14B Instruct | 2024-09-19 | N/A | N/A | 0.455 | 0.797 | 0.637 | 0.800 | 0.835 | N/A | N/A | | Llama 4 Scout | 2025-04-05 | N/A | N/A | 0.572 | 0.796 | 0.743 | 0.503 | N/A | 0.694 | 0.328 | | Claude 3 Sonnet | 2024-02-29 | N/A | N/A | 0.404 | 0.790 | 0.568 | 0.431 | 0.730 | N/A | N/A | | Gemini 1.5 Flash | 2024-05-01 | N/A | N/A | 0.510 | 0.789 | 0.673 | 0.779 | 0.743 | 0.623 | N/A | | Phi-3.5-MoE-instruct | 2024-08-23 | N/A | N/A | 0.368 | 0.789 | 0.453 | 0.595 | 0.707 | N/A | N/A | | Qwen2.5 VL 32B Instruct | 2025-02-28 | N/A | N/A | 0.460 | 0.784 | 0.688 | 0.822 | 0.915 | 0.700 | N/A | | Nova Micro | 2024-11-20 | N/A | N/A | 0.400 | 0.776 | N/A | 0.693 | 0.811 | N/A | N/A | | Command R+ | 2024-08-30 | N/A | N/A | N/A | 0.757 | N/A | N/A | N/A | N/A | N/A | | Gemma 2 27B | 
2024-06-27 | N/A | N/A | N/A | 0.752 | N/A | 0.423 | 0.518 | N/A | N/A | | Claude 3 Haiku | 2024-03-13 | N/A | N/A | 0.333 | 0.752 | N/A | 0.389 | 0.759 | N/A | N/A | | Qwen2.5-Coder 32B Instruct | 2024-09-19 | N/A | N/A | N/A | 0.751 | 0.504 | 0.572 | 0.927 | N/A | 0.314 | | Llama 3.2 11B Instruct | 2024-09-25 | N/A | N/A | 0.328 | 0.730 | N/A | 0.519 | N/A | 0.507 | N/A | | Gemini 1.0 Pro | 2024-02-15 | N/A | N/A | 0.279 | 0.718 | N/A | 0.326 | N/A | 0.479 | N/A | | Gemma 2 9B | 2024-06-27 | N/A | N/A | N/A | 0.713 | N/A | 0.366 | 0.402 | N/A | N/A | | Qwen2 7B Instruct | 2024-07-23 | N/A | N/A | 0.253 | 0.705 | 0.441 | 0.496 | 0.799 | N/A | 0.266 | | GPT-3.5 Turbo | 2023-03-21 | N/A | N/A | 0.308 | 0.698 | N/A | 0.431 | 0.680 | 0.000 | N/A | | Jamba 1.5 Mini | 2024-08-22 | N/A | N/A | 0.323 | 0.697 | 0.425 | N/A | N/A | N/A | N/A | | Llama 3.1 8B Instruct | 2024-07-23 | N/A | N/A | 0.304 | 0.694 | 0.483 | N/A | 0.726 | N/A | N/A | | Pixtral-12B | 2024-09-17 | N/A | N/A | N/A | 0.692 | N/A | 0.481 | 0.720 | 0.525 | N/A | | Phi-3.5-mini-instruct | 2024-08-23 | N/A | N/A | 0.304 | 0.690 | 0.474 | 0.485 | 0.628 | N/A | N/A | | Mistral NeMo Instruct | 2024-07-18 | N/A | N/A | N/A | 0.680 | N/A | N/A | N/A | N/A | N/A | | Qwen2.5-Coder 7B Instruct | 2024-09-19 | N/A | N/A | N/A | 0.676 | 0.401 | 0.466 | 0.884 | N/A | 0.182 | | Phi 4 Mini | 2025-02-01 | N/A | N/A | 0.252 | 0.673 | 0.528 | 0.640 | N/A | N/A | N/A | | Granite 3.3 8B Instruct | 2025-04-16 | N/A | N/A | N/A | 0.655 | N/A | N/A | 0.897 | N/A | N/A | | Ministral 8B Instruct | 2024-10-16 | N/A | N/A | N/A | 0.650 | N/A | 0.545 | 0.348 | N/A | N/A | | Gemma 3n E4B Instructed LiteRT Preview | 2025-05-20 | N/A | N/A | 0.237 | 0.649 | 0.506 | N/A | 0.750 | N/A | 0.132 | | Gemma 3n E4B Instructed | 2025-06-26 | N/A | N/A | 0.237 | 0.649 | 0.506 | N/A | 0.750 | N/A | 0.132 | | Granite 3.3 8B Base | 2025-04-16 | N/A | N/A | N/A | 0.639 | N/A | N/A | 0.897 | N/A | N/A | | Llama 3.2 3B Instruct | 2024-09-25 | N/A | 
N/A | 0.328 | 0.634 | N/A | 0.480 | N/A | N/A | N/A | | IBM Granite 4.0 Tiny Preview | 2025-05-02 | N/A | N/A | N/A | 0.604 | N/A | N/A | 0.824 | N/A | N/A | | Gemma 3n E2B Instructed LiteRT (Preview) | 2025-05-20 | N/A | N/A | 0.248 | 0.601 | 0.405 | N/A | 0.665 | N/A | 0.132 | | Gemma 3n E2B Instructed | 2025-06-26 | N/A | N/A | 0.248 | 0.601 | 0.405 | N/A | 0.665 | N/A | 0.132 | | Kimi K2-Instruct-0905 | 2025-09-05 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | | Gemma 3n E4B | 2025-06-26 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | | Gemma 3 12B | 2025-03-12 | N/A | N/A | 0.409 | N/A | 0.606 | 0.838 | 0.854 | N/A | 0.246 | | Gemini 2.5 Pro | 2025-05-20 | N/A | N/A | 0.830 | N/A | N/A | N/A | N/A | 0.796 | N/A | | Gemini 2.0 Flash-Lite | 2025-02-05 | N/A | N/A | 0.515 | N/A | 0.716 | 0.868 | N/A | 0.680 | N/A | | Gemini 2.5 Flash-Lite | 2025-06-17 | N/A | N/A | 0.646 | N/A | N/A | N/A | N/A | 0.729 | 0.337 | | Gemini 2.5 Pro Preview 06-05 | 2025-06-05 | N/A | N/A | 0.864 | N/A | N/A | N/A | N/A | 0.820 | 0.690 | | Gemini 2.5 Flash | 2025-05-20 | N/A | N/A | 0.828 | N/A | N/A | N/A | N/A | 0.797 | N/A | | Gemini 2.0 Flash Thinking | 2025-01-21 | N/A | N/A | 0.742 | N/A | N/A | N/A | N/A | 0.754 | N/A | | Gemma 3n E2B | 2025-06-26 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | | MedGemma 4B IT | 2025-05-20 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | | Gemma 3 4B | 2025-03-12 | N/A | N/A | 0.308 | N/A | 0.436 | 0.756 | 0.713 | N/A | 0.126 | | Gemma 3 27B | 2025-03-12 | N/A | N/A | 0.424 | N/A | 0.675 | 0.890 | 0.878 | N/A | 0.297 | | Gemma 3 1B | 2025-03-12 | N/A | N/A | 0.192 | N/A | 0.147 | 0.480 | 0.415 | N/A | 0.019 | | Gemini 1.5 Flash 8B | 2024-03-15 | N/A | N/A | 0.384 | N/A | 0.587 | 0.587 | N/A | 0.537 | N/A | | Gemini Diffusion | 2025-05-20 | N/A | N/A | 0.404 | N/A | N/A | N/A | 0.896 | N/A | 0.309 | | Gemini 2.0 Flash | 2024-12-01 | N/A | N/A | 0.621 | N/A | 0.764 | 0.897 | N/A | 0.707 | 0.351 | | Phi 4 Mini 
Reasoning | 2025-04-30 | N/A | N/A | 0.520 | N/A | N/A | N/A | N/A | N/A | N/A | | Phi-3.5-vision-instruct | 2024-08-23 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.430 | N/A | | Phi 4 Reasoning Plus | 2025-04-30 | N/A | N/A | 0.689 | N/A | 0.760 | N/A | N/A | N/A | 0.531 | | Phi-4-multimodal-instruct | 2025-02-01 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.551 | N/A | | Phi 4 Reasoning | 2025-04-30 | N/A | N/A | 0.658 | N/A | 0.743 | N/A | N/A | N/A | 0.538 | | Qwen3-235B-A22B-Instruct-2507 | 2025-07-22 | N/A | N/A | 0.775 | N/A | 0.830 | N/A | N/A | N/A | N/A | | QwQ-32B | 2025-03-05 | N/A | N/A | 0.652 | N/A | N/A | N/A | N/A | N/A | 0.634 | | Qwen3-235B-A22B-Thinking-2507 | 2025-07-25 | N/A | N/A | 0.811 | N/A | 0.844 | N/A | N/A | N/A | N/A | | QwQ-32B-Preview | 2024-11-28 | N/A | N/A | 0.652 | N/A | N/A | N/A | N/A | N/A | 0.500 | | Qwen3-Next-80B-A3B-Thinking | 2025-09-10 | N/A | N/A | 0.772 | N/A | 0.827 | N/A | N/A | N/A | N/A | | Qwen2-VL-72B-Instruct | 2024-08-29 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | | Qwen3 32B | 2025-04-29 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.657 | | Qwen2.5 72B Instruct | 2024-09-19 | N/A | N/A | 0.490 | N/A | 0.711 | 0.831 | 0.866 | N/A | 0.555 | | Qwen3 30B A3B | 2025-04-29 | N/A | N/A | 0.658 | N/A | N/A | N/A | N/A | N/A | 0.626 | | Qwen2.5 VL 7B Instruct | 2025-01-26 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.586 | N/A | | Qwen3-Next-80B-A3B-Base | 2025-09-10 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | | QvQ-72B-Preview | 2024-12-25 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.703 | N/A | | Qwen2.5-Omni-7B | 2025-03-27 | N/A | N/A | 0.308 | N/A | 0.470 | 0.715 | 0.787 | 0.592 | N/A | | Qwen2.5 7B Instruct | 2024-09-19 | N/A | N/A | 0.364 | N/A | 0.563 | 0.755 | 0.848 | N/A | 0.287 | | Qwen3-Next-80B-A3B-Instruct | 2025-09-10 | N/A | N/A | 0.729 | N/A | 0.806 | N/A | N/A | N/A | N/A | | Qwen2.5 VL 72B Instruct | 2025-01-26 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.702 | N/A | 
| DeepSeek-R1-0528 | 2025-05-28 | N/A | N/A | N/A | N/A | 0.850 | N/A | N/A | N/A | 0.733 | | DeepSeek VL2 | 2024-12-13 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.511 | N/A | | DeepSeek VL2 Tiny | 2024-12-13 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.407 | N/A | | DeepSeek R1 Zero | 2025-01-20 | N/A | N/A | 0.733 | N/A | N/A | N/A | N/A | N/A | 0.500 | | DeepSeek VL2 Small | 2024-12-13 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.480 | N/A | | DeepSeek R1 Distill Qwen 7B | 2025-01-20 | N/A | N/A | 0.491 | N/A | N/A | N/A | N/A | N/A | 0.376 | | DeepSeek R1 Distill Qwen 1.5B | 2025-01-20 | N/A | N/A | 0.338 | N/A | N/A | N/A | N/A | N/A | 0.169 | | DeepSeek-R1 | 2025-01-20 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | | DeepSeek R1 Distill Llama 8B | 2025-01-20 | N/A | N/A | 0.490 | N/A | N/A | N/A | N/A | N/A | 0.396 | | DeepSeek R1 Distill Llama 70B | 2025-01-20 | N/A | N/A | 0.652 | N/A | N/A | N/A | N/A | N/A | 0.575 | | DeepSeek R1 Distill Qwen 14B | 2025-01-20 | N/A | N/A | 0.591 | N/A | N/A | N/A | N/A | N/A | 0.531 | | DeepSeek R1 Distill Qwen 32B | 2025-01-20 | N/A | N/A | 0.621 | N/A | N/A | N/A | N/A | N/A | 0.572 | | DeepSeek-V3.1 | 2025-01-10 | N/A | N/A | N/A | N/A | 0.837 | N/A | N/A | N/A | 0.564 | | DeepSeek-V3.2-Exp | 2025-09-29 | N/A | N/A | N/A | N/A | 0.850 | N/A | N/A | N/A | 0.741 | | DeepSeek-V3 0324 | 2025-03-25 | N/A | N/A | 0.684 | N/A | 0.812 | N/A | N/A | N/A | 0.492 | | Grok-3 Mini | 2025-02-17 | N/A | N/A | 0.840 | N/A | N/A | N/A | N/A | N/A | 0.804 | | Grok-4 Heavy | 2025-07-09 | N/A | N/A | 0.884 | N/A | N/A | N/A | N/A | N/A | 0.794 | | Grok-4 | 2025-07-09 | N/A | N/A | 0.875 | N/A | N/A | N/A | N/A | N/A | 0.790 | | Grok-3 | 2025-02-17 | N/A | N/A | 0.846 | N/A | N/A | N/A | N/A | 0.780 | 0.794 | | Grok-1.5V | 2024-04-12 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.536 | N/A | | GLM-4.5V | 2025-08-11 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | | GLM-4.5-Air | 2025-07-28 | N/A | N/A | 0.750 | N/A | 
0.814 | N/A | N/A | N/A | 0.707 | | GLM-4.5 | 2025-07-28 | N/A | N/A | 0.791 | N/A | 0.846 | N/A | N/A | N/A | 0.729 | | Llama-3.3 Nemotron Super 49B v1 | 2025-03-18 | N/A | N/A | 0.667 | N/A | N/A | N/A | N/A | N/A | N/A | | Llama 3.1 Nemotron Nano 8B V1 | 2025-03-18 | N/A | N/A | 0.541 | N/A | N/A | N/A | N/A | N/A | N/A | | Llama 3.1 Nemotron Ultra 253B v1 | 2025-04-07 | N/A | N/A | 0.760 | N/A | N/A | N/A | N/A | N/A | 0.663 | | Claude Opus 4.1 | 2025-08-05 | N/A | N/A | 0.809 | N/A | N/A | N/A | N/A | N/A | N/A | | Claude Sonnet 4.5 | 2025-09-29 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | | Claude 3.5 Haiku | 2024-10-22 | N/A | N/A | 0.416 | N/A | 0.650 | 0.694 | 0.881 | N/A | N/A | | Claude 3.7 Sonnet | 2025-02-24 | N/A | N/A | 0.848 | N/A | N/A | N/A | N/A | 0.750 | N/A | | Claude Sonnet 4 | 2025-05-22 | N/A | N/A | 0.754 | N/A | N/A | N/A | N/A | 0.744 | N/A | | Claude Opus 4 | 2025-05-22 | N/A | N/A | 0.796 | N/A | N/A | N/A | N/A | N/A | N/A | | Magistral Small 2506 | 2025-06-10 | N/A | N/A | 0.682 | N/A | N/A | N/A | N/A | N/A | 0.513 | | Magistral Medium | 2025-06-10 | N/A | N/A | 0.708 | N/A | N/A | N/A | N/A | N/A | 0.503 | | Devstral Medium | 2025-07-10 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | | Pixtral Large | 2024-11-18 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | 0.640 | N/A | | Mistral Small 3 24B Instruct | 2025-01-30 | N/A | N/A | 0.453 | N/A | 0.663 | 0.706 | 0.848 | N/A | N/A | | Devstral Small 1.1 | 2025-07-11 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | | Codestral-22B | 2024-05-29 | N/A | N/A | N/A | N/A | N/A | N/A | 0.811 | N/A | N/A | | Mistral Small | 2024-09-17 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | | GPT OSS 120B | 2025-08-05 | N/A | N/A | 0.801 | N/A | N/A | N/A | N/A | N/A | N/A | | o3 | 2025-04-16 | N/A | N/A | 0.833 | N/A | N/A | N/A | N/A | 0.829 | N/A | | GPT OSS 20B | 2025-08-05 | N/A | N/A | 0.715 | N/A | N/A | N/A | N/A | N/A | N/A | | o4-mini | 2025-04-16 | N/A | N/A | 
0.814 | N/A | N/A | N/A | N/A | 0.816 | N/A | | o3-pro | 2025-06-10 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | | o1-pro | 2024-12-17 | N/A | N/A | 0.790 | N/A | N/A | N/A | N/A | N/A | N/A | | GPT-5 nano | 2025-08-07 | N/A | N/A | 0.712 | N/A | N/A | N/A | N/A | N/A | N/A | | GPT-5 mini | 2025-08-07 | N/A | N/A | 0.823 | N/A | N/A | N/A | N/A | N/A | N/A | | GPT-5 Codex | 2025-09-15 | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
Built with 💙 by the AI community, for the AI community.
Star this repo if you find it useful!
================================================ FILE: data/.github/CODEOWNERS ================================================ * @JonathanChavezTamales * @sebastiancrossa ================================================ FILE: data/benchmarks/aa-index.json ================================================ { "benchmark_id": "aa-index", "name": "AA-Index", "parent_benchmark_id": null, "categories": ["general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "No official academic documentation found for this benchmark. Extensive research through ArXiv, IEEE/ACL/NeurIPS papers, and university research sites yielded no peer-reviewed sources for an 'aa-index' benchmark. This entry requires verification from official academic sources.", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/acebench.json ================================================ { "benchmark_id": "acebench", "name": "ACEBench", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "ACEBench is a comprehensive benchmark for evaluating Large Language Models' tool usage capabilities across three primary evaluation types: Normal (basic tool usage scenarios), Special (tool usage with ambiguous or incomplete instructions), and Agent (multi-agent interactions simulating real-world dialogues). 
The benchmark covers 4,538 APIs across 8 major domains and 68 sub-domains including technology, finance, entertainment, society, health, culture, and environment, supporting both English and Chinese languages.", "paper_link": "https://arxiv.org/abs/2501.12851", "implementation_link": "https://github.com/ACEBench/ACEBench", "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-30T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/activitynet.json ================================================ { "benchmark_id": "activitynet", "name": "ActivityNet", "parent_benchmark_id": null, "categories": ["vision", "video"], "modality": "video", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A large-scale video benchmark for human activity understanding. Provides samples from 203 activity classes with an average of 137 untrimmed videos per class and 1.41 activity instances per video, for a total of 849 video hours. 
The benchmark covers a wide range of complex human activities that are of interest to people in their daily living and can be used to compare algorithms for three scenarios: untrimmed video classification, trimmed activity classification, and activity detection.", "paper_link": "https://openaccess.thecvf.com/content_cvpr_2015/html/Heilbron_ActivityNet_A_Large-Scale_2015_CVPR_paper.html", "implementation_link": "https://github.com/activitynet/ActivityNet", "verified": false, "created_at": "2025-07-19T19:56:15.378371+00:00", "updated_at": "2025-07-19T19:56:15.378371+00:00" } ================================================ FILE: data/benchmarks/agieval.json ================================================ { "benchmark_id": "agieval", "name": "AGIEval", "parent_benchmark_id": null, "categories": ["reasoning", "general", "math"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A human-centric benchmark for evaluating foundation models on standardized exams including college entrance exams (Gaokao, SAT), law school admission tests (LSAT), math competitions, lawyer qualification tests, and civil service exams. 
Contains 20 tasks (18 multiple-choice, 2 cloze) designed to assess understanding, knowledge, reasoning, and calculation abilities in real-world academic and professional contexts.", "paper_link": "https://arxiv.org/abs/2304.06364", "implementation_link": "https://github.com/ruixiangcui/AGIEval", "verified": false, "created_at": "2025-07-19T19:56:13.970928+00:00", "updated_at": "2025-07-19T19:56:13.970928+00:00" } ================================================ FILE: data/benchmarks/ai2-reasoning-challenge-(arc).json ================================================ { "benchmark_id": "ai2-reasoning-challenge-(arc)", "name": "AI2 Reasoning Challenge (ARC)", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A dataset of 7,787 genuine grade-school level, multiple-choice science questions assembled to encourage research in advanced question-answering. The dataset is partitioned into a Challenge Set and Easy Set, where the Challenge Set contains only questions answered incorrectly by both retrieval-based and word co-occurrence algorithms. Covers multiple scientific domains including biology, physics, earth science, and chemistry, requiring scientific reasoning, causal understanding, and conceptual knowledge beyond simple fact retrieval. 
Includes a supporting corpus of over 14 million science sentences.", "paper_link": "https://arxiv.org/abs/1803.05457", "implementation_link": "https://github.com/allenai/ARC-Solvers", "verified": false, "created_at": "2025-07-19T19:56:15.419158+00:00", "updated_at": "2025-07-19T19:56:15.419158+00:00" } ================================================ FILE: data/benchmarks/ai2d.json ================================================ { "benchmark_id": "ai2d", "name": "AI2D", "parent_benchmark_id": null, "categories": ["vision", "reasoning", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "AI2D is a dataset of 4,903 illustrative diagrams from grade school natural sciences (such as food webs, human physiology, and life cycles) with over 15,000 multiple choice questions and answers. The benchmark evaluates diagram understanding and visual reasoning capabilities, requiring models to interpret diagrammatic elements, relationships, and structure to answer questions about scientific concepts represented in visual form.", "paper_link": "https://arxiv.org/abs/1603.07396", "implementation_link": "https://allenai.org/data/diagrams", "verified": false, "created_at": "2025-07-19T19:56:13.618926+00:00", "updated_at": "2025-07-19T19:56:13.618926+00:00" } ================================================ FILE: data/benchmarks/aider-polyglot-edit.json ================================================ { "benchmark_id": "aider-polyglot-edit", "name": "Aider-Polyglot Edit", "parent_benchmark_id": null, "categories": ["general", "code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A challenging multi-language coding benchmark that evaluates models' code editing abilities across C++, Go, Java, JavaScript, Python, and Rust. Contains 225 of Exercism's most difficult programming problems, selected as problems that were solved by 3 or fewer out of 7 top coding models. 
The benchmark focuses on code editing tasks and measures both correctness of solutions and proper edit format usage. Designed to re-calibrate evaluation scales so top models score between 5-50%.", "paper_link": null, "implementation_link": "https://github.com/Aider-AI/polyglot-benchmark", "verified": false, "created_at": "2025-07-19T19:56:13.789839+00:00", "updated_at": "2025-09-30T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/aider-polyglot.json ================================================ { "benchmark_id": "aider-polyglot", "name": "Aider-Polyglot", "parent_benchmark_id": null, "categories": ["general", "code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A coding benchmark that evaluates LLMs on 225 challenging Exercism programming exercises across C++, Go, Java, JavaScript, Python, and Rust. Models receive two attempts to solve each problem, with test error feedback provided after the first attempt if it fails. 
The benchmark measures both initial problem-solving ability and capacity to edit code based on error feedback, providing an end-to-end evaluation of code generation and editing capabilities across multiple programming languages.", "paper_link": null, "implementation_link": "https://github.com/Aider-AI/polyglot-benchmark", "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-30T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/aider.json ================================================ { "benchmark_id": "aider", "name": "Aider", "parent_benchmark_id": null, "categories": ["reasoning", "code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Aider is a comprehensive code editing benchmark based on 133 practice exercises from Exercism's Python repository, designed to evaluate AI models' ability to translate natural language coding requests into executable code that passes unit tests. The benchmark measures end-to-end code editing capabilities, including GPT's ability to edit existing code and format code changes for automated saving to local files. 
The Aider Polyglot variant extends this evaluation across 225 challenging exercises spanning C++, Go, Java, JavaScript, Python, and Rust, making it a standard benchmark for assessing multilingual code editing performance in AI research.", "paper_link": null, "implementation_link": "https://github.com/Aider-AI/aider", "verified": false, "created_at": "2025-07-19T19:56:14.566857+00:00", "updated_at": "2025-07-19T19:56:14.566857+00:00" } ================================================ FILE: data/benchmarks/aime-2024.json ================================================ { "benchmark_id": "aime-2024", "name": "AIME 2024", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "American Invitational Mathematics Examination 2024, consisting of 30 challenging mathematical reasoning problems from the AIME I and AIME II competitions. Each problem requires an integer answer between 0 and 999 and tests advanced mathematical reasoning across algebra, geometry, combinatorics, and number theory. Used as a benchmark for evaluating mathematical reasoning capabilities in large language models at Olympiad-level difficulty.", "paper_link": "https://arxiv.org/html/2503.21380v2", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:11.941652+00:00", "updated_at": "2025-09-30T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/aime-2025.json ================================================ { "benchmark_id": "aime-2025", "name": "AIME 2025", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "All 30 problems from the 2025 American Invitational Mathematics Examination (AIME I and AIME II), testing olympiad-level mathematical reasoning with integer answers from 000-999. 
Used as an AI benchmark to evaluate large language models' ability to solve complex mathematical problems requiring multi-step logical deductions and structured symbolic reasoning.", "paper_link": "https://arxiv.org/abs/2503.21380", "implementation_link": null, "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-05T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/aime.json ================================================ { "benchmark_id": "aime", "name": "AIME", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "American Invitational Mathematics Examination (AIME) benchmark for evaluating mathematical reasoning capabilities of large language models. Contains 30 challenging mathematical problems from the AIME 2024 competition that require multi-step reasoning and advanced mathematical insight. Each problem has an integer answer between 000 and 999.", "paper_link": "https://arxiv.org/html/2503.21380v2", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.057279+00:00", "updated_at": "2025-07-19T19:56:14.057279+00:00" } ================================================ FILE: data/benchmarks/aitz-em.json ================================================ { "benchmark_id": "aitz-em", "name": "AITZ_EM", "parent_benchmark_id": null, "categories": ["multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Android-In-The-Zoo (AitZ) benchmark for evaluating autonomous GUI agents on smartphones. Contains 18,643 screen-action pairs with chain-of-action-thought annotations spanning over 70 Android apps. 
Designed to connect perception (screen layouts and UI elements) with cognition (action decision-making) for natural language-triggered smartphone task completion.", "paper_link": "https://arxiv.org/abs/2403.02713", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.785085+00:00", "updated_at": "2025-07-19T19:56:14.785085+00:00" } ================================================ FILE: data/benchmarks/alignbench.json ================================================ { "benchmark_id": "alignbench", "name": "AlignBench", "parent_benchmark_id": null, "categories": ["general", "language", "math", "reasoning", "roleplay"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "AlignBench is a comprehensive multi-dimensional benchmark for evaluating Chinese alignment of Large Language Models. It contains 8 main categories: Fundamental Language Ability, Advanced Chinese Understanding, Open-ended Questions, Writing Ability, Logical Reasoning, Mathematics, Task-oriented Role Play, and Professional Knowledge. 
The benchmark includes 683 real-scenario rooted queries with human-verified references and uses a rule-calibrated multi-dimensional LLM-as-Judge approach with Chain-of-Thought for evaluation.", "paper_link": "https://arxiv.org/abs/2311.18743", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.542033+00:00", "updated_at": "2025-07-19T19:56:14.542033+00:00" } ================================================ FILE: data/benchmarks/alpacaeval-2.0.json ================================================ { "benchmark_id": "alpacaeval-2.0", "name": "AlpacaEval 2.0", "parent_benchmark_id": null, "categories": ["general", "creativity", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "AlpacaEval 2.0 is a length-controlled automatic evaluator for instruction-following language models that uses GPT-4 Turbo to assess model responses against a baseline. It evaluates models on 805 diverse instruction-following tasks including creative writing, classification, programming, and general knowledge questions. The benchmark achieves 0.98 Spearman correlation with ChatBot Arena while being fast (< 3 minutes) and affordable (< $10 in OpenAI credits). 
It addresses length bias in automatic evaluation through length-controlled win-rates and uses weighted scoring based on response quality.", "paper_link": "https://arxiv.org/abs/2404.04475", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.038178+00:00", "updated_at": "2025-07-19T19:56:15.038178+00:00" } ================================================ FILE: data/benchmarks/amc-2022-23.json ================================================ { "benchmark_id": "amc-2022-23", "name": "AMC_2022_23", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "American Mathematics Competition problems from the 2022-23 academic year, consisting of multiple-choice mathematics competition problems designed for high school students. These problems require advanced mathematical reasoning, problem-solving strategies, and mathematical knowledge covering topics like algebra, geometry, number theory, and combinatorics. 
The benchmark is derived from the official AMC competitions sponsored by the Mathematical Association of America.", "paper_link": "https://arxiv.org/abs/2103.03874", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.992903+00:00", "updated_at": "2025-07-19T19:56:13.992903+00:00" } ================================================ FILE: data/benchmarks/android-control-high-em.json ================================================ { "benchmark_id": "android-control-high-em", "name": "Android Control High_EM", "parent_benchmark_id": null, "categories": ["multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Android device control benchmark using high exact match evaluation metric for assessing agent performance on mobile interface tasks", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.792498+00:00", "updated_at": "2025-07-19T19:56:14.792498+00:00" } ================================================ FILE: data/benchmarks/android-control-low-em.json ================================================ { "benchmark_id": "android-control-low-em", "name": "Android Control Low_EM", "parent_benchmark_id": null, "categories": ["multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Android control benchmark evaluating autonomous agents on mobile device interaction tasks with low exact match scoring criteria", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.800337+00:00", "updated_at": "2025-07-19T19:56:14.800337+00:00" } ================================================ FILE: data/benchmarks/androidworld-sr.json ================================================ { "benchmark_id": "androidworld-sr", "name": "AndroidWorld_SR", "parent_benchmark_id": null, "categories": ["general", "multimodal", 
"reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "AndroidWorld Success Rate (SR) benchmark - A dynamic benchmarking environment for autonomous agents operating on Android devices. Evaluates agents on 116 programmatic tasks across 20 real-world Android apps using multimodal inputs (screenshots, accessibility trees, and natural language instructions). Measures success rate of agents completing tasks like sending messages, creating calendar events, and navigating mobile interfaces. Published at ICLR 2025. Best current performance: 30.6% success rate (M3A agent) vs 80.0% human performance.", "paper_link": "https://arxiv.org/abs/2405.14573", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.808659+00:00", "updated_at": "2025-07-19T19:56:14.808659+00:00" } ================================================ FILE: data/benchmarks/api-bank.json ================================================ { "benchmark_id": "api-bank", "name": "API-Bank", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A comprehensive benchmark for tool-augmented LLMs that evaluates API planning, retrieval, and calling capabilities. 
Contains 314 tool-use dialogues with 753 API calls across 73 API tools, designed to assess how effectively LLMs can utilize external tools and overcome obstacles in tool leveraging.", "paper_link": "https://arxiv.org/abs/2304.08244", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.374447+00:00", "updated_at": "2025-07-19T19:56:14.374447+00:00" } ================================================ FILE: data/benchmarks/arc-agi-v2.json ================================================ { "benchmark_id": "arc-agi-v2", "name": "ARC-AGI v2", "parent_benchmark_id": null, "categories": ["reasoning", "vision", "spatial_reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "ARC-AGI-2 is an upgraded benchmark for measuring abstract reasoning and problem-solving abilities in AI systems through visual grid transformation tasks. It evaluates fluid intelligence via input-output grid pairs (1x1 to 30x30) using colored cells (0-9), requiring models to identify underlying transformation rules from demonstration examples and apply them to test cases. 
Designed to be easy for humans but challenging for AI, focusing on core cognitive abilities like spatial reasoning, pattern recognition, and compositional generalization.", "paper_link": "https://arxiv.org/abs/2505.11831", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.916360+00:00", "updated_at": "2025-07-19T19:56:13.916360+00:00" } ================================================ FILE: data/benchmarks/arc-agi.json ================================================ { "benchmark_id": "arc-agi", "name": "ARC-AGI", "parent_benchmark_id": null, "categories": ["reasoning", "vision", "spatial_reasoning"], "modality": "image", "multilingual": false, "max_score": 1.0, "language": "en", "description": "The Abstraction and Reasoning Corpus for Artificial General Intelligence (ARC-AGI) is a benchmark designed to test general intelligence and abstract reasoning capabilities through visual grid-based transformation tasks. Each task consists of 2-5 demonstration pairs showing input grids transformed into output grids according to underlying rules, with test-takers required to infer these rules and apply them to novel test inputs. 
The benchmark uses colored grids (up to 30x30) with 10 discrete colors/symbols, designed to measure human-like general fluid intelligence and skill-acquisition efficiency with minimal prior knowledge.", "paper_link": "https://arxiv.org/abs/1911.01547", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.187761+00:00", "updated_at": "2025-07-19T19:56:15.187761+00:00" } ================================================ FILE: data/benchmarks/arc-c.json ================================================ { "benchmark_id": "arc-c", "name": "ARC-C", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "The AI2 Reasoning Challenge (ARC) Challenge Set is a multiple-choice question-answering benchmark containing grade-school level science questions that require advanced reasoning capabilities. ARC-C specifically contains questions that were answered incorrectly by both retrieval-based and word co-occurrence algorithms, making it a particularly challenging subset designed to test commonsense reasoning abilities in AI systems.", "paper_link": "https://arxiv.org/abs/1803.05457", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:11.052939+00:00", "updated_at": "2025-07-19T19:56:11.052939+00:00" } ================================================ FILE: data/benchmarks/arc-e.json ================================================ { "benchmark_id": "arc-e", "name": "ARC-E", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "ARC-E (AI2 Reasoning Challenge - Easy Set) is a subset of grade-school level, multiple-choice science questions that require knowledge and reasoning capabilities. It comprises the 5,197 questions of the AI2 Reasoning Challenge dataset that test scientific reasoning and factual knowledge. 
The Easy Set contains questions that are answerable by retrieval-based and word co-occurrence algorithms, making them more accessible than the Challenge Set.", "paper_link": "https://arxiv.org/abs/1803.05457", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.192662+00:00", "updated_at": "2025-07-19T19:56:13.192662+00:00" } ================================================ FILE: data/benchmarks/arc.json ================================================ { "benchmark_id": "arc", "name": "Arc", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "The Abstraction and Reasoning Corpus (ARC) is a benchmark designed to measure human-like general fluid intelligence through grid-based reasoning tasks. It consists of 800 tasks (400 training, 400 evaluation) where each task presents input-output grids that require understanding abstract patterns and transformations. Test-takers must produce exactly correct output grids for all test inputs in a task to solve it, with 3 trials allowed per test input. 
ARC aims to enable fair comparisons of general intelligence between AI systems and humans using priors designed to be as close as possible to innate human priors.", "paper_link": "https://arxiv.org/abs/1911.01547", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.967150+00:00", "updated_at": "2025-07-19T19:56:13.967150+00:00" } ================================================ FILE: data/benchmarks/arena-hard-v2.json ================================================ { "benchmark_id": "arena-hard-v2", "name": "Arena-Hard v2", "parent_benchmark_id": null, "categories": ["general", "reasoning", "creativity"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Arena-Hard-Auto v2 is a challenging benchmark consisting of 500 carefully curated prompts sourced from Chatbot Arena and WildChat-1M, designed to evaluate large language models on real-world user queries. The benchmark covers diverse domains including open-ended software engineering problems, mathematics, creative writing, and technical problem-solving. It uses LLM-as-a-Judge for automatic evaluation, achieving 98.6% correlation with human preference rankings while providing 3x higher separation of model performances compared to MT-Bench. 
The benchmark emphasizes prompt specificity, complexity, and domain knowledge to better distinguish between model capabilities.", "paper_link": "https://arxiv.org/abs/2406.11939", "implementation_link": null, "verified": false, "created_at": "2025-08-03T22:06:11.411643+00:00", "updated_at": "2025-08-03T22:06:11.411643+00:00" } ================================================ FILE: data/benchmarks/arena-hard.json ================================================ { "benchmark_id": "arena-hard", "name": "Arena Hard", "parent_benchmark_id": null, "categories": ["general", "reasoning", "creativity"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Arena-Hard-Auto is an automatic evaluation benchmark for instruction-tuned LLMs consisting of 500 challenging real-world prompts curated by BenchBuilder. It includes open-ended software engineering problems, mathematical questions, and creative writing tasks. The benchmark uses LLM-as-a-Judge methodology with GPT-4.1 and Gemini-2.5 as automatic judges to approximate human preference. 
Arena-Hard achieves 98.6% correlation with human preference rankings and provides 3x higher separation of model performances compared to MT-Bench, making it highly effective for distinguishing between models of similar quality.", "paper_link": "https://arxiv.org/abs/2406.11939", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.079874+00:00", "updated_at": "2025-07-19T19:56:14.079874+00:00" } ================================================ FILE: data/benchmarks/attaq.json ================================================ { "benchmark_id": "attaq", "name": "AttaQ", "parent_benchmark_id": null, "categories": ["safety"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "AttaQ is a unique dataset containing adversarial examples in the form of questions designed to provoke harmful or inappropriate responses from large language models. The benchmark evaluates safety vulnerabilities by using specialized clustering techniques that analyze both the semantic similarity of input attacks and the harmfulness of model responses, facilitating targeted improvements to model safety mechanisms.", "paper_link": "https://arxiv.org/abs/2311.04124", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.079764+00:00", "updated_at": "2025-07-19T19:56:15.079764+00:00" } ================================================ FILE: data/benchmarks/autologi.json ================================================ { "benchmark_id": "autologi", "name": "AutoLogi", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "AutoLogi is an automated method for synthesizing open-ended logic puzzles to evaluate reasoning abilities of Large Language Models. 
The benchmark addresses limitations of existing multiple-choice reasoning evaluations by featuring program-based verification and controllable difficulty levels. It includes 1,575 English and 883 Chinese puzzles, enabling more reliable evaluation that better distinguishes models' reasoning capabilities across languages.", "paper_link": "https://arxiv.org/abs/2502.16906", "implementation_link": null, "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-05T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/bbh.json ================================================ { "benchmark_id": "bbh", "name": "BBH", "parent_benchmark_id": null, "categories": ["reasoning", "math", "language"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Big-Bench Hard (BBH) is a suite of 23 challenging tasks selected from BIG-Bench for which prior language model evaluations did not outperform the average human-rater. These tasks require multi-step reasoning across diverse domains including arithmetic, logical reasoning, reading comprehension, and commonsense reasoning. 
The benchmark was designed to test capabilities believed to be beyond current language models and focuses on evaluating complex reasoning skills including temporal understanding, spatial reasoning, causal understanding, and deductive logical reasoning.", "paper_link": "https://arxiv.org/abs/2210.09261", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.031859+00:00", "updated_at": "2025-07-19T19:56:13.031859+00:00" } ================================================ FILE: data/benchmarks/bfcl-v2.json ================================================ { "benchmark_id": "bfcl-v2", "name": "BFCL v2", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "Berkeley Function Calling Leaderboard (BFCL) v2 is a comprehensive benchmark for evaluating large language models' function calling capabilities. It features 2,251 question-function-answer pairs with enterprise and OSS-contributed functions, addressing data contamination and bias through live, user-contributed scenarios. 
The benchmark evaluates AST accuracy, executable accuracy, irrelevance detection, and relevance detection across multiple programming languages (Python, Java, JavaScript) and includes complex real-world function calling scenarios with multi-lingual prompts.", "paper_link": "https://arxiv.org/abs/2305.15334", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.444045+00:00", "updated_at": "2025-07-19T19:56:14.444045+00:00" } ================================================ FILE: data/benchmarks/bfcl-v3-multiturn.json ================================================ { "benchmark_id": "bfcl-v3-multiturn", "name": "BFCL_v3_MultiTurn", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Berkeley Function Calling Leaderboard (BFCL) V3 MultiTurn benchmark that evaluates large language models' ability to handle multi-turn and multi-step function calling scenarios. The benchmark introduces complex interactions requiring models to manage sequential function calls, handle conversational context across multiple turns, and make dynamic decisions about when and how to use available functions. 
BFCL V3 uses state-based evaluation by verifying the actual state of API systems after function execution, providing more realistic assessment of function calling capabilities in agentic applications.", "paper_link": "https://openreview.net/forum?id=2GmDdhBdDk", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.962161+00:00", "updated_at": "2025-07-19T19:56:14.962161+00:00" } ================================================ FILE: data/benchmarks/bfcl-v3.json ================================================ { "benchmark_id": "bfcl-v3", "name": "BFCL-v3", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Berkeley Function Calling Leaderboard v3 (BFCL-v3) is an advanced benchmark that evaluates large language models' function calling capabilities through multi-turn and multi-step interactions. It introduces extended conversational exchanges where models must retain contextual information across turns and execute multiple internal function calls for complex user requests. 
The benchmark includes 1000 test cases across domains like vehicle control, trading bots, travel booking, and file system management, using state-based evaluation to verify both system state changes and execution path correctness.", "paper_link": "https://openreview.net/forum?id=2GmDdhBdDk", "implementation_link": null, "verified": false, "created_at": "2025-08-03T22:06:11.216985+00:00", "updated_at": "2025-08-03T22:06:11.216985+00:00" } ================================================ FILE: data/benchmarks/bfcl.json ================================================ { "benchmark_id": "bfcl", "name": "BFCL", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "The Berkeley Function Calling Leaderboard (BFCL) is the first comprehensive and executable function call evaluation dedicated to assessing Large Language Models' ability to invoke functions. It evaluates serial and parallel function calls across multiple programming languages (Python, Java, JavaScript, REST API) using a novel Abstract Syntax Tree (AST) evaluation method. 
The benchmark consists of over 2,000 question-function-answer pairs covering diverse application domains and complex use cases including multiple function calls, parallel function calls, and multi-turn interactions.", "paper_link": "https://openreview.net/pdf?id=2GmDdhBdDk", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.763704+00:00", "updated_at": "2025-07-19T19:56:12.763704+00:00" } ================================================ FILE: data/benchmarks/big-bench-extra-hard.json ================================================ { "benchmark_id": "big-bench-extra-hard", "name": "BIG-Bench Extra Hard", "parent_benchmark_id": null, "categories": ["reasoning", "general", "language"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "BIG-Bench Extra Hard (BBEH) is a challenging benchmark that replaces each task in BIG-Bench Hard with a novel task that probes similar reasoning capabilities but exhibits significantly increased difficulty. The benchmark contains 23 tasks testing diverse reasoning skills including many-hop reasoning, causal understanding, spatial reasoning, temporal arithmetic, geometric reasoning, linguistic reasoning, logic puzzles, and humor understanding. 
Designed to address saturation on existing benchmarks where state-of-the-art models achieve near-perfect scores, BBEH shows substantial room for improvement with best models achieving only 9.8-44.8% average accuracy.", "paper_link": "https://arxiv.org/abs/2502.19187", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.279517+00:00", "updated_at": "2025-07-19T19:56:13.279517+00:00" } ================================================ FILE: data/benchmarks/big-bench-hard.json ================================================ { "benchmark_id": "big-bench-hard", "name": "BIG-Bench Hard", "parent_benchmark_id": null, "categories": ["reasoning", "math", "language"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "BIG-Bench Hard (BBH) is a subset of 23 challenging BIG-Bench tasks selected because prior language model evaluations did not outperform average human-rater performance. The benchmark contains 6,511 evaluation examples testing various forms of multi-step reasoning including arithmetic, logical reasoning (Boolean expressions, logical deduction), geometric reasoning, temporal reasoning, and language understanding. 
Tasks require capabilities such as causal judgment, object counting, navigation, pattern recognition, and complex problem solving.", "paper_link": "https://arxiv.org/abs/2210.09261", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.222809+00:00", "updated_at": "2025-07-19T19:56:13.222809+00:00" } ================================================ FILE: data/benchmarks/big-bench.json ================================================ { "benchmark_id": "big-bench", "name": "BIG-Bench", "parent_benchmark_id": null, "categories": ["reasoning", "math", "language"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "Beyond the Imitation Game Benchmark (BIG-bench) is a collaborative benchmark consisting of 204+ tasks designed to probe large language models and extrapolate their future capabilities. It covers diverse domains including linguistics, mathematics, common-sense reasoning, biology, physics, social bias, software development, and more. The benchmark focuses on tasks believed to be beyond current language model capabilities and includes both English and non-English tasks across multiple languages.", "paper_link": "https://arxiv.org/abs/2206.04615", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.926457+00:00", "updated_at": "2025-07-19T19:56:13.926457+00:00" } ================================================ FILE: data/benchmarks/bigcodebench-full.json ================================================ { "benchmark_id": "bigcodebench-full", "name": "BigCodeBench-Full", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A comprehensive benchmark that evaluates large language models' ability to solve complex, practical programming tasks via code generation. Contains 1,140 fine-grained tasks across 7 domains using function calls from 139 libraries. 
Challenges LLMs to invoke multiple function calls as tools and handle complex instructions for realistic software engineering and general-purpose reasoning tasks.", "paper_link": "https://arxiv.org/abs/2406.15877", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.508830+00:00", "updated_at": "2025-07-19T19:56:14.508830+00:00" } ================================================ FILE: data/benchmarks/bigcodebench-hard.json ================================================ { "benchmark_id": "bigcodebench-hard", "name": "BigCodeBench-Hard", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "BigCodeBench-Hard is a subset of 148 challenging programming tasks from BigCodeBench, designed to evaluate large language models' ability to solve complex, real-world programming problems. These tasks require diverse function calls from multiple libraries across 7 domains including computation, networking, data analysis, and visualization. The benchmark tests compositional reasoning and the ability to implement complex instructions that span 139 libraries with an average of 2.8 libraries per task.", "paper_link": "https://arxiv.org/abs/2406.15877", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.512684+00:00", "updated_at": "2025-07-19T19:56:14.512684+00:00" } ================================================ FILE: data/benchmarks/bigcodebench.json ================================================ { "benchmark_id": "bigcodebench", "name": "BigCodeBench", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained programming tasks. 
Evaluates code generation with diverse function calls and complex instructions, featuring two variants: Complete (code completion based on comprehensive docstrings) and Instruct (generating code from natural language instructions).", "paper_link": "https://arxiv.org/abs/2406.15877", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.048433+00:00", "updated_at": "2025-07-19T19:56:14.048433+00:00" } ================================================ FILE: data/benchmarks/bird-sql-(dev).json ================================================ { "benchmark_id": "bird-sql-(dev)", "name": "Bird-SQL (dev)", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "BIRD (BIg Bench for LaRge-scale Database Grounded Text-to-SQLs) is a comprehensive text-to-SQL benchmark containing 12,751 question-SQL pairs across 95 databases (33.4 GB total) spanning 37+ professional domains. It evaluates large language models' ability to convert natural language to executable SQL queries in real-world scenarios with complex database schemas and dirty data.", "paper_link": "https://arxiv.org/abs/2305.03111", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.410905+00:00", "updated_at": "2025-07-19T19:56:13.410905+00:00" } ================================================ FILE: data/benchmarks/blink.json ================================================ { "benchmark_id": "blink", "name": "BLINK", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "BLINK: Multimodal Large Language Models Can See but Not Perceive. A benchmark for multimodal language models focusing on core visual perception abilities. 
Reformats 14 classic computer vision tasks into 3,807 multiple-choice questions paired with single or multiple images and visual prompting. Tasks include relative depth estimation, visual correspondence, forensics detection, multi-view reasoning, counting, object localization, and spatial reasoning that humans can solve 'within a blink'.", "paper_link": "https://arxiv.org/abs/2404.12390", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.326398+00:00", "updated_at": "2025-07-19T19:56:14.326398+00:00" } ================================================ FILE: data/benchmarks/boolq.json ================================================ { "benchmark_id": "boolq", "name": "BoolQ", "parent_benchmark_id": null, "categories": ["language", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "BoolQ is a reading comprehension dataset for yes/no questions containing 15,942 naturally occurring examples. Each example consists of a question, passage, and boolean answer, where questions are generated in unprompted and unconstrained settings. The dataset challenges models with complex, non-factoid information requiring entailment-like inference to solve.", "paper_link": "https://arxiv.org/abs/1905.10044", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.117325+00:00", "updated_at": "2025-07-19T19:56:13.117325+00:00" } ================================================ FILE: data/benchmarks/browsecomp-long-128k.json ================================================ { "benchmark_id": "browsecomp-long-128k", "name": "BrowseComp Long Context 128k", "parent_benchmark_id": "browsecomp", "categories": ["reasoning", "search"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A challenging benchmark for evaluating web browsing agents' ability to persistently navigate the internet and find hard-to-locate, entangled information. 
Comprises 1,266 questions requiring strategic reasoning, creative search, and interpretation of retrieved content, with short and easily verifiable answers.", "paper_link": "https://arxiv.org/abs/2504.12516", "implementation_link": null, "verified": false, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/browsecomp-long-256k.json ================================================ { "benchmark_id": "browsecomp-long-256k", "name": "BrowseComp Long Context 256k", "parent_benchmark_id": "browsecomp", "categories": ["reasoning", "search"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "BrowseComp is a benchmark for measuring the ability of agents to browse the web, comprising 1,266 questions that require persistently navigating the internet in search of hard-to-find, entangled information. Despite the difficulty of the questions, BrowseComp is simple and easy-to-use, as predicted answers are short and easily verifiable against reference answers. 
The benchmark focuses on questions where answers are obscure, time-invariant, and well-supported by evidence scattered across the open web.", "paper_link": "https://arxiv.org/abs/2504.12516", "implementation_link": null, "verified": false, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/browsecomp-zh.json ================================================ { "benchmark_id": "browsecomp-zh", "name": "BrowseComp-zh", "parent_benchmark_id": "browsecomp", "categories": ["reasoning", "search"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "zh", "description": "A high-difficulty benchmark purpose-built to comprehensively evaluate LLM agents on the Chinese web, consisting of 289 multi-hop questions spanning 11 diverse domains including Film & TV, Technology, Medicine, and History. Questions are reverse-engineered from short, objective, and easily verifiable answers, requiring sophisticated reasoning and information reconciliation beyond basic retrieval. The benchmark addresses linguistic, infrastructural, and censorship-related complexities in Chinese web environments.", "paper_link": "https://arxiv.org/abs/2504.19314", "implementation_link": null, "verified": false, "created_at": "2025-09-15T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/browsecomp.json ================================================ { "benchmark_id": "browsecomp", "name": "BrowseComp", "parent_benchmark_id": null, "categories": ["reasoning", "search"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "BrowseComp is a benchmark comprising 1,266 questions that challenge AI agents to persistently navigate the internet in search of hard-to-find, entangled information. 
The benchmark measures agents' ability to exercise persistence in information gathering, demonstrate creativity in web navigation, and find concise, verifiable answers. Despite the difficulty of the questions, BrowseComp is simple and easy-to-use, as predicted answers are short and easily verifiable against reference answers.", "paper_link": "https://arxiv.org/abs/2504.12516", "implementation_link": null, "verified": false, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/c-eval.json ================================================ { "benchmark_id": "c-eval", "name": "C-Eval", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "C-Eval is a comprehensive Chinese evaluation suite designed to assess advanced knowledge and reasoning abilities of foundation models in a Chinese context. It comprises 13,948 multiple-choice questions across 52 diverse disciplines spanning humanities, science, and engineering, with four difficulty levels: middle school, high school, college, and professional. 
The benchmark includes C-Eval Hard, a subset of very challenging subjects requiring advanced reasoning abilities.", "paper_link": "https://arxiv.org/abs/2305.08322", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:11.917478+00:00", "updated_at": "2025-07-19T19:56:11.917478+00:00" } ================================================ FILE: data/benchmarks/cbnsl.json ================================================ { "benchmark_id": "cbnsl", "name": "CBNSL", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Curriculum Learning of Bayesian Network Structures (CBNSL) benchmark for evaluating algorithms that learn Bayesian network structures from data using curriculum learning techniques. The benchmark uses networks from the bnlearn repository and evaluates structure learning performance using BDeu scoring metrics.", "paper_link": "http://proceedings.mlr.press/v45/Zhao15a.pdf", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.590999+00:00", "updated_at": "2025-07-19T19:56:12.590999+00:00" } ================================================ FILE: data/benchmarks/cc-ocr.json ================================================ { "benchmark_id": "cc-ocr", "name": "CC-OCR", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "text-to-image"], "modality": "multimodal", "multilingual": true, "max_score": 1.0, "language": "en", "description": "A comprehensive OCR benchmark for evaluating Large Multimodal Models (LMMs) in literacy. Comprises four OCR-centric tracks: multi-scene text reading, multilingual text reading, document parsing, and key information extraction. Contains 39 subsets with 7,058 fully annotated images, 41% sourced from real applications. 
Tests capabilities including text grounding, multi-orientation text recognition, and detecting hallucination/repetition across diverse visual challenges.", "paper_link": "https://arxiv.org/abs/2412.02210", "implementation_link": "https://github.com/AlibabaResearch/AdvancedLiterateMachinery", "verified": false, "created_at": "2025-07-19T19:56:14.652986+00:00", "updated_at": "2025-07-19T19:56:14.652986+00:00" } ================================================ FILE: data/benchmarks/cfeval.json ================================================ { "benchmark_id": "cfeval", "name": "CFEval", "parent_benchmark_id": null, "categories": ["code"], "modality": "text", "multilingual": false, "max_score": 10000.0, "language": "en", "description": "CFEval benchmark for evaluating code generation and problem-solving capabilities", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-09-15T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/charadessta.json ================================================ { "benchmark_id": "charadessta", "name": "CharadesSTA", "parent_benchmark_id": null, "categories": ["video", "language", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Charades-STA is a benchmark dataset for temporal activity localization via language queries, extending the Charades dataset with sentence temporal annotations. 
It contains 12,408 training and 3,720 testing segment-sentence pairs from videos with natural language descriptions and precise temporal boundaries for localizing activities based on language queries.", "paper_link": "https://arxiv.org/abs/1705.02101", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.760027+00:00", "updated_at": "2025-07-19T19:56:14.760027+00:00" } ================================================ FILE: data/benchmarks/chartqa.json ================================================ { "benchmark_id": "chartqa", "name": "ChartQA", "parent_benchmark_id": null, "categories": ["reasoning", "vision", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "ChartQA is a large-scale benchmark comprising 9.6K human-written questions and 23.1K questions generated from human-written chart summaries, designed to evaluate models' abilities in visual and logical reasoning over charts.", "paper_link": "https://arxiv.org/abs/2203.10244", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.783541+00:00", "updated_at": "2025-07-19T19:56:12.783541+00:00" } ================================================ FILE: data/benchmarks/charxiv-d.json ================================================ { "benchmark_id": "charxiv-d", "name": "CharXiv-D", "parent_benchmark_id": null, "categories": ["reasoning", "vision", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "CharXiv-D is the descriptive questions subset of the CharXiv benchmark, designed to assess multimodal large language models' ability to extract basic information from scientific charts. 
It contains descriptive questions covering information extraction, enumeration, pattern recognition, and counting across 2,323 diverse charts from arXiv papers, all curated and verified by human experts.", "paper_link": "https://arxiv.org/abs/2406.18521", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.325204+00:00", "updated_at": "2025-07-19T19:56:15.325204+00:00" } ================================================ FILE: data/benchmarks/charxiv-r.json ================================================ { "benchmark_id": "charxiv-r", "name": "CharXiv-R", "parent_benchmark_id": null, "categories": ["reasoning", "vision", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "CharXiv-R is the reasoning component of the CharXiv benchmark, focusing on complex reasoning questions that require synthesizing information across visual chart elements. It evaluates multimodal large language models on their ability to understand and reason about scientific charts from arXiv papers through various reasoning tasks.", "paper_link": "https://arxiv.org/abs/2406.18521", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.191553+00:00", "updated_at": "2025-07-19T19:56:15.191553+00:00" } ================================================ FILE: data/benchmarks/chexpert-cxr.json ================================================ { "benchmark_id": "chexpert-cxr", "name": "CheXpert CXR", "parent_benchmark_id": null, "categories": ["healthcare", "vision"], "modality": "image", "multilingual": false, "max_score": 1.0, "language": "en", "description": "CheXpert is a large dataset of 224,316 chest radiographs from 65,240 patients for automated chest X-ray interpretation. The dataset includes uncertainty labels for 14 medical observations extracted from radiology reports. 
It serves as a benchmark for developing and evaluating automated chest radiograph interpretation models.", "paper_link": "https://arxiv.org/abs/1901.07031", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.021015+00:00", "updated_at": "2025-07-19T19:56:14.021015+00:00" } ================================================ FILE: data/benchmarks/cluewsc.json ================================================ { "benchmark_id": "cluewsc", "name": "CLUEWSC", "parent_benchmark_id": null, "categories": ["language", "reasoning"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "CLUEWSC2020 is the Chinese version of the Winograd Schema Challenge, part of the CLUE benchmark. It focuses on pronoun disambiguation and coreference resolution, requiring models to determine which noun a pronoun refers to in a sentence. The dataset contains 1,244 training samples and 304 development samples extracted from contemporary Chinese literature.", "paper_link": "https://arxiv.org/abs/2004.05986", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.233189+00:00", "updated_at": "2025-07-19T19:56:12.233189+00:00" } ================================================ FILE: data/benchmarks/cmmlu.json ================================================ { "benchmark_id": "cmmlu", "name": "CMMLU", "parent_benchmark_id": null, "categories": ["language", "reasoning", "general"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "CMMLU (Chinese Massive Multitask Language Understanding) is a comprehensive Chinese benchmark that evaluates the knowledge and reasoning capabilities of large language models across 67 different subject topics. 
The benchmark covers natural sciences, social sciences, engineering, and humanities with multiple-choice questions ranging from basic to advanced professional levels.", "paper_link": "https://arxiv.org/abs/2306.09212", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.941108+00:00", "updated_at": "2025-07-19T19:56:14.941108+00:00" } ================================================ FILE: data/benchmarks/cnmo-2024.json ================================================ { "benchmark_id": "cnmo-2024", "name": "CNMO 2024", "parent_benchmark_id": null, "categories": ["math"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "China Mathematical Olympiad 2024 - A challenging mathematics competition.", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-05T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/codeforces.json ================================================ { "benchmark_id": "codeforces", "name": "CodeForces", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 3000.0, "language": "en", "description": "A competitive programming benchmark using problems from the CodeForces platform. The benchmark evaluates code generation capabilities of LLMs on algorithmic problems with difficulty ratings ranging from 800 to 2400. 
Problems cover diverse algorithmic categories including dynamic programming, graph algorithms, data structures, and mathematical problems with standardized evaluation through direct platform submission.", "paper_link": "https://arxiv.org/abs/2501.01257", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.624663+00:00", "updated_at": "2025-07-19T19:56:14.624663+00:00" } ================================================ FILE: data/benchmarks/codegolf-v2.2.json ================================================ { "benchmark_id": "codegolf-v2.2", "name": "Codegolf v2.2", "parent_benchmark_id": null, "categories": ["code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Codegolf v2.2 benchmark", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.778275+00:00", "updated_at": "2025-07-19T19:56:13.778275+00:00" } ================================================ FILE: data/benchmarks/collie.json ================================================ { "benchmark_id": "collie", "name": "COLLIE", "parent_benchmark_id": null, "categories": ["language", "reasoning", "writing"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "COLLIE is a grammar-based framework for systematic construction of constrained text generation tasks. It allows specification of rich, compositional constraints across diverse generation levels and modeling challenges including language understanding, logical reasoning, and semantic planning. 
The COLLIE-v1 dataset contains 2,080 instances across 13 constraint structures.", "paper_link": "https://arxiv.org/abs/2307.08689", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.250323+00:00", "updated_at": "2025-07-19T19:56:15.250323+00:00" } ================================================ FILE: data/benchmarks/common-voice-15.json ================================================ { "benchmark_id": "common-voice-15", "name": "Common Voice 15", "parent_benchmark_id": null, "categories": ["audio", "speech-to-text", "language"], "modality": "audio", "multilingual": true, "max_score": 100.0, "language": "en", "description": "Common Voice is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Version 15.0 contains 28,750 recorded hours across 114 languages, consisting of crowdsourced voice recordings with corresponding transcriptions.", "paper_link": "https://arxiv.org/abs/1912.06670", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.830793+00:00", "updated_at": "2025-07-19T19:56:14.830793+00:00" } ================================================ FILE: data/benchmarks/commonsenseqa.json ================================================ { "benchmark_id": "commonsenseqa", "name": "CommonSenseQA", "parent_benchmark_id": null, "categories": ["reasoning", "language"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "CommonSenseQA is a multiple-choice question answering dataset that requires different types of commonsense knowledge to predict correct answers. It contains 12,102 questions with one correct answer and four distractors, designed to test semantic reasoning and conceptual relationships. 
Questions are created based on ConceptNet concepts and require prior world knowledge for accurate reasoning.", "paper_link": "https://arxiv.org/abs/1811.00937", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.129679+00:00", "updated_at": "2025-07-19T19:56:15.129679+00:00" } ================================================ FILE: data/benchmarks/complexfuncbench.json ================================================ { "benchmark_id": "complexfuncbench", "name": "ComplexFuncBench", "parent_benchmark_id": null, "categories": ["long_context", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "ComplexFuncBench is a benchmark designed to evaluate large language models' capabilities in handling complex function calling scenarios. It encompasses multi-step and constrained function calling tasks that require long-parameter filling, parameter value reasoning, and managing contexts up to 128k tokens. The benchmark includes 1,000 samples across five real-world scenarios.", "paper_link": "https://arxiv.org/abs/2501.10132", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.336577+00:00", "updated_at": "2025-07-19T19:56:15.336577+00:00" } ================================================ FILE: data/benchmarks/covost2-en-zh.json ================================================ { "benchmark_id": "covost2-en-zh", "name": "CoVoST2 en-zh", "parent_benchmark_id": null, "categories": ["audio", "speech-to-text", "language"], "modality": "audio", "multilingual": true, "max_score": 100.0, "language": "en", "description": "CoVoST 2 English-to-Chinese subset is part of the large-scale multilingual speech translation corpus derived from Common Voice. 
This subset focuses specifically on English to Chinese speech translation tasks within the broader CoVoST 2 dataset.", "paper_link": "https://arxiv.org/abs/2007.10310", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.825578+00:00", "updated_at": "2025-07-19T19:56:14.825578+00:00" } ================================================ FILE: data/benchmarks/covost2.json ================================================ { "benchmark_id": "covost2", "name": "CoVoST2", "parent_benchmark_id": null, "categories": ["audio", "speech-to-text", "language"], "modality": "audio", "multilingual": true, "max_score": 1.0, "language": "en", "description": "CoVoST 2 is a large-scale multilingual speech translation corpus derived from Common Voice, covering translations from 21 languages into English and from English into 15 languages. The dataset contains 2,880 hours of speech with 78K speakers for speech translation research.", "paper_link": "https://arxiv.org/abs/2007.10310", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.958237+00:00", "updated_at": "2025-07-19T19:56:13.958237+00:00" } ================================================ FILE: data/benchmarks/crag.json ================================================ { "benchmark_id": "crag", "name": "CRAG", "parent_benchmark_id": null, "categories": ["reasoning", "search"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "CRAG (Comprehensive RAG Benchmark) is a factual question answering benchmark consisting of 4,409 question-answer pairs across 5 domains (finance, sports, music, movie, open domain) and 8 question categories. The benchmark includes mock APIs to simulate web and Knowledge Graph search, designed to represent the diverse and dynamic nature of real-world QA tasks with temporal dynamism ranging from years to seconds. 
It evaluates retrieval-augmented generation systems for trustworthy question answering.", "paper_link": "https://arxiv.org/abs/2406.04744", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.741280+00:00", "updated_at": "2025-07-19T19:56:12.741280+00:00" } ================================================ FILE: data/benchmarks/creative-writing-v3.json ================================================ { "benchmark_id": "creative-writing-v3", "name": "Creative Writing v3", "parent_benchmark_id": null, "categories": ["creativity", "writing"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "EQ-Bench Creative Writing v3 is an LLM-judged creative writing benchmark that evaluates models across 32 writing prompts with 3 iterations per prompt. Uses a hybrid scoring system combining rubric assessment and Elo ratings through pairwise comparisons. Challenges models in areas like humor, romance, spatial awareness, and unique perspectives to assess emotional intelligence and creative writing capabilities.", "paper_link": "https://arxiv.org/abs/2312.06281", "implementation_link": null, "verified": false, "created_at": "2025-08-03T22:06:11.157942+00:00", "updated_at": "2025-08-03T22:06:11.157942+00:00" } ================================================ FILE: data/benchmarks/crperelation.json ================================================ { "benchmark_id": "crperelation", "name": "CRPErelation", "parent_benchmark_id": null, "categories": ["healthcare", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Clinical reasoning problems evaluation benchmark for assessing diagnostic reasoning and medical knowledge application capabilities.", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.834739+00:00", "updated_at": "2025-07-19T19:56:14.834739+00:00" } ================================================ 
FILE: data/benchmarks/crux-o.json ================================================ { "benchmark_id": "crux-o", "name": "CRUX-O", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 100.0, "language": "en", "description": "CRUXEval-O (output prediction) is part of the CRUXEval benchmark consisting of 800 Python functions (3-13 lines) designed to evaluate AI models' capabilities in code reasoning, understanding, and execution. The benchmark tests models' ability to predict correct function outputs given function code and inputs, focusing on short problems that a good human programmer should be able to solve in a minute.", "paper_link": "https://arxiv.org/abs/2401.03065", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.635245+00:00", "updated_at": "2025-07-19T19:56:14.635245+00:00" } ================================================ FILE: data/benchmarks/cruxeval-input-cot.json ================================================ { "benchmark_id": "cruxeval-input-cot", "name": "CRUXEval-Input-CoT", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "CRUXEval input prediction task with Chain of Thought (CoT) prompting. Part of the CRUXEval benchmark for code reasoning, understanding, and execution evaluation. Given a Python function and its expected output, the task is to predict the appropriate input using chain-of-thought reasoning. 
Consists of 800 Python functions (3-13 lines) designed to evaluate code comprehension and reasoning capabilities.", "paper_link": "https://arxiv.org/abs/2401.03065", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.551746+00:00", "updated_at": "2025-07-19T19:56:14.551746+00:00" } ================================================ FILE: data/benchmarks/cruxeval-o.json ================================================ { "benchmark_id": "cruxeval-o", "name": "CruxEval-O", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "CruxEval-O is the output prediction task of the CRUXEval benchmark, designed to evaluate code reasoning, understanding, and execution capabilities. It consists of 800 Python functions (3-13 lines) where models must predict the output given a function and input. The benchmark tests fundamental code execution reasoning abilities and goes beyond simple code generation to assess deeper understanding of program behavior.", "paper_link": "https://arxiv.org/abs/2401.03065", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.146592+00:00", "updated_at": "2025-07-19T19:56:15.146592+00:00" } ================================================ FILE: data/benchmarks/cruxeval-output-cot.json ================================================ { "benchmark_id": "cruxeval-output-cot", "name": "CRUXEval-Output-CoT", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "CRUXEval-O (output prediction) with Chain-of-Thought prompting. Part of the CRUXEval benchmark consisting of 800 Python functions (3-13 lines) designed to evaluate code reasoning, understanding, and execution capabilities. 
The output prediction task requires models to predict the output of a given Python function with specific inputs, evaluated using chain-of-thought reasoning methodology.", "paper_link": "https://arxiv.org/abs/2401.03065", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.555432+00:00", "updated_at": "2025-07-19T19:56:14.555432+00:00" } ================================================ FILE: data/benchmarks/csimpleqa.json ================================================ { "benchmark_id": "csimpleqa", "name": "CSimpleQA", "parent_benchmark_id": null, "categories": ["general", "language"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "Chinese SimpleQA is the first comprehensive Chinese benchmark to evaluate the factuality ability of language models to answer short questions. It contains 3,000 high-quality questions spanning 6 major topics with 99 diverse subtopics, designed to assess Chinese factual knowledge across humanities, science, engineering, culture, and society.", "paper_link": "https://arxiv.org/abs/2411.07140", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:11.931358+00:00", "updated_at": "2025-07-19T19:56:11.931358+00:00" } ================================================ FILE: data/benchmarks/cybersecurity-ctfs.json ================================================ { "benchmark_id": "cybersecurity-ctfs", "name": "Cybersecurity CTFs", "parent_benchmark_id": null, "categories": ["safety"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Cybersecurity Capture the Flag (CTF) benchmark for evaluating LLMs in offensive security challenges. 
Contains diverse cybersecurity tasks including cryptography, web exploitation, binary analysis, and forensics to assess AI capabilities in cybersecurity problem-solving.", "paper_link": "https://arxiv.org/abs/2406.05590", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.387055+00:00", "updated_at": "2025-07-19T19:56:15.387055+00:00" } ================================================ FILE: data/benchmarks/dermmcqa.json ================================================ { "benchmark_id": "dermmcqa", "name": "DermMCQA", "parent_benchmark_id": null, "categories": ["healthcare"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Dermatology multiple choice question assessment benchmark for evaluating medical knowledge and diagnostic reasoning in dermatological conditions and treatments.", "paper_link": "https://arxiv.org/abs/2309.06961", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.024498+00:00", "updated_at": "2025-07-19T19:56:14.024498+00:00" } ================================================ FILE: data/benchmarks/docvqa.json ================================================ { "benchmark_id": "docvqa", "name": "DocVQA", "parent_benchmark_id": null, "categories": ["vision", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A dataset for Visual Question Answering on document images containing 50,000 questions defined on 12,000+ document images. 
The benchmark tests AI's ability to understand document structure and content, requiring models to comprehend document layout and perform information retrieval to answer questions about document images.", "paper_link": "https://arxiv.org/abs/2007.00398", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.825214+00:00", "updated_at": "2025-07-19T19:56:12.825214+00:00" } ================================================ FILE: data/benchmarks/docvqatest.json ================================================ { "benchmark_id": "docvqatest", "name": "DocVQAtest", "parent_benchmark_id": null, "categories": ["vision", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "DocVQA is a Visual Question Answering benchmark on document images containing 50,000 questions defined on 12,000+ document images. The benchmark focuses on understanding document structure and content to answer questions about various document types including letters, memos, notes, and reports from the UCSF Industry Documents Library.", "paper_link": "https://arxiv.org/abs/2007.00398", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.579372+00:00", "updated_at": "2025-07-19T19:56:14.579372+00:00" } ================================================ FILE: data/benchmarks/drop.json ================================================ { "benchmark_id": "drop", "name": "DROP", "parent_benchmark_id": null, "categories": ["reasoning", "math"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "DROP (Discrete Reasoning Over Paragraphs) is a reading comprehension benchmark requiring discrete reasoning over paragraph content. 
It contains crowdsourced, adversarially-created questions that require resolving references and performing discrete operations like addition, counting, or sorting, demanding comprehensive paragraph understanding beyond paraphrase-and-entity-typing shortcuts.", "paper_link": "https://arxiv.org/abs/1903.00161", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.981569+00:00", "updated_at": "2025-07-19T19:56:12.981569+00:00" } ================================================ FILE: data/benchmarks/ds-arena-code.json ================================================ { "benchmark_id": "ds-arena-code", "name": "DS-Arena-Code", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Data Science Arena Code benchmark for evaluating LLMs on realistic data science code generation tasks. Tests capabilities in complex data processing, analysis, and programming across popular Python libraries used in data science workflows.", "paper_link": "https://arxiv.org/abs/2505.15621", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.057744+00:00", "updated_at": "2025-07-19T19:56:15.057744+00:00" } ================================================ FILE: data/benchmarks/ds-fim-eval.json ================================================ { "benchmark_id": "ds-fim-eval", "name": "DS-FIM-Eval", "parent_benchmark_id": null, "categories": ["general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "DeepSeek's internal Fill-in-the-Middle evaluation dataset for measuring code completion performance improvements in data science contexts", "paper_link": "https://arxiv.org/abs/2406.11931", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.053854+00:00", "updated_at": "2025-07-19T19:56:15.053854+00:00" } ================================================ FILE: 
data/benchmarks/eclektic.json ================================================ { "benchmark_id": "eclektic", "name": "ECLeKTic", "parent_benchmark_id": null, "categories": ["language", "reasoning"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "A multilingual closed-book question answering dataset that evaluates cross-lingual knowledge transfer in large language models across 12 languages, using knowledge-seeking questions based on Wikipedia articles that exist only in one language", "paper_link": "https://arxiv.org/abs/2502.21228", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.561292+00:00", "updated_at": "2025-07-19T19:56:13.561292+00:00" } ================================================ FILE: data/benchmarks/egoschema.json ================================================ { "benchmark_id": "egoschema", "name": "EgoSchema", "parent_benchmark_id": null, "categories": ["vision", "reasoning", "long_context"], "modality": "video", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A diagnostic benchmark for very long-form video language understanding consisting of over 5000 human curated multiple choice questions based on 3-minute video clips from Ego4D, covering a broad range of natural human activities and behaviors", "paper_link": "https://arxiv.org/abs/2308.09126", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.915240+00:00", "updated_at": "2025-07-19T19:56:12.915240+00:00" } ================================================ FILE: data/benchmarks/erqa.json ================================================ { "benchmark_id": "erqa", "name": "ERQA", "parent_benchmark_id": null, "categories": ["vision", "reasoning", "spatial_reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Embodied Reasoning Question Answering benchmark consisting of 400 multiple-choice visual 
questions across spatial reasoning, trajectory reasoning, action reasoning, state estimation, and multi-view reasoning for evaluating AI capabilities in physical world interactions", "paper_link": "https://arxiv.org/abs/2503.20020", "implementation_link": "https://github.com/embodiedreasoning/ERQA", "verified": false, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/evalplus.json ================================================ { "benchmark_id": "evalplus", "name": "EvalPlus", "parent_benchmark_id": null, "categories": ["reasoning", "code"], "modality": "text", "multilingual": false, "max_score": 100.0, "language": "en", "description": "A rigorous code synthesis evaluation framework that augments existing datasets with extensive test cases generated by LLM and mutation-based strategies to better assess functional correctness of generated code, including HumanEval+ with 80x more test cases", "paper_link": "https://arxiv.org/abs/2305.01210", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:11.793176+00:00", "updated_at": "2025-07-19T19:56:11.793176+00:00" } ================================================ FILE: data/benchmarks/facts-grounding.json ================================================ { "benchmark_id": "facts-grounding", "name": "FACTS Grounding", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A benchmark evaluating language models' ability to generate factually accurate and well-grounded responses based on long-form input context, comprising 1,719 examples with documents up to 32k tokens requiring detailed responses that are fully grounded in provided documents", "paper_link": "https://arxiv.org/abs/2501.03200", "implementation_link": null, "verified": false, "created_at": 
"2025-07-19T19:56:13.260285+00:00", "updated_at": "2025-07-19T19:56:13.260285+00:00" } ================================================ FILE: data/benchmarks/factscore.json ================================================ { "benchmark_id": "factscore", "name": "FActScore", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A fine-grained atomic evaluation metric for factual precision in long-form text generation that breaks generated text into atomic facts and computes the percentage supported by reliable knowledge sources, with automated assessment using retrieval and language models", "paper_link": "https://arxiv.org/abs/2305.14251", "implementation_link": null, "verified": false, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/finqa.json ================================================ { "benchmark_id": "finqa", "name": "FinQA", "parent_benchmark_id": null, "categories": ["finance", "math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A large-scale dataset for numerical reasoning over financial data with question-answering pairs written by financial experts, featuring complex numerical reasoning and understanding of heterogeneous representations with annotated gold reasoning programs for full explainability", "paper_link": "https://arxiv.org/abs/2109.00122", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.734486+00:00", "updated_at": "2025-07-19T19:56:12.734486+00:00" } ================================================ FILE: data/benchmarks/flenqa.json ================================================ { "benchmark_id": "flenqa", "name": "FlenQA", "parent_benchmark_id": null, "categories": ["reasoning", "long_context"], "modality": "text", 
"multilingual": false, "max_score": 1.0, "language": "en", "description": "Flexible Length Question Answering dataset for evaluating the impact of input length on reasoning performance of language models, featuring True/False questions embedded in contexts of varying lengths (250-3000 tokens) across three reasoning tasks: Monotone Relations, People In Rooms, and simplified Ruletaker", "paper_link": "https://arxiv.org/abs/2402.14848", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.277205+00:00", "updated_at": "2025-07-19T19:56:14.277205+00:00" } ================================================ FILE: data/benchmarks/fleurs.json ================================================ { "benchmark_id": "fleurs", "name": "FLEURS", "parent_benchmark_id": null, "categories": ["language", "speech-to-text"], "modality": "audio", "multilingual": true, "max_score": 100.0, "language": "en", "description": "Few-shot Learning Evaluation of Universal Representations of Speech - a parallel speech dataset in 102 languages built on FLoRes-101 with approximately 12 hours of speech supervision per language for tasks including ASR, speech language identification, translation and retrieval", "paper_link": "https://arxiv.org/abs/2205.12446", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.943695+00:00", "updated_at": "2025-07-19T19:56:13.943695+00:00" } ================================================ FILE: data/benchmarks/frames.json ================================================ { "benchmark_id": "frames", "name": "FRAMES", "parent_benchmark_id": null, "categories": ["reasoning", "search"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Factuality, Retrieval, And reasoning MEasurement Set - a unified evaluation dataset of 824 challenging multi-hop questions for testing retrieval-augmented generation systems across factuality, retrieval accuracy, and reasoning capabilities, 
requiring integration of 2-15 Wikipedia articles per question", "paper_link": "https://arxiv.org/abs/2409.12941", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.954436+00:00", "updated_at": "2025-07-19T19:56:14.954436+00:00" } ================================================ FILE: data/benchmarks/french-mmlu.json ================================================ { "benchmark_id": "french-mmlu", "name": "French MMLU", "parent_benchmark_id": null, "categories": ["general", "language", "reasoning"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "French version of MMLU-Pro, a multilingual benchmark for evaluating language models' cross-lingual reasoning capabilities across 14 diverse domains including mathematics, physics, chemistry, law, engineering, psychology, and health.", "paper_link": "https://arxiv.org/abs/2503.10497", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.134340+00:00", "updated_at": "2025-07-19T19:56:15.134340+00:00" } ================================================ FILE: data/benchmarks/frontiermath.json ================================================ { "benchmark_id": "frontiermath", "name": "FrontierMath", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A benchmark of hundreds of original, exceptionally challenging mathematics problems crafted and vetted by expert mathematicians, covering most major branches of modern mathematics from number theory and real analysis to algebraic geometry and category theory.", "paper_link": "https://arxiv.org/abs/2411.04872", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.179213+00:00", "updated_at": "2025-07-19T19:56:15.179213+00:00" } ================================================ FILE: data/benchmarks/functionalmath.json 
================================================ { "benchmark_id": "functionalmath", "name": "FunctionalMATH", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A functional variant of the MATH benchmark that tests language models' ability to generalize reasoning patterns across different problem instances, revealing the reasoning gap between static and functional performance.", "paper_link": "https://arxiv.org/abs/2402.19450", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.987516+00:00", "updated_at": "2025-07-19T19:56:13.987516+00:00" } ================================================ FILE: data/benchmarks/giantsteps-tempo.json ================================================ { "benchmark_id": "giantsteps-tempo", "name": "GiantSteps Tempo", "parent_benchmark_id": null, "categories": ["audio"], "modality": "audio", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A dataset for tempo estimation in electronic dance music containing 664 2-minute audio previews from Beatport, annotated from user corrections for evaluating automatic tempo estimation algorithms.", "paper_link": "https://archives.ismir.net/ismir2015/paper/000246.pdf", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.838584+00:00", "updated_at": "2025-07-19T19:56:14.838584+00:00" } ================================================ FILE: data/benchmarks/global-mmlu-lite.json ================================================ { "benchmark_id": "global-mmlu-lite", "name": "Global-MMLU-Lite", "parent_benchmark_id": null, "categories": ["general", "language", "reasoning"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "A lightweight version of Global MMLU benchmark that evaluates language models across multiple languages while addressing cultural and linguistic 
biases in multilingual evaluation.", "paper_link": "https://arxiv.org/abs/2412.03304", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.534515+00:00", "updated_at": "2025-07-19T19:56:13.534515+00:00" } ================================================ FILE: data/benchmarks/global-mmlu.json ================================================ { "benchmark_id": "global-mmlu", "name": "Global-MMLU", "parent_benchmark_id": null, "categories": ["general", "language", "reasoning"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "A comprehensive multilingual benchmark covering 42 languages that addresses cultural and linguistic biases in evaluation, with improved translation quality and culturally sensitive question subsets.", "paper_link": "https://arxiv.org/abs/2412.03304", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.747524+00:00", "updated_at": "2025-07-19T19:56:13.747524+00:00" } ================================================ FILE: data/benchmarks/gorilla-benchmark-api-bench.json ================================================ { "benchmark_id": "gorilla-benchmark-api-bench", "name": "Gorilla Benchmark API Bench", "parent_benchmark_id": null, "categories": ["reasoning", "code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "APIBench, a comprehensive dataset of over 11,000 instruction-API pairs from HuggingFace, TorchHub, and TensorHub APIs for evaluating language models' ability to generate accurate API calls.", "paper_link": "https://arxiv.org/abs/2305.15334", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.383584+00:00", "updated_at": "2025-07-19T19:56:14.383584+00:00" } ================================================ FILE: data/benchmarks/govreport.json ================================================ { "benchmark_id": "govreport", "name": "GovReport", 
"parent_benchmark_id": null, "categories": ["summarization", "long_context"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A long document summarization dataset consisting of reports from government research agencies including Congressional Research Service and U.S. Government Accountability Office, with significantly longer documents and summaries than other datasets.", "paper_link": "https://arxiv.org/abs/2104.02112", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.218809+00:00", "updated_at": "2025-07-19T19:56:14.218809+00:00" } ================================================ FILE: data/benchmarks/gpqa-biology.json ================================================ { "benchmark_id": "gpqa-biology", "name": "GPQA Biology", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Biology subset of GPQA, containing challenging multiple-choice questions written by domain experts in biology. These Google-proof questions require graduate-level knowledge and reasoning.", "paper_link": "https://arxiv.org/abs/2311.12022", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.391187+00:00", "updated_at": "2025-07-19T19:56:15.391187+00:00" } ================================================ FILE: data/benchmarks/gpqa-chemistry.json ================================================ { "benchmark_id": "gpqa-chemistry", "name": "GPQA Chemistry", "parent_benchmark_id": null, "categories": ["reasoning", "chemistry"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Chemistry subset of GPQA, containing challenging multiple-choice questions written by domain experts in chemistry. 
These Google-proof questions require graduate-level knowledge and reasoning.", "paper_link": "https://arxiv.org/abs/2311.12022", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.395806+00:00", "updated_at": "2025-07-19T19:56:15.395806+00:00" } ================================================ FILE: data/benchmarks/gpqa-physics.json ================================================ { "benchmark_id": "gpqa-physics", "name": "GPQA Physics", "parent_benchmark_id": null, "categories": ["reasoning", "physics"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Physics subset of GPQA, containing challenging multiple-choice questions written by domain experts in physics. These Google-proof questions require graduate-level knowledge and reasoning.", "paper_link": "https://arxiv.org/abs/2311.12022", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.400663+00:00", "updated_at": "2025-07-19T19:56:15.400663+00:00" } ================================================ FILE: data/benchmarks/gpqa.json ================================================ { "benchmark_id": "gpqa", "name": "GPQA", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry. 
Questions are Google-proof and extremely difficult, with PhD experts reaching 65% accuracy.", "paper_link": "https://arxiv.org/abs/2311.12022", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:11.588605+00:00", "updated_at": "2025-07-19T19:56:11.588605+00:00" } ================================================ FILE: data/benchmarks/graphwalks-bfs-%3C128k.json ================================================ { "benchmark_id": "graphwalks-bfs-<128k", "name": "Graphwalks BFS <128k", "parent_benchmark_id": null, "categories": ["reasoning", "spatial_reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A graph reasoning benchmark that evaluates language models' ability to perform breadth-first search (BFS) operations on graphs with context length under 128k tokens, returning nodes reachable at specified depths.", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.287324+00:00", "updated_at": "2025-07-19T19:56:15.287324+00:00" } ================================================ FILE: data/benchmarks/graphwalks-bfs-%3E128k.json ================================================ { "benchmark_id": "graphwalks-bfs->128k", "name": "Graphwalks BFS >128k", "parent_benchmark_id": null, "categories": ["reasoning", "spatial_reasoning", "long_context"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A graph reasoning benchmark that evaluates language models' ability to perform breadth-first search (BFS) operations on graphs with context length over 128k tokens, testing long-context reasoning capabilities.", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.295876+00:00", "updated_at": "2025-07-19T19:56:15.295876+00:00" } ================================================ FILE: data/benchmarks/graphwalks-parents-%3C128k.json 
================================================ { "benchmark_id": "graphwalks-parents-<128k", "name": "Graphwalks parents <128k", "parent_benchmark_id": null, "categories": ["reasoning", "spatial_reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A graph reasoning benchmark that evaluates language models' ability to find parent nodes in graphs with context length under 128k tokens, requiring understanding of graph structure and edge relationships.", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.303643+00:00", "updated_at": "2025-07-19T19:56:15.303643+00:00" } ================================================ FILE: data/benchmarks/graphwalks-parents-%3E128k.json ================================================ { "benchmark_id": "graphwalks-parents->128k", "name": "Graphwalks parents >128k", "parent_benchmark_id": null, "categories": ["reasoning", "spatial_reasoning", "long_context"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A graph reasoning benchmark that evaluates language models' ability to find parent nodes in graphs with context length over 128k tokens, testing long-context reasoning and graph structure understanding.", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.316836+00:00", "updated_at": "2025-07-19T19:56:15.316836+00:00" } ================================================ FILE: data/benchmarks/groundui-1k.json ================================================ { "benchmark_id": "groundui-1k", "name": "GroundUI-1K", "parent_benchmark_id": null, "categories": ["multimodal", "vision"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A subset of GroundUI-18K for UI grounding evaluation, where models must predict action coordinates on screenshots based on single-step instructions across 
web, desktop, and mobile platforms.", "paper_link": "https://arxiv.org/abs/2403.17918", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.758595+00:00", "updated_at": "2025-07-19T19:56:12.758595+00:00" } ================================================ FILE: data/benchmarks/gsm-8k-(cot).json ================================================ { "benchmark_id": "gsm-8k-(cot)", "name": "GSM-8K (CoT)", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Grade School Math 8K with Chain-of-Thought prompting, featuring 8.5K high-quality linguistically diverse grade school math word problems requiring multi-step reasoning and elementary arithmetic operations.", "paper_link": "https://arxiv.org/abs/2110.14168", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.360381+00:00", "updated_at": "2025-07-19T19:56:14.360381+00:00" } ================================================ FILE: data/benchmarks/gsm8k-chat.json ================================================ { "benchmark_id": "gsm8k-chat", "name": "GSM8K Chat", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Grade School Math 8K adapted for chat format evaluation, featuring 8.5K high-quality linguistically diverse grade school math word problems requiring multi-step reasoning and elementary arithmetic operations.", "paper_link": "https://arxiv.org/abs/2110.14168", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.101578+00:00", "updated_at": "2025-07-19T19:56:15.101578+00:00" } ================================================ FILE: data/benchmarks/gsm8k.json ================================================ { "benchmark_id": "gsm8k", "name": "GSM8k", "parent_benchmark_id": null, "categories": ["math", 
"reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Grade School Math 8K, a dataset of 8.5K high-quality linguistically diverse grade school math word problems requiring multi-step reasoning and elementary arithmetic operations.", "paper_link": "https://arxiv.org/abs/2110.14168", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:11.397385+00:00", "updated_at": "2025-07-19T19:56:11.397385+00:00" } ================================================ FILE: data/benchmarks/hallusion-bench.json ================================================ { "benchmark_id": "hallusion-bench", "name": "Hallusion Bench", "parent_benchmark_id": null, "categories": ["vision", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A comprehensive benchmark designed to evaluate image-context reasoning in large visual-language models (LVLMs) by challenging models with 346 images and 1,129 carefully crafted questions to assess language hallucination and visual illusion", "paper_link": "https://arxiv.org/abs/2310.14566", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.689507+00:00", "updated_at": "2025-07-19T19:56:14.689507+00:00" } ================================================ FILE: data/benchmarks/healthbench-hard.json ================================================ { "benchmark_id": "healthbench-hard", "name": "HealthBench Hard", "parent_benchmark_id": null, "categories": ["healthcare"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A challenging variation of HealthBench that evaluates large language models' performance and safety in healthcare through 5,000 multi-turn conversations with particularly rigorous evaluation criteria validated by 262 physicians from 60 countries", "paper_link": "https://arxiv.org/abs/2505.08775", "implementation_link": null, 
"verified": false, "created_at": "2025-08-05T19:56:13.424873+00:00", "updated_at": "2025-08-05T19:56:13.424873+00:00" } ================================================ FILE: data/benchmarks/healthbench.json ================================================ { "benchmark_id": "healthbench", "name": "HealthBench", "parent_benchmark_id": null, "categories": ["healthcare"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "An open-source benchmark for measuring performance and safety of large language models in healthcare, consisting of 5,000 multi-turn conversations evaluated by 262 physicians using 48,562 unique rubric criteria across health contexts and behavioral dimensions", "paper_link": "https://arxiv.org/abs/2505.08775", "implementation_link": null, "verified": false, "created_at": "2025-08-05T19:56:13.424873+00:00", "updated_at": "2025-08-05T19:56:13.424873+00:00" } ================================================ FILE: data/benchmarks/hellaswag.json ================================================ { "benchmark_id": "hellaswag", "name": "HellaSwag", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A challenging commonsense natural language inference dataset that uses Adversarial Filtering to create questions trivial for humans (>95% accuracy) but difficult for state-of-the-art models, requiring completion of sentence endings based on physical situations and everyday activities", "paper_link": "https://arxiv.org/abs/1905.07830", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:11.145630+00:00", "updated_at": "2025-07-19T19:56:11.145630+00:00" } ================================================ FILE: data/benchmarks/hiddenmath.json ================================================ { "benchmark_id": "hiddenmath", "name": "HiddenMath", "parent_benchmark_id": null, "categories": ["math", 
"reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Google DeepMind's internal mathematical reasoning benchmark that introduces novel problems not encountered during model training to evaluate true mathematical reasoning capabilities rather than memorization", "paper_link": "https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.424873+00:00", "updated_at": "2025-07-19T19:56:13.424873+00:00" } ================================================ FILE: data/benchmarks/hle.json ================================================ { "benchmark_id": "hle", "name": "HLE", "parent_benchmark_id": null, "categories": ["reasoning", "math"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Humanity's Last Exam (HLE) is a multi-modal academic benchmark with 2,500 questions across mathematics, humanities, and natural sciences, designed to test LLM capabilities at the frontier of human knowledge with unambiguous, verifiable solutions", "paper_link": "https://arxiv.org/abs/2501.14249", "implementation_link": null, "verified": false, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/hmmt-2025.json ================================================ { "benchmark_id": "hmmt-2025", "name": "HMMT 2025", "parent_benchmark_id": null, "categories": ["math"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Harvard-MIT Mathematics Tournament 2025 - A prestigious student-organized mathematics competition for high school students featuring two tournaments (November 2025 at MIT and February 2026 at Harvard) with individual tests, team rounds, and guts rounds", "paper_link": "http://web.mit.edu/HMMT/www/", 
"implementation_link": null, "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-05T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/hmmt25.json ================================================ { "benchmark_id": "hmmt25", "name": "HMMT25", "parent_benchmark_id": null, "categories": ["math"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Harvard-MIT Mathematics Tournament 2025 - A prestigious student-organized mathematics competition for high school students featuring two tournaments (November 2025 at MIT and February 2026 at Harvard) with individual tests, team rounds, and guts rounds", "paper_link": "http://web.mit.edu/HMMT/www/", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.061281+00:00", "updated_at": "2025-07-19T19:56:15.061281+00:00" } ================================================ FILE: data/benchmarks/humaneval+.json ================================================ { "benchmark_id": "humaneval+", "name": "HumanEval+", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Enhanced version of HumanEval that extends the original test cases by 80x using EvalPlus framework for rigorous evaluation of LLM-synthesized code functional correctness, detecting previously undetected wrong code", "paper_link": "https://arxiv.org/abs/2305.01210", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.062352+00:00", "updated_at": "2025-07-19T19:56:14.062352+00:00" } ================================================ FILE: data/benchmarks/humaneval-average.json ================================================ { "benchmark_id": "humaneval-average", "name": "HumanEval-Average", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, 
"max_score": 1.0, "language": "en", "description": "A variant of the HumanEval benchmark that measures functional correctness for synthesizing programs from docstrings, consisting of 164 original programming problems assessing language comprehension, algorithms, and simple mathematics", "paper_link": "https://arxiv.org/abs/2107.03374", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.171175+00:00", "updated_at": "2025-07-19T19:56:15.171175+00:00" } ================================================ FILE: data/benchmarks/humaneval-er.json ================================================ { "benchmark_id": "humaneval-er", "name": "HumanEval-ER", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A variant of the HumanEval benchmark that measures functional correctness for synthesizing programs from docstrings, consisting of 164 original programming problems assessing language comprehension, algorithms, and simple mathematics", "paper_link": "https://arxiv.org/abs/2107.03374", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.704744+00:00", "updated_at": "2025-07-19T19:56:12.704744+00:00" } ================================================ FILE: data/benchmarks/humaneval-mul.json ================================================ { "benchmark_id": "humaneval-mul", "name": "HumanEval-Mul", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "A multilingual variant of the HumanEval benchmark that measures functional correctness for synthesizing programs from docstrings, consisting of 164 original programming problems assessing language comprehension, algorithms, and simple mathematics", "paper_link": "https://arxiv.org/abs/2107.03374", "implementation_link": null, "verified": false, "created_at": 
"2025-07-19T19:56:15.032472+00:00", "updated_at": "2025-07-19T19:56:15.032472+00:00" } ================================================ FILE: data/benchmarks/humaneval-plus.json ================================================ { "benchmark_id": "humaneval-plus", "name": "HumanEval Plus", "parent_benchmark_id": null, "categories": ["reasoning", "code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Enhanced version of HumanEval that extends the original test cases by 80x using EvalPlus framework for rigorous evaluation of LLM-synthesized code functional correctness, detecting previously undetected wrong code", "paper_link": "https://arxiv.org/abs/2305.01210", "implementation_link": null, "verified": false, "created_at": "2025-08-03T22:06:10.921756+00:00", "updated_at": "2025-08-03T22:06:10.921756+00:00" } ================================================ FILE: data/benchmarks/humaneval.json ================================================ { "benchmark_id": "humaneval", "name": "HumanEval", "parent_benchmark_id": null, "categories": ["reasoning", "code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A benchmark that measures functional correctness for synthesizing programs from docstrings, consisting of 164 original programming problems assessing language comprehension, algorithms, and simple mathematics", "paper_link": "https://arxiv.org/abs/2107.03374", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.595263+00:00", "updated_at": "2025-07-19T19:56:12.595263+00:00" } ================================================ FILE: data/benchmarks/humanevalfim-average.json ================================================ { "benchmark_id": "humanevalfim-average", "name": "HumanEvalFIM-Average", "parent_benchmark_id": null, "categories": ["general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": 
"Average evaluation of HumanEval Fill-in-the-Middle benchmark variants (single-line, multi-line, random-span) for assessing code infilling capabilities of language models", "paper_link": "https://arxiv.org/abs/2207.14255", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.160562+00:00", "updated_at": "2025-07-19T19:56:15.160562+00:00" } ================================================ FILE: data/benchmarks/humanity's-last-exam.json ================================================ { "benchmark_id": "humanity's-last-exam", "name": "Humanity's Last Exam", "parent_benchmark_id": null, "categories": ["general"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A multi-modal benchmark at the frontier of human knowledge with 2,500 questions across dozens of subjects including mathematics, humanities, and natural sciences, created by nearly 1000 subject expert contributors from over 500 institutions", "paper_link": "https://arxiv.org/abs/2501.14249", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.507693+00:00", "updated_at": "2025-07-19T19:56:12.507693+00:00" } ================================================ FILE: data/benchmarks/if.json ================================================ { "benchmark_id": "if", "name": "IF", "parent_benchmark_id": null, "categories": ["general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Instruction-Following Evaluation (IFEval) benchmark for large language models, focusing on verifiable instructions with 25 types of instructions and around 500 prompts containing one or more verifiable constraints", "paper_link": "https://arxiv.org/abs/2311.07911", "implementation_link": null, "verified": false, "created_at": "2025-08-03T22:06:11.089394+00:00", "updated_at": "2025-08-03T22:06:11.089394+00:00" } ================================================ FILE: 
data/benchmarks/ifeval.json ================================================ { "benchmark_id": "ifeval", "name": "IFEval", "parent_benchmark_id": null, "categories": ["general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Instruction-Following Evaluation (IFEval) benchmark for large language models, focusing on verifiable instructions with 25 types of instructions and around 500 prompts containing one or more verifiable constraints", "paper_link": "https://arxiv.org/abs/2311.07911", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.241350+00:00", "updated_at": "2025-07-19T19:56:12.241350+00:00" } ================================================ FILE: data/benchmarks/include.json ================================================ { "benchmark_id": "include", "name": "Include", "parent_benchmark_id": null, "categories": ["general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Include benchmark - specific documentation not found in official sources", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.724387+00:00", "updated_at": "2025-07-19T19:56:13.724387+00:00" } ================================================ FILE: data/benchmarks/infinitebench-en.mc.json ================================================ { "benchmark_id": "infinitebench-en.mc", "name": "InfiniteBench/En.MC", "parent_benchmark_id": null, "categories": ["long_context"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "InfiniteBench English Multiple Choice variant - first LLM benchmark featuring average data length surpassing 100K tokens for evaluating long-context capabilities with 12 tasks spanning diverse domains", "paper_link": "https://arxiv.org/abs/2402.13718", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.461508+00:00", 
"updated_at": "2025-07-19T19:56:14.461508+00:00" } ================================================ FILE: data/benchmarks/infinitebench-en.qa.json ================================================ { "benchmark_id": "infinitebench-en.qa", "name": "InfiniteBench/En.QA", "parent_benchmark_id": null, "categories": ["long_context"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "InfiniteBench English Question Answering variant - first LLM benchmark featuring average data length surpassing 100K tokens for evaluating long-context capabilities with 12 tasks spanning diverse domains", "paper_link": "https://arxiv.org/abs/2402.13718", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.457927+00:00", "updated_at": "2025-07-19T19:56:14.457927+00:00" } ================================================ FILE: data/benchmarks/infographicsqa.json ================================================ { "benchmark_id": "infographicsqa", "name": "InfographicsQA", "parent_benchmark_id": null, "categories": ["vision", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "InfographicVQA dataset with 5,485 infographic images and over 30,000 questions requiring joint reasoning over document layout, textual content, graphical elements, and data visualizations with elementary reasoning and arithmetic skills", "paper_link": "https://arxiv.org/abs/2104.12756", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.417669+00:00", "updated_at": "2025-07-19T19:56:14.417669+00:00" } ================================================ FILE: data/benchmarks/infovqa.json ================================================ { "benchmark_id": "infovqa", "name": "InfoVQA", "parent_benchmark_id": null, "categories": ["vision", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "InfoVQA 
dataset with 30,000 questions and 5,000 infographic images requiring joint reasoning over document layout, textual content, graphical elements, and data visualizations with elementary reasoning and arithmetic skills", "paper_link": "https://arxiv.org/abs/2104.12756", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.601294+00:00", "updated_at": "2025-07-19T19:56:13.601294+00:00" } ================================================ FILE: data/benchmarks/infovqatest.json ================================================ { "benchmark_id": "infovqatest", "name": "InfoVQAtest", "parent_benchmark_id": null, "categories": ["vision", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "InfoVQA test set with infographic images requiring joint reasoning over document layout, textual content, graphical elements, and data visualizations with elementary reasoning and arithmetic skills", "paper_link": "https://arxiv.org/abs/2104.12756", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.583939+00:00", "updated_at": "2025-07-19T19:56:14.583939+00:00" } ================================================ FILE: data/benchmarks/instruct-humaneval.json ================================================ { "benchmark_id": "instruct-humaneval", "name": "Instruct HumanEval", "parent_benchmark_id": null, "categories": ["general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Instruction-based variant of HumanEval benchmark for evaluating large language models' code generation capabilities with functional correctness using pass@k metric on programming problems", "paper_link": "https://arxiv.org/abs/2107.03374", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.105488+00:00", "updated_at": "2025-07-19T19:56:15.105488+00:00" } ================================================ FILE: 
data/benchmarks/intergps.json ================================================ { "benchmark_id": "intergps", "name": "InterGPS", "parent_benchmark_id": null, "categories": ["math", "spatial_reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Interpretable Geometry Problem Solver (Inter-GPS) with Geometry3K dataset of 3,002 geometry problems with dense annotation in formal language using theorem knowledge and symbolic reasoning", "paper_link": "https://arxiv.org/abs/2105.04165", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.259321+00:00", "updated_at": "2025-07-19T19:56:14.259321+00:00" } ================================================ FILE: data/benchmarks/internal-api-instruction-following-(hard).json ================================================ { "benchmark_id": "internal-api-instruction-following-(hard)", "name": "Internal API instruction following (hard)", "parent_benchmark_id": null, "categories": ["general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Internal API instruction following (hard) benchmark - specific documentation not found in official sources", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.222560+00:00", "updated_at": "2025-07-19T19:56:15.222560+00:00" } ================================================ FILE: data/benchmarks/lbpp-(v2).json ================================================ { "benchmark_id": "lbpp-(v2)", "name": "LBPP (v2)", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "LBPP (v2) benchmark - specific documentation not found in official sources, possibly related to language-based planning problems", "paper_link": "https://arxiv.org/abs/2206.10498", "implementation_link": null, "verified": false, "created_at": 
"2025-07-19T19:56:14.053535+00:00", "updated_at": "2025-07-19T19:56:14.053535+00:00" } ================================================ FILE: data/benchmarks/livebench-20241125.json ================================================ { "benchmark_id": "livebench-20241125", "name": "LiveBench 20241125", "parent_benchmark_id": null, "categories": ["math", "reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "LiveBench is a challenging, contamination-limited LLM benchmark that addresses test set contamination by releasing new questions monthly based on recently-released datasets, arXiv papers, news articles, and IMDb movie synopses. It comprises tasks across math, coding, reasoning, language, instruction following, and data analysis with verifiable, objective ground-truth answers.", "paper_link": "https://arxiv.org/abs/2406.19314", "implementation_link": null, "verified": false, "created_at": "2025-08-03T22:06:11.046321+00:00", "updated_at": "2025-08-03T22:06:11.046321+00:00" } ================================================ FILE: data/benchmarks/livebench.json ================================================ { "benchmark_id": "livebench", "name": "LiveBench", "parent_benchmark_id": null, "categories": ["math", "reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "LiveBench is a challenging, contamination-limited LLM benchmark that addresses test set contamination by releasing new questions monthly based on recently-released datasets, arXiv papers, news articles, and IMDb movie synopses. 
It comprises tasks across math, coding, reasoning, language, instruction following, and data analysis with verifiable, objective ground-truth answers.", "paper_link": "https://arxiv.org/abs/2406.19314", "implementation_link": null, "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-05T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/livecodebench(01-09).json ================================================ { "benchmark_id": "livecodebench(01-09)", "name": "LiveCodeBench(01-09)", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.", "paper_link": "https://arxiv.org/abs/2403.07974", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.049594+00:00", "updated_at": "2025-07-19T19:56:15.049594+00:00" } ================================================ FILE: data/benchmarks/livecodebench-v5-24.12-25.2.json ================================================ { "benchmark_id": "livecodebench-v5-24.12-25.2", "name": "LiveCodeBench v5 24.12-25.2", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. 
It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.", "paper_link": "https://arxiv.org/abs/2403.07974", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.066180+00:00", "updated_at": "2025-07-19T19:56:12.066180+00:00" } ================================================ FILE: data/benchmarks/livecodebench-v5.json ================================================ { "benchmark_id": "livecodebench-v5", "name": "LiveCodeBench v5", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. 
Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.", "paper_link": "https://arxiv.org/abs/2403.07974", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.759330+00:00", "updated_at": "2025-07-19T19:56:13.759330+00:00" } ================================================ FILE: data/benchmarks/livecodebench-v6.json ================================================ { "benchmark_id": "livecodebench-v6", "name": "LiveCodeBench v6", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.", "paper_link": "https://arxiv.org/abs/2403.07974", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:11.785682+00:00", "updated_at": "2025-07-19T19:56:11.785682+00:00" } ================================================ FILE: data/benchmarks/livecodebench.json ================================================ { "benchmark_id": "livecodebench", "name": "LiveCodeBench", "parent_benchmark_id": null, "categories": ["reasoning", "general", "code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. 
It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.", "paper_link": "https://arxiv.org/abs/2403.07974", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.292229+00:00", "updated_at": "2025-07-19T19:56:13.292229+00:00" } ================================================ FILE: data/benchmarks/longbench-v2.json ================================================ { "benchmark_id": "longbench-v2", "name": "LongBench v2", "parent_benchmark_id": null, "categories": ["long_context", "reasoning", "general"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "LongBench v2 is a benchmark designed to assess the ability of LLMs to handle long-context problems requiring deep understanding and reasoning across real-world multitasks. 
It consists of 503 challenging multiple-choice questions with contexts ranging from 8k to 2M words across six major task categories: single-document QA, multi-document QA, long in-context learning, long-dialogue history understanding, code repository understanding, and long structured data understanding.", "paper_link": "https://arxiv.org/abs/2412.15204", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.029281+00:00", "updated_at": "2025-07-19T19:56:15.029281+00:00" } ================================================ FILE: data/benchmarks/longfact-concepts.json ================================================ { "benchmark_id": "longfact-concepts", "name": "LongFact Concepts", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "LongFact is a benchmark for evaluating long-form factuality in large language models. It comprises 2,280 fact-seeking prompts spanning 38 topics, designed to test a model's ability to generate accurate, long-form responses. The benchmark uses SAFE (Search-Augmented Factuality Evaluator) to evaluate factual accuracy.", "paper_link": "https://arxiv.org/abs/2403.18802", "implementation_link": null, "verified": false, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/longfact-objects.json ================================================ { "benchmark_id": "longfact-objects", "name": "LongFact Objects", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "LongFact is a benchmark for evaluating long-form factuality in large language models. It comprises 2,280 fact-seeking prompts spanning 38 topics, designed to test a model's ability to generate accurate, long-form responses. 
The benchmark uses SAFE (Search-Augmented Factuality Evaluator) to evaluate factual accuracy.", "paper_link": "https://arxiv.org/abs/2403.18802", "implementation_link": null, "verified": false, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/longvideobench.json ================================================ { "benchmark_id": "longvideobench", "name": "LongVideoBench", "parent_benchmark_id": null, "categories": ["vision", "long_context", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "LongVideoBench is a question-answering benchmark featuring video-language interleaved inputs up to an hour long. It includes 3,763 varying-length web-collected videos with subtitles across diverse themes and 6,678 human-annotated multiple-choice questions in 17 fine-grained categories for comprehensive evaluation of long-term multimodal understanding.", "paper_link": "https://arxiv.org/abs/2407.15754", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.730349+00:00", "updated_at": "2025-07-19T19:56:14.730349+00:00" } ================================================ FILE: data/benchmarks/lsat.json ================================================ { "benchmark_id": "lsat", "name": "LSAT", "parent_benchmark_id": null, "categories": ["reasoning", "legal", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "LSAT (Law School Admission Test) benchmark evaluating complex reasoning capabilities across three challenging tasks: analytical reasoning, logical reasoning, and reading comprehension. 
The LSAT measures skills considered essential for success in law school including critical thinking, reading comprehension of complex texts, and analysis of arguments.", "paper_link": "https://arxiv.org/abs/2108.00648", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.409871+00:00", "updated_at": "2025-07-19T19:56:15.409871+00:00" } ================================================ FILE: data/benchmarks/lvbench.json ================================================ { "benchmark_id": "lvbench", "name": "LVBench", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "long_context"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "LVBench is an extreme long video understanding benchmark designed to evaluate multimodal models on videos up to two hours in duration. It contains 6 major categories and 21 subcategories, with videos averaging five times longer than existing datasets. The benchmark addresses applications requiring comprehension of extremely long videos.", "paper_link": "https://arxiv.org/abs/2406.08035", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.724041+00:00", "updated_at": "2025-07-19T19:56:12.724041+00:00" } ================================================ FILE: data/benchmarks/math-(cot).json ================================================ { "benchmark_id": "math-(cot)", "name": "MATH (CoT)", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MATH dataset contains 12,500 challenging competition mathematics problems from AMC 10, AMC 12, AIME, and other mathematics competitions. Each problem includes full step-by-step solutions and spans multiple difficulty levels (1-5) across seven mathematical subjects. 
This variant uses Chain-of-Thought prompting to encourage step-by-step reasoning.", "paper_link": "https://arxiv.org/abs/2103.03874", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.366159+00:00", "updated_at": "2025-07-19T19:56:14.366159+00:00" } ================================================ FILE: data/benchmarks/math-500.json ================================================ { "benchmark_id": "math-500", "name": "MATH-500", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MATH-500 is a subset of the MATH dataset containing 500 challenging competition mathematics problems from AMC 10, AMC 12, AIME, and other mathematics competitions. Each problem includes full step-by-step solutions and spans multiple difficulty levels across seven mathematical subjects including Prealgebra, Algebra, Number Theory, Counting and Probability, Geometry, Intermediate Algebra, and Precalculus.", "paper_link": "https://arxiv.org/abs/2103.03874", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.027850+00:00", "updated_at": "2025-07-19T19:56:12.027850+00:00" } ================================================ FILE: data/benchmarks/math.json ================================================ { "benchmark_id": "math", "name": "MATH", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MATH dataset contains 12,500 challenging competition mathematics problems from AMC 10, AMC 12, AIME, and other mathematics competitions. 
Each problem includes full step-by-step solutions and spans multiple difficulty levels (1-5) across seven mathematical subjects including Prealgebra, Algebra, Number Theory, Counting and Probability, Geometry, Intermediate Algebra, and Precalculus.", "paper_link": "https://arxiv.org/abs/2103.03874", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:11.804258+00:00", "updated_at": "2025-07-19T19:56:11.804258+00:00" } ================================================ FILE: data/benchmarks/mathvision.json ================================================ { "benchmark_id": "mathvision", "name": "MathVision", "parent_benchmark_id": null, "categories": ["math", "vision", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MATH-Vision is a dataset designed to measure multimodal mathematical reasoning capabilities. It focuses on evaluating how well models can solve mathematical problems that require both visual understanding and mathematical reasoning, bridging the gap between visual and mathematical domains.", "paper_link": "https://arxiv.org/abs/2402.14804", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.695583+00:00", "updated_at": "2025-07-19T19:56:14.695583+00:00" } ================================================ FILE: data/benchmarks/mathvista-mini.json ================================================ { "benchmark_id": "mathvista-mini", "name": "MathVista-Mini", "parent_benchmark_id": null, "categories": ["math", "vision", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MathVista-Mini is a smaller version of the MathVista benchmark that evaluates mathematical reasoning in visual contexts. 
It consists of examples derived from multimodal datasets involving mathematics, combining challenges from diverse mathematical and visual tasks to assess foundation models' ability to solve problems requiring both visual understanding and mathematical reasoning.", "paper_link": "https://arxiv.org/abs/2310.02255", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.654470+00:00", "updated_at": "2025-07-19T19:56:13.654470+00:00" } ================================================ FILE: data/benchmarks/mathvista.json ================================================ { "benchmark_id": "mathvista", "name": "MathVista", "parent_benchmark_id": null, "categories": ["math", "vision", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MathVista evaluates mathematical reasoning of foundation models in visual contexts. It consists of 6,141 examples derived from 28 existing multimodal datasets and 3 newly created datasets (IQTest, FunctionQA, and PaperQA), combining challenges from diverse mathematical and visual tasks to assess models' ability to understand complex figures and perform rigorous reasoning.", "paper_link": "https://arxiv.org/abs/2310.02255", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.069611+00:00", "updated_at": "2025-07-19T19:56:12.069611+00:00" } ================================================ FILE: data/benchmarks/mbpp+.json ================================================ { "benchmark_id": "mbpp+", "name": "MBPP+", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MBPP+ is an enhanced version of MBPP (Mostly Basic Python Problems) with significantly more test cases (35x) for more rigorous evaluation. 
MBPP is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers, covering programming fundamentals and standard library functionality.", "paper_link": "https://arxiv.org/abs/2108.07732", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.501855+00:00", "updated_at": "2025-07-19T19:56:14.501855+00:00" } ================================================ FILE: data/benchmarks/mbpp-++-base-version.json ================================================ { "benchmark_id": "mbpp-++-base-version", "name": "MBPP ++ base version", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. Each problem consists of a task description, code solution, and 3 automated test cases covering programming fundamentals and standard library functionality. This is an enhanced version with additional test cases.", "paper_link": "https://arxiv.org/abs/2108.07732", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.341560+00:00", "updated_at": "2025-07-19T19:56:14.341560+00:00" } ================================================ FILE: data/benchmarks/mbpp-evalplus-(base).json ================================================ { "benchmark_id": "mbpp-evalplus-(base)", "name": "MBPP EvalPlus (base)", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. 
EvalPlus extends MBPP with significantly more test cases (35x) for more rigorous evaluation of LLM-synthesized code, providing high-quality and precise evaluation.", "paper_link": "https://arxiv.org/abs/2108.07732", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.421722+00:00", "updated_at": "2025-07-19T19:56:14.421722+00:00" } ================================================ FILE: data/benchmarks/mbpp-evalplus.json ================================================ { "benchmark_id": "mbpp-evalplus", "name": "MBPP EvalPlus", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. EvalPlus extends MBPP with significantly more test cases (35x) for more rigorous evaluation of LLM-synthesized code, providing high-quality and precise evaluation.", "paper_link": "https://arxiv.org/abs/2108.07732", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.425667+00:00", "updated_at": "2025-07-19T19:56:14.425667+00:00" } ================================================ FILE: data/benchmarks/mbpp-pass@1.json ================================================ { "benchmark_id": "mbpp-pass@1", "name": "MBPP pass@1", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. Each problem consists of a task description, code solution, and 3 automated test cases. 
This variant uses pass@1 evaluation metric measuring the percentage of problems solved correctly on the first attempt.", "paper_link": "https://arxiv.org/abs/2108.07732", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.138778+00:00", "updated_at": "2025-07-19T19:56:15.138778+00:00" } ================================================ FILE: data/benchmarks/mbpp-plus.json ================================================ { "benchmark_id": "mbpp-plus", "name": "MBPP Plus", "parent_benchmark_id": null, "categories": ["reasoning", "code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. Each problem consists of a task description, code solution, and 3 automated test cases covering programming fundamentals and standard library functionality. This is an enhanced version with additional test cases for more rigorous evaluation.", "paper_link": "https://arxiv.org/abs/2108.07732", "implementation_link": null, "verified": false, "created_at": "2025-08-03T22:06:11.143382+00:00", "updated_at": "2025-08-03T22:06:11.143382+00:00" } ================================================ FILE: data/benchmarks/mbpp.json ================================================ { "benchmark_id": "mbpp", "name": "MBPP", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 100.0, "language": "en", "description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. 
Each problem consists of a task description, code solution, and 3 automated test cases covering programming fundamentals and standard library functionality.", "paper_link": "https://arxiv.org/abs/2108.07732", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.453370+00:00", "updated_at": "2025-07-19T19:56:13.453370+00:00" } ================================================ FILE: data/benchmarks/medxpertqa.json ================================================ { "benchmark_id": "medxpertqa", "name": "MedXpertQA", "parent_benchmark_id": null, "categories": ["healthcare", "reasoning", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A comprehensive benchmark to evaluate expert-level medical knowledge and advanced reasoning, featuring 4,460 questions spanning 17 specialties and 11 body systems. Includes both text-only and multimodal subsets with expert-level exam questions incorporating diverse medical images and rich clinical information.", "paper_link": "https://arxiv.org/abs/2501.18362", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.040381+00:00", "updated_at": "2025-07-19T19:56:14.040381+00:00" } ================================================ FILE: data/benchmarks/mega-mlqa.json ================================================ { "benchmark_id": "mega-mlqa", "name": "MEGA MLQA", "parent_benchmark_id": null, "categories": ["language", "reasoning"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "MLQA as part of the MEGA (Multilingual Evaluation of Generative AI) benchmark suite. 
A multi-way aligned extractive QA evaluation benchmark for cross-lingual question answering across 7 languages (English, Arabic, German, Spanish, Hindi, Vietnamese, and Simplified Chinese) with over 12K QA instances in English and 5K in each other language.", "paper_link": "https://arxiv.org/abs/2303.12528", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.187404+00:00", "updated_at": "2025-07-19T19:56:14.187404+00:00" } ================================================ FILE: data/benchmarks/mega-tydi-qa.json ================================================ { "benchmark_id": "mega-tydi-qa", "name": "MEGA TyDi QA", "parent_benchmark_id": null, "categories": ["language", "reasoning"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "TyDi QA as part of the MEGA benchmark suite. A question answering dataset covering 11 typologically diverse languages (Arabic, Bengali, English, Finnish, Indonesian, Japanese, Korean, Russian, Swahili, Telugu, and Thai) with 204K question-answer pairs. Features realistic information-seeking questions written by people who want to know the answer but don't know it yet.", "paper_link": "https://arxiv.org/abs/2003.05002", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.192871+00:00", "updated_at": "2025-07-19T19:56:14.192871+00:00" } ================================================ FILE: data/benchmarks/mega-udpos.json ================================================ { "benchmark_id": "mega-udpos", "name": "MEGA UDPOS", "parent_benchmark_id": null, "categories": ["language"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "Universal Dependencies POS tagging as part of the MEGA benchmark suite. 
A multilingual part-of-speech tagging dataset based on Universal Dependencies treebanks, utilizing the universal POS tag set of 17 tags across 38 diverse languages from different language families. Used for evaluating multilingual POS tagging systems.", "paper_link": "https://arxiv.org/abs/2004.10643", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.198318+00:00", "updated_at": "2025-07-19T19:56:14.198318+00:00" } ================================================ FILE: data/benchmarks/mega-xcopa.json ================================================ { "benchmark_id": "mega-xcopa", "name": "MEGA XCOPA", "parent_benchmark_id": null, "categories": ["reasoning", "language"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "XCOPA (Cross-lingual Choice of Plausible Alternatives) as part of the MEGA benchmark suite. A typologically diverse multilingual dataset for causal commonsense reasoning in 11 languages, including resource-poor languages like Eastern Apurímac Quechua and Haitian Creole. Requires models to select which choice is the effect or cause of a given premise.", "paper_link": "https://arxiv.org/abs/2005.00333", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.205296+00:00", "updated_at": "2025-07-19T19:56:14.205296+00:00" } ================================================ FILE: data/benchmarks/mega-xstorycloze.json ================================================ { "benchmark_id": "mega-xstorycloze", "name": "MEGA XStoryCloze", "parent_benchmark_id": null, "categories": ["reasoning", "language"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "XStoryCloze as part of the MEGA benchmark suite. A cross-lingual story completion task that consists of professionally translated versions of the English StoryCloze dataset to 10 non-English languages. 
Requires models to predict the correct ending for a given four-sentence story, evaluating commonsense reasoning and narrative understanding.", "paper_link": "https://arxiv.org/abs/2303.12528", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.212479+00:00", "updated_at": "2025-07-19T19:56:14.212479+00:00" } ================================================ FILE: data/benchmarks/meld.json ================================================ { "benchmark_id": "meld", "name": "Meld", "parent_benchmark_id": null, "categories": ["multimodal", "psychology"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MELD (Multimodal EmotionLines Dataset) is a multimodal multi-party dataset for emotion recognition in conversations. Contains approximately 13,000 utterances from 1,433 dialogues extracted from the TV series Friends. Each utterance is annotated with emotion (Anger, Disgust, Sadness, Joy, Neutral, Surprise, Fear) and sentiment labels across audio, visual, and textual modalities.", "paper_link": "https://arxiv.org/abs/1810.02508", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.842977+00:00", "updated_at": "2025-07-19T19:56:14.842977+00:00" } ================================================ FILE: data/benchmarks/mgsm.json ================================================ { "benchmark_id": "mgsm", "name": "MGSM", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "MGSM (Multilingual Grade School Math) is a benchmark of grade-school math problems. Contains 250 grade-school math problems manually translated from the GSM8K dataset into ten typologically diverse languages: Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, and Telugu. 
Evaluates multilingual mathematical reasoning capabilities.", "paper_link": "https://arxiv.org/abs/2210.03057", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.669061+00:00", "updated_at": "2025-07-19T19:56:13.669061+00:00" } ================================================ FILE: data/benchmarks/mimic-cxr.json ================================================ { "benchmark_id": "mimic-cxr", "name": "MIMIC CXR", "parent_benchmark_id": null, "categories": ["healthcare", "vision", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MIMIC-CXR is a large publicly available dataset of chest radiographs with free-text radiology reports. Contains 377,110 images corresponding to 227,835 radiographic studies from 65,379 patients at Beth Israel Deaconess Medical Center. The dataset is de-identified and widely used for medical imaging research, automated report generation, and medical AI development.", "paper_link": "https://arxiv.org/abs/1901.07042", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.017221+00:00", "updated_at": "2025-07-19T19:56:14.017221+00:00" } ================================================ FILE: data/benchmarks/mlvu-m.json ================================================ { "benchmark_id": "mlvu-m", "name": "MLVU-M", "parent_benchmark_id": null, "categories": ["general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MLVU-M is the multiple-choice portion of MLVU (Multi-task Long Video Understanding), a benchmark that evaluates multimodal large language models on long videos. The reported score is the average accuracy across the multiple-choice tasks.", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.931298+00:00", "updated_at": "2025-07-19T19:56:14.931298+00:00" } ================================================ FILE: data/benchmarks/mlvu.json ================================================ { "benchmark_id": "mlvu", "name": "MLVU", "parent_benchmark_id": null, "categories": ["video", "multimodal", "long_context"], "modality": "multimodal",
"multilingual": false, "max_score": 1.0, "language": "en", "description": "A comprehensive benchmark for multi-task long video understanding that evaluates multimodal large language models on videos ranging from 3 minutes to 2 hours across 9 distinct tasks including reasoning, captioning, recognition, and summarization.", "paper_link": "https://arxiv.org/abs/2406.04264", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.755571+00:00", "updated_at": "2025-07-19T19:56:14.755571+00:00" } ================================================ FILE: data/benchmarks/mm-if-eval.json ================================================ { "benchmark_id": "mm-if-eval", "name": "MM IF-Eval", "parent_benchmark_id": null, "categories": ["multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A challenging multimodal instruction-following benchmark that includes both compose-level constraints for output responses and perception-level constraints tied to input images, with comprehensive evaluation pipeline.", "paper_link": "https://arxiv.org/abs/2504.07957", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.142939+00:00", "updated_at": "2025-07-19T19:56:15.142939+00:00" } ================================================ FILE: data/benchmarks/mm-mind2web.json ================================================ { "benchmark_id": "mm-mind2web", "name": "MM-Mind2Web", "parent_benchmark_id": null, "categories": ["multimodal", "frontend_development", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A multimodal web navigation benchmark comprising 2,000 open-ended tasks spanning 137 websites across 31 domains. 
Each task includes HTML documents paired with webpage screenshots, action sequences, and complex web interactions.", "paper_link": "https://arxiv.org/abs/2306.06070", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.753488+00:00", "updated_at": "2025-07-19T19:56:12.753488+00:00" } ================================================ FILE: data/benchmarks/mm-mt-bench.json ================================================ { "benchmark_id": "mm-mt-bench", "name": "MM-MT-Bench", "parent_benchmark_id": null, "categories": ["multimodal", "communication"], "modality": "multimodal", "multilingual": false, "max_score": 100.0, "language": "en", "description": "A multi-turn LLM-as-a-judge evaluation benchmark for testing multimodal instruction-tuned models' ability to follow user instructions in multi-turn dialogues and answer open-ended questions in a zero-shot manner.", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.880812+00:00", "updated_at": "2025-07-19T19:56:14.880812+00:00" } ================================================ FILE: data/benchmarks/mmau-music.json ================================================ { "benchmark_id": "mmau-music", "name": "MMAU Music", "parent_benchmark_id": null, "categories": ["audio", "multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A subset of the MMAU benchmark focused specifically on music understanding and reasoning tasks. 
Part of a comprehensive multimodal audio understanding benchmark that evaluates models on expert-level knowledge and complex reasoning across music audio clips.", "paper_link": "https://arxiv.org/abs/2410.19168", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.851711+00:00", "updated_at": "2025-07-19T19:56:14.851711+00:00" } ================================================ FILE: data/benchmarks/mmau-sound.json ================================================ { "benchmark_id": "mmau-sound", "name": "MMAU Sound", "parent_benchmark_id": null, "categories": ["audio", "multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A subset of the MMAU benchmark focused specifically on environmental sound understanding and reasoning tasks. Part of a comprehensive multimodal audio understanding benchmark that evaluates models on expert-level knowledge and complex reasoning across environmental sound clips.", "paper_link": "https://arxiv.org/abs/2410.19168", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.859503+00:00", "updated_at": "2025-07-19T19:56:14.859503+00:00" } ================================================ FILE: data/benchmarks/mmau-speech.json ================================================ { "benchmark_id": "mmau-speech", "name": "MMAU Speech", "parent_benchmark_id": null, "categories": ["audio", "multimodal", "reasoning", "speech-to-text"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A subset of the MMAU benchmark focused specifically on speech understanding and reasoning tasks. 
Part of a comprehensive multimodal audio understanding benchmark that evaluates models on expert-level knowledge and complex reasoning across speech audio clips.", "paper_link": "https://arxiv.org/abs/2410.19168", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.863540+00:00", "updated_at": "2025-07-19T19:56:14.863540+00:00" } ================================================ FILE: data/benchmarks/mmau.json ================================================ { "benchmark_id": "mmau", "name": "MMAU", "parent_benchmark_id": null, "categories": ["audio", "multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A massive multi-task audio understanding and reasoning benchmark comprising 10,000 carefully curated audio clips paired with human-annotated natural language questions spanning speech, environmental sounds, and music. Requires expert-level knowledge and complex reasoning across 27 distinct skills.", "paper_link": "https://arxiv.org/abs/2410.19168", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.846435+00:00", "updated_at": "2025-07-19T19:56:14.846435+00:00" } ================================================ FILE: data/benchmarks/mmbench-test.json ================================================ { "benchmark_id": "mmbench-test", "name": "MMBench_test", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Test set of MMBench, a bilingual benchmark for assessing multi-modal capabilities of vision-language models through multiple-choice questions in both English and Chinese, providing systematic evaluation across diverse vision-language tasks.", "paper_link": "https://arxiv.org/abs/2307.06281", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.607904+00:00", 
"updated_at": "2025-07-19T19:56:14.607904+00:00" } ================================================ FILE: data/benchmarks/mmbench-v1.1.json ================================================ { "benchmark_id": "mmbench-v1.1", "name": "MMBench-V1.1", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "reasoning"], "modality": "multimodal", "multilingual": true, "max_score": 1.0, "language": "en", "description": "Version 1.1 of MMBench, an improved bilingual benchmark for assessing multi-modal capabilities of vision-language models through multiple-choice questions in both English and Chinese, providing systematic evaluation across diverse vision-language tasks.", "paper_link": "https://arxiv.org/abs/2307.06281", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.868950+00:00", "updated_at": "2025-07-19T19:56:14.868950+00:00" } ================================================ FILE: data/benchmarks/mmbench-video.json ================================================ { "benchmark_id": "mmbench-video", "name": "MMBench-Video", "parent_benchmark_id": null, "categories": ["video", "multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A long-form multi-shot benchmark for holistic video understanding that incorporates approximately 600 web videos from YouTube spanning 16 major categories, with each video ranging from 30 seconds to 6 minutes. 
Includes roughly 2,000 original question-answer pairs covering 26 fine-grained capabilities.", "paper_link": "https://arxiv.org/abs/2406.14515", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.738914+00:00", "updated_at": "2025-07-19T19:56:14.738914+00:00" } ================================================ FILE: data/benchmarks/mmbench.json ================================================ { "benchmark_id": "mmbench", "name": "MMBench", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "reasoning"], "modality": "multimodal", "multilingual": true, "max_score": 1.0, "language": "en", "description": "A bilingual benchmark for assessing multi-modal capabilities of vision-language models through multiple-choice questions in both English and Chinese, providing systematic evaluation across diverse vision-language tasks with robust metrics.", "paper_link": "https://arxiv.org/abs/2307.06281", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.235585+00:00", "updated_at": "2025-07-19T19:56:14.235585+00:00" } ================================================ FILE: data/benchmarks/mme-realworld.json ================================================ { "benchmark_id": "mme-realworld", "name": "MME-RealWorld", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "general"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A comprehensive evaluation benchmark for Multimodal Large Language Models featuring 13,366 high-resolution images and 29,429 question-answer pairs across 43 subtasks and 5 real-world scenarios.
The largest manually annotated multimodal benchmark to date, designed to test MLLMs on challenging high-resolution real-world scenarios.", "paper_link": "https://arxiv.org/abs/2408.13257", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.877676+00:00", "updated_at": "2025-07-19T19:56:14.877676+00:00" } ================================================ FILE: data/benchmarks/mme.json ================================================ { "benchmark_id": "mme", "name": "MME", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A comprehensive evaluation benchmark for Multimodal Large Language Models measuring both perception and cognition abilities across 14 subtasks. Features manually designed instruction-answer pairs to avoid data leakage and provides systematic quantitative assessment of MLLM capabilities.", "paper_link": "https://arxiv.org/abs/2306.13394", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.022505+00:00", "updated_at": "2025-07-19T19:56:15.022505+00:00" } ================================================ FILE: data/benchmarks/mmlu-(cot).json ================================================ { "benchmark_id": "mmlu-(cot)", "name": "MMLU (CoT)", "parent_benchmark_id": null, "categories": ["language", "reasoning", "math", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Chain-of-Thought variant of the Massive Multitask Language Understanding benchmark, evaluating language models across 57 tasks including elementary mathematics, US history, computer science, law, and other professional and academic subjects. 
This version uses chain-of-thought prompting to elicit step-by-step reasoning.", "paper_link": "https://arxiv.org/abs/2009.03300", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.330830+00:00", "updated_at": "2025-07-19T19:56:14.330830+00:00" } ================================================ FILE: data/benchmarks/mmlu-base.json ================================================ { "benchmark_id": "mmlu-base", "name": "MMLU-Base", "parent_benchmark_id": null, "categories": ["language", "reasoning", "math", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Base version of the Massive Multitask Language Understanding benchmark, evaluating language models across 57 tasks including elementary mathematics, US history, computer science, law, and other professional and academic subjects. Designed to comprehensively measure the breadth and depth of a model's academic and professional understanding.", "paper_link": "https://arxiv.org/abs/2009.03300", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.562710+00:00", "updated_at": "2025-07-19T19:56:14.562710+00:00" } ================================================ FILE: data/benchmarks/mmlu-chat.json ================================================ { "benchmark_id": "mmlu-chat", "name": "MMLU Chat", "parent_benchmark_id": null, "categories": ["language", "reasoning", "math", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Chat-format variant of the Massive Multitask Language Understanding benchmark, evaluating language models across 57 tasks including elementary mathematics, US history, computer science, law, and other professional and academic subjects. 
This version uses conversational prompting format for model evaluation.", "paper_link": "https://arxiv.org/abs/2009.03300", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.095600+00:00", "updated_at": "2025-07-19T19:56:15.095600+00:00" } ================================================ FILE: data/benchmarks/mmlu-french.json ================================================ { "benchmark_id": "mmlu-french", "name": "MMLU French", "parent_benchmark_id": null, "categories": ["language", "reasoning", "math", "general"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "fr", "description": "French language variant of the Massive Multitask Language Understanding benchmark, evaluating language models across 57 tasks including elementary mathematics, US history, computer science, law, and other professional and academic subjects. This multilingual version tests model performance in French.", "paper_link": "https://arxiv.org/abs/2009.03300", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.175211+00:00", "updated_at": "2025-07-19T19:56:15.175211+00:00" } ================================================ FILE: data/benchmarks/mmlu-pro.json ================================================ { "benchmark_id": "mmlu-pro", "name": "MMLU-Pro", "parent_benchmark_id": null, "categories": ["language", "reasoning", "math", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A more robust and challenging multi-task language understanding benchmark that extends MMLU by expanding multiple-choice options from 4 to 10, eliminating trivial questions, and focusing on reasoning-intensive tasks. 
Features over 12,000 curated questions across 14 domains and causes a 16-33% accuracy drop compared to original MMLU.", "paper_link": "https://arxiv.org/abs/2406.01574", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:11.408351+00:00", "updated_at": "2025-07-19T19:56:11.408351+00:00" } ================================================ FILE: data/benchmarks/mmlu-prox.json ================================================ { "benchmark_id": "mmlu-prox", "name": "MMLU-ProX", "parent_benchmark_id": null, "categories": ["language", "reasoning", "math", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Extended version of MMLU-Pro providing additional challenging multiple-choice questions for evaluating language models across diverse academic and professional domains. Built on the foundation of the Massive Multitask Language Understanding benchmark framework.", "paper_link": "https://arxiv.org/abs/2406.01574", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.738623+00:00", "updated_at": "2025-07-19T19:56:13.738623+00:00" } ================================================ FILE: data/benchmarks/mmlu-redux-2.0.json ================================================ { "benchmark_id": "mmlu-redux-2.0", "name": "MMLU-redux-2.0", "parent_benchmark_id": null, "categories": ["language", "reasoning", "math", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A curated version of the MMLU benchmark featuring manually re-annotated 5,700 questions across 57 subjects to identify and correct errors in the original dataset. 
Addresses the 6.49% error rate found in MMLU and provides more reliable evaluation metrics for language models.", "paper_link": "https://arxiv.org/abs/2406.04127", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:11.518552+00:00", "updated_at": "2025-07-19T19:56:11.518552+00:00" } ================================================ FILE: data/benchmarks/mmlu-redux.json ================================================ { "benchmark_id": "mmlu-redux", "name": "MMLU-Redux", "parent_benchmark_id": null, "categories": ["language", "reasoning", "math", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "An improved version of the MMLU benchmark featuring manually re-annotated questions to identify and correct errors in the original dataset. Provides more reliable evaluation metrics for language models by addressing dataset quality issues found in the original MMLU.", "paper_link": "https://arxiv.org/abs/2406.04127", "implementation_link": null, "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-05T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/mmlu-stem.json ================================================ { "benchmark_id": "mmlu-stem", "name": "MMLU-STEM", "parent_benchmark_id": null, "categories": ["math", "reasoning", "physics", "chemistry"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "STEM-focused subset of the Massive Multitask Language Understanding benchmark, evaluating language models on science, technology, engineering, and mathematics topics including physics, chemistry, mathematics, and other technical subjects.", "paper_link": "https://arxiv.org/abs/2009.03300", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.495405+00:00", "updated_at": "2025-07-19T19:56:14.495405+00:00" } 
================================================ FILE: data/benchmarks/mmlu.json ================================================ { "benchmark_id": "mmlu", "name": "MMLU", "parent_benchmark_id": null, "categories": ["general", "reasoning", "language", "math"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Massive Multitask Language Understanding benchmark testing knowledge across 57 diverse subjects including STEM, humanities, social sciences, and professional domains.", "paper_link": "https://arxiv.org/abs/2009.03300", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:11.200416+00:00", "updated_at": "2025-07-19T19:56:11.200416+00:00" } ================================================ FILE: data/benchmarks/mmmlu.json ================================================ { "benchmark_id": "mmmlu", "name": "MMMLU", "parent_benchmark_id": null, "categories": ["language", "reasoning", "math", "general"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "Multilingual Massive Multitask Language Understanding dataset released by OpenAI, featuring professionally translated MMLU test questions across 14 languages including Arabic, Bengali, German, Spanish, French, Hindi, Indonesian, Italian, Japanese, Korean, Portuguese, Swahili, Yoruba, and Chinese. 
Contains approximately 15,908 multiple-choice questions per language covering 57 subjects.", "paper_link": "https://arxiv.org/abs/2009.03300", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.144789+00:00", "updated_at": "2025-07-19T19:56:14.144789+00:00" } ================================================ FILE: data/benchmarks/mmmu-(val).json ================================================ { "benchmark_id": "mmmu-(val)", "name": "MMMU (val)", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "reasoning", "general"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Validation set of the Massive Multi-discipline Multimodal Understanding and Reasoning benchmark. Features college-level multimodal questions across 6 core disciplines (Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, Tech & Engineering) spanning 30 subjects and 183 subfields with diverse image types including charts, diagrams, maps, and tables.", "paper_link": "https://arxiv.org/abs/2311.16502", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.593262+00:00", "updated_at": "2025-07-19T19:56:13.593262+00:00" } ================================================ FILE: data/benchmarks/mmmu-(validation).json ================================================ { "benchmark_id": "mmmu-(validation)", "name": "MMMU (validation)", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "reasoning", "general"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Validation set of the Massive Multi-discipline Multimodal Understanding and Reasoning benchmark. 
Features college-level multimodal questions across 6 core disciplines (Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, Tech & Engineering) spanning 30 subjects and 183 subfields with diverse image types including charts, diagrams, maps, and tables.", "paper_link": "https://arxiv.org/abs/2311.16502", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.118197+00:00", "updated_at": "2025-07-19T19:56:15.118197+00:00" } ================================================ FILE: data/benchmarks/mmmu-pro.json ================================================ { "benchmark_id": "mmmu-pro", "name": "MMMU-Pro", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "reasoning", "general"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A more robust multi-discipline multimodal understanding benchmark that enhances MMMU through a three-step process: filtering text-only answerable questions, augmenting candidate options, and introducing vision-only input settings. Model performance is significantly lower on MMMU-Pro than on the original MMMU (ranging from 16.8% to 26.9% across models), providing more rigorous evaluation that closely mimics real-world scenarios.", "paper_link": "https://arxiv.org/abs/2409.02813", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.282252+00:00", "updated_at": "2025-07-19T19:56:14.282252+00:00" } ================================================ FILE: data/benchmarks/mmmu.json ================================================ { "benchmark_id": "mmmu", "name": "MMMU", "parent_benchmark_id": null, "categories": ["multimodal", "reasoning", "general"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MMMU (Massive Multi-discipline Multimodal Understanding) is a benchmark designed to evaluate multimodal models on college-level subject knowledge and deliberate reasoning. 
Contains 11.5K meticulously collected multimodal questions from college exams, quizzes, and textbooks, covering six core disciplines: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering across 30 subjects and 183 subfields.", "paper_link": "https://arxiv.org/abs/2311.16502", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.130105+00:00", "updated_at": "2025-07-19T19:56:12.130105+00:00" } ================================================ FILE: data/benchmarks/mmmuval.json ================================================ { "benchmark_id": "mmmuval", "name": "MMMUval", "parent_benchmark_id": null, "categories": ["vision", "general", "reasoning", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Validation set for MMMU (Massive Multi-discipline Multimodal Understanding and Reasoning) benchmark, designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning across Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering.", "paper_link": "https://arxiv.org/abs/2311.16502", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.575948+00:00", "updated_at": "2025-07-19T19:56:14.575948+00:00" } ================================================ FILE: data/benchmarks/mmstar.json ================================================ { "benchmark_id": "mmstar", "name": "MMStar", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "reasoning", "general"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MMStar is an elite vision-indispensable multimodal benchmark comprising 1,500 challenge samples meticulously selected by humans to evaluate 6 core capabilities and 18 detailed axes. 
The benchmark addresses issues of visual content unnecessity and unintentional data leakage in existing multimodal evaluations.", "paper_link": "https://arxiv.org/abs/2403.20330", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.660584+00:00", "updated_at": "2025-07-19T19:56:14.660584+00:00" } ================================================ FILE: data/benchmarks/mmt-bench.json ================================================ { "benchmark_id": "mmt-bench", "name": "MMT-Bench", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "reasoning", "general"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MMT-Bench is a comprehensive multimodal benchmark for evaluating Large Vision-Language Models towards multitask AGI. It comprises 31,325 meticulously curated multi-choice visual questions from various multimodal scenarios such as vehicle driving and embodied navigation, covering 32 core meta-tasks and 162 subtasks in multimodal understanding.", "paper_link": "https://arxiv.org/abs/2404.16006", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.674184+00:00", "updated_at": "2025-07-19T19:56:14.674184+00:00" } ================================================ FILE: data/benchmarks/mmvet.json ================================================ { "benchmark_id": "mmvet", "name": "MMVet", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "reasoning", "general", "spatial_reasoning", "math"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MM-Vet is an evaluation benchmark that examines large multimodal models on complicated multimodal tasks requiring integrated capabilities. 
It assesses six core vision-language capabilities: recognition, knowledge, spatial awareness, language generation, OCR, and math through questions that require one or more of these capabilities.", "paper_link": "https://arxiv.org/abs/2308.02490", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.684742+00:00", "updated_at": "2025-07-19T19:56:14.684742+00:00" } ================================================ FILE: data/benchmarks/mmvetgpt4turbo.json ================================================ { "benchmark_id": "mmvetgpt4turbo", "name": "MMVetGPT4Turbo", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "reasoning", "general", "spatial_reasoning", "math"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MM-Vet evaluation using GPT-4 Turbo for scoring. This variant of MM-Vet examines large multimodal models on complicated multimodal tasks requiring integrated capabilities across six core vision-language abilities: recognition, knowledge, spatial awareness, language generation, OCR, and math.", "paper_link": "https://arxiv.org/abs/2308.02490", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.611567+00:00", "updated_at": "2025-07-19T19:56:14.611567+00:00" } ================================================ FILE: data/benchmarks/mobileminiwob++-sr.json ================================================ { "benchmark_id": "mobileminiwob++-sr", "name": "MobileMiniWob++_SR", "parent_benchmark_id": null, "categories": ["multimodal", "frontend_development"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MobileMiniWob++ SR (Success Rate) is an adaptation of the MiniWob++ web interaction benchmark for mobile Android environments within AndroidWorld. 
It comprises 92 web interaction tasks adapted for touch-based mobile interfaces, evaluating agents' ability to navigate and interact with web applications on mobile devices.", "paper_link": "https://arxiv.org/abs/2405.14573", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.816755+00:00", "updated_at": "2025-07-19T19:56:14.816755+00:00" } ================================================ FILE: data/benchmarks/mrcr-1m-(pointwise).json ================================================ { "benchmark_id": "mrcr-1m-(pointwise)", "name": "MRCR 1M (pointwise)", "parent_benchmark_id": null, "categories": ["long_context", "reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MRCR 1M (pointwise) is a variant of the Multi-Round Coreference Resolution benchmark that uses pointwise evaluation for ultra-long contexts (~1M tokens). This version evaluates each response independently rather than comparatively, testing models' absolute performance on long-context reasoning tasks.", "paper_link": "https://arxiv.org/abs/2409.12640", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.912789+00:00", "updated_at": "2025-07-19T19:56:13.912789+00:00" } ================================================ FILE: data/benchmarks/mrcr-1m.json ================================================ { "benchmark_id": "mrcr-1m", "name": "MRCR 1M", "parent_benchmark_id": null, "categories": ["long_context", "reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MRCR 1M is a variant of the Multi-Round Coreference Resolution benchmark designed for testing extremely long context capabilities with approximately 1 million tokens. 
It evaluates models' ability to maintain reasoning and attention across ultra-long conversations.", "paper_link": "https://arxiv.org/abs/2409.12640", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.954336+00:00", "updated_at": "2025-07-19T19:56:13.954336+00:00" } ================================================ FILE: data/benchmarks/mrcr-v2-(8-needle).json ================================================ { "benchmark_id": "mrcr-v2-(8-needle)", "name": "MRCR v2 (8-needle)", "parent_benchmark_id": null, "categories": ["long_context", "reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MRCR v2 (8-needle) is a variant of the Multi-Round Coreference Resolution benchmark that includes 8 needle items to retrieve from long contexts. This tests models' ability to simultaneously track and reason about multiple pieces of information across extended conversations.", "paper_link": "https://arxiv.org/abs/2409.12640", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.010914+00:00", "updated_at": "2025-07-19T19:56:14.010914+00:00" } ================================================ FILE: data/benchmarks/mrcr-v2.json ================================================ { "benchmark_id": "mrcr-v2", "name": "MRCR v2", "parent_benchmark_id": null, "categories": ["long_context", "reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MRCR v2 (Multi-Round Coreference Resolution version 2) is an enhanced version of the synthetic long-context reasoning task. 
It extends the original MRCR framework with improved evaluation criteria and additional complexity for testing models' ability to maintain attention and reasoning across extended contexts.", "paper_link": "https://arxiv.org/abs/2409.12640", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.963241+00:00", "updated_at": "2025-07-19T19:56:13.963241+00:00" } ================================================ FILE: data/benchmarks/mrcr.json ================================================ { "benchmark_id": "mrcr", "name": "MRCR", "parent_benchmark_id": null, "categories": ["long_context", "reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MRCR (Multi-Round Coreference Resolution) is a synthetic long-context reasoning task where models must navigate long conversations to reproduce specific model outputs. It tests the ability to distinguish between similar requests and reason about ordering while maintaining attention across extended contexts.", "paper_link": "https://arxiv.org/abs/2409.12640", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.887445+00:00", "updated_at": "2025-07-19T19:56:13.887445+00:00" } ================================================ FILE: data/benchmarks/mt-bench.json ================================================ { "benchmark_id": "mt-bench", "name": "MT-Bench", "parent_benchmark_id": null, "categories": ["communication", "reasoning", "general", "roleplay"], "modality": "text", "multilingual": false, "max_score": 100.0, "language": "en", "description": "MT-Bench is a challenging multi-turn benchmark that measures the ability of large language models to engage in coherent, informative, and engaging conversations. 
It uses strong LLMs as judges for scalable and explainable evaluation of multi-turn dialogue capabilities.", "paper_link": "https://arxiv.org/abs/2306.05685", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.516415+00:00", "updated_at": "2025-07-19T19:56:14.516415+00:00" } ================================================ FILE: data/benchmarks/mtvqa.json ================================================ { "benchmark_id": "mtvqa", "name": "MTVQA", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "text-to-image"], "modality": "multimodal", "multilingual": true, "max_score": 1.0, "language": "en", "description": "MTVQA (Multilingual Text-Centric Visual Question Answering) is the first benchmark featuring high-quality human expert annotations across 9 diverse languages, consisting of 6,778 question-answer pairs across 2,116 images. It addresses visual-textual misalignment problems in multilingual text-centric VQA.", "paper_link": "https://arxiv.org/abs/2405.11985", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.587333+00:00", "updated_at": "2025-07-19T19:56:14.587333+00:00" } ================================================ FILE: data/benchmarks/muirbench.json ================================================ { "benchmark_id": "muirbench", "name": "MuirBench", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A comprehensive benchmark for robust multi-image understanding capabilities of multimodal LLMs. Consists of 12 diverse multi-image tasks involving 10 categories of multi-image relations (e.g., multiview, temporal relations, narrative, complementary). 
Comprises 11,264 images and 2,600 multiple-choice questions created in a pairwise manner, where each standard instance is paired with an unanswerable variant for reliable assessment.", "paper_link": "https://arxiv.org/abs/2406.09411", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.888428+00:00", "updated_at": "2025-07-19T19:56:14.888428+00:00" } ================================================ FILE: data/benchmarks/multi-if.json ================================================ { "benchmark_id": "multi-if", "name": "Multi-IF", "parent_benchmark_id": null, "categories": ["reasoning", "communication", "language"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "Multi-IF benchmarks LLMs on multi-turn and multilingual instruction following. It expands upon IFEval by incorporating multi-turn sequences and translating English prompts into 7 other languages, resulting in 4,501 multilingual conversations with three turns each. The benchmark reveals that current leading LLMs struggle with maintaining accuracy in multi-turn instructions and shows higher error rates for non-Latin script languages.", "paper_link": "https://arxiv.org/abs/2410.15553", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.638787+00:00", "updated_at": "2025-07-19T19:56:14.638787+00:00" } ================================================ FILE: data/benchmarks/multi-swe-bench.json ================================================ { "benchmark_id": "multi-swe-bench", "name": "Multi-SWE-Bench", "parent_benchmark_id": null, "categories": ["reasoning", "code"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "A multilingual benchmark for issue resolving that evaluates Large Language Models' ability to resolve software issues across diverse programming ecosystems. 
Covers 7 programming languages (Java, TypeScript, JavaScript, Go, Rust, C, and C++) with 1,632 high-quality instances carefully annotated by 68 expert annotators. Addresses limitations of existing benchmarks that focus almost exclusively on Python.", "paper_link": "https://arxiv.org/abs/2504.02605", "implementation_link": null, "verified": false, "created_at": "2025-09-15T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/multichallenge-(o3-mini-grader).json ================================================ { "benchmark_id": "multichallenge-(o3-mini-grader)", "name": "MultiChallenge (o3-mini grader)", "parent_benchmark_id": null, "categories": ["reasoning", "language"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A realistic multi-turn conversation evaluation benchmark that challenges frontier LLMs across four key areas: instruction retention, inference memory, reliable versioned editing, and self-coherence. 
Despite near-perfect scores on existing benchmarks, frontier models achieve less than 50% accuracy on MultiChallenge.", "paper_link": "https://arxiv.org/abs/2501.17399", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.235758+00:00", "updated_at": "2025-07-19T19:56:15.235758+00:00" } ================================================ FILE: data/benchmarks/multichallenge.json ================================================ { "benchmark_id": "multichallenge", "name": "Multi-Challenge", "parent_benchmark_id": null, "categories": ["communication", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MultiChallenge is a realistic multi-turn conversation evaluation benchmark that challenges frontier LLMs across four key categories: instruction retention (maintaining instructions throughout conversations), inference memory (recalling and connecting details from previous turns), reliable versioned editing (adapting to evolving instructions during collaborative editing), and self-coherence (avoiding contradictions in responses). 
The benchmark evaluates models on sustained, contextually complex dialogues across diverse topics including travel planning, technical documentation, and professional communication.", "paper_link": "https://arxiv.org/abs/2501.17399", "implementation_link": null, "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-05T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/multilf.json ================================================ { "benchmark_id": "multilf", "name": "MultiLF", "parent_benchmark_id": null, "categories": ["general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MultiLF benchmark", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.628191+00:00", "updated_at": "2025-07-19T19:56:14.628191+00:00" } ================================================ FILE: data/benchmarks/multilingual-mgsm-(cot).json ================================================ { "benchmark_id": "multilingual-mgsm-(cot)", "name": "Multilingual MGSM (CoT)", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "Multilingual Grade School Math (MGSM) benchmark evaluates language models' chain-of-thought reasoning abilities across ten typologically diverse languages. 
Contains 250 grade-school math problems manually translated from the GSM8K dataset into languages including Bengali and Swahili.", "paper_link": "https://arxiv.org/abs/2210.03057", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.402248+00:00", "updated_at": "2025-07-19T19:56:14.402248+00:00" } ================================================ FILE: data/benchmarks/multilingual-mmlu.json ================================================ { "benchmark_id": "multilingual-mmlu", "name": "Multilingual MMLU", "parent_benchmark_id": null, "categories": ["general", "reasoning", "language"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "MMLU-ProX is a comprehensive multilingual benchmark covering 29 typologically diverse languages, building upon MMLU-Pro. Each language version consists of 11,829 identical questions enabling direct cross-linguistic comparisons. The benchmark evaluates large language models' reasoning capabilities across linguistic and cultural boundaries through challenging, reasoning-focused questions with 10 answer choices.", "paper_link": "https://arxiv.org/abs/2503.10497", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.139086+00:00", "updated_at": "2025-07-19T19:56:14.139086+00:00" } ================================================ FILE: data/benchmarks/multipl-e-humaneval.json ================================================ { "benchmark_id": "multipl-e-humaneval", "name": "Multipl-E HumanEval", "parent_benchmark_id": null, "categories": ["language", "general"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "MultiPL-E is a scalable and extensible approach to benchmarking neural code generation that translates unit test-driven code generation benchmarks across multiple programming languages. 
It extends the HumanEval benchmark to 18 additional programming languages, enabling evaluation of code generation models across diverse programming paradigms and providing insights into how models generalize programming knowledge across language boundaries.", "paper_link": "https://arxiv.org/abs/2208.08227", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.345081+00:00", "updated_at": "2025-07-19T19:56:14.345081+00:00" } ================================================ FILE: data/benchmarks/multipl-e-mbpp.json ================================================ { "benchmark_id": "multipl-e-mbpp", "name": "Multipl-E MBPP", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "MultiPL-E extends the Mostly Basic Python Problems (MBPP) benchmark to 18+ programming languages for evaluating multilingual code generation capabilities. MBPP contains 974 crowd-sourced programming problems designed to be solvable by entry-level programmers, covering programming fundamentals and standard library functionality. Each problem includes a task description, code solution, and automated test cases.", "paper_link": "https://arxiv.org/abs/2208.08227", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.353635+00:00", "updated_at": "2025-07-19T19:56:14.353635+00:00" } ================================================ FILE: data/benchmarks/multipl-e.json ================================================ { "benchmark_id": "multipl-e", "name": "MultiPL-E", "parent_benchmark_id": null, "categories": ["general", "language"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "MultiPL-E is a scalable and extensible system for translating unit test-driven code generation benchmarks to multiple programming languages. 
It extends HumanEval and MBPP Python benchmarks to 18 additional programming languages, enabling evaluation of neural code generation models across diverse programming paradigms and language features.", "paper_link": "https://arxiv.org/abs/2208.08227", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.311919+00:00", "updated_at": "2025-07-19T19:56:12.311919+00:00" } ================================================ FILE: data/benchmarks/musiccaps.json ================================================ { "benchmark_id": "musiccaps", "name": "MusicCaps", "parent_benchmark_id": null, "categories": ["audio", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MusicCaps is a dataset composed of 5,521 music examples, each labeled with an English aspect list and a free text caption written by musicians. The dataset contains 10-second music clips from AudioSet paired with rich textual descriptions that capture sonic qualities and musical elements like genre, mood, tempo, instrumentation, and rhythm. Created to support research in music-text understanding and generation tasks.", "paper_link": "https://arxiv.org/abs/2301.11325", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.892085+00:00", "updated_at": "2025-07-19T19:56:14.892085+00:00" } ================================================ FILE: data/benchmarks/musr.json ================================================ { "benchmark_id": "musr", "name": "MuSR", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MuSR (Multistep Soft Reasoning) is a benchmark for evaluating language models on multistep soft reasoning tasks specified in natural language narratives. 
Created through a neurosymbolic synthetic-to-natural generation algorithm, it generates complex reasoning scenarios like murder mysteries roughly 1000 words in length that challenge current LLMs including GPT-4. The benchmark tests chain-of-thought reasoning capabilities across domains involving commonsense reasoning about physical and social situations.", "paper_link": "https://arxiv.org/abs/2310.16049", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.708705+00:00", "updated_at": "2025-07-19T19:56:12.708705+00:00" } ================================================ FILE: data/benchmarks/mvbench.json ================================================ { "benchmark_id": "mvbench", "name": "MVBench", "parent_benchmark_id": null, "categories": ["vision", "video", "multimodal", "spatial_reasoning", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A comprehensive multi-modal video understanding benchmark covering 20 challenging video tasks that require temporal understanding beyond single-frame analysis. Tasks span from perception to cognition, including action recognition, temporal reasoning, spatial reasoning, object interaction, scene transition, and counterfactual inference. 
Uses a novel static-to-dynamic method to systematically generate video tasks from existing annotations.", "paper_link": "https://arxiv.org/abs/2311.17005", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.615534+00:00", "updated_at": "2025-07-19T19:56:14.615534+00:00" } ================================================ FILE: data/benchmarks/natural-questions.json ================================================ { "benchmark_id": "natural-questions", "name": "Natural Questions", "parent_benchmark_id": null, "categories": ["reasoning", "general", "search"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Natural Questions is a question answering dataset featuring real anonymized queries issued to Google search engine. It contains 307,373 training examples where annotators provide long answers (passages) and short answers (entities) from Wikipedia pages, or mark them as unanswerable.", "paper_link": "https://arxiv.org/abs/1901.08634", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.178778+00:00", "updated_at": "2025-07-19T19:56:13.178778+00:00" } ================================================ FILE: data/benchmarks/natural2code.json ================================================ { "benchmark_id": "natural2code", "name": "Natural2Code", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "NaturalCodeBench (NCB) is a challenging code benchmark designed to mirror the complexity and variety of real-world coding tasks. 
It comprises 402 high-quality problems in Python and Java, selected from natural user queries from online coding services, covering 6 different domains.", "paper_link": "https://arxiv.org/abs/2405.04520", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.518784+00:00", "updated_at": "2025-07-19T19:56:13.518784+00:00" } ================================================ FILE: data/benchmarks/nexus.json ================================================ { "benchmark_id": "nexus", "name": "Nexus", "parent_benchmark_id": null, "categories": ["general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "NexusRaven benchmark for evaluating function calling capabilities of large language models in zero-shot scenarios across cybersecurity tools and API interactions", "paper_link": "https://openreview.net/pdf?id=5lcPe6DqfI", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.391550+00:00", "updated_at": "2025-07-19T19:56:14.391550+00:00" } ================================================ FILE: data/benchmarks/nih-multi-needle.json ================================================ { "benchmark_id": "nih-multi-needle", "name": "NIH/Multi-needle", "parent_benchmark_id": null, "categories": ["long_context"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Multi-needle in a haystack benchmark for evaluating long-context comprehension capabilities of language models by testing retrieval of multiple target pieces of information from extended documents", "paper_link": "https://arxiv.org/abs/2406.11230", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.465778+00:00", "updated_at": "2025-07-19T19:56:14.465778+00:00" } ================================================ FILE: data/benchmarks/nmos.json ================================================ { "benchmark_id": "nmos", "name": "NMOS", 
"parent_benchmark_id": null, "categories": ["general"], "modality": "text", "multilingual": false, "max_score": 100.0, "language": "en", "description": "NMOS evaluation benchmark for assessing model performance on specialized tasks", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.895373+00:00", "updated_at": "2025-07-19T19:56:14.895373+00:00" } ================================================ FILE: data/benchmarks/nq.json ================================================ { "benchmark_id": "nq", "name": "NQ", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Natural Questions (NQ) benchmark containing real user questions issued to Google search with answers found from Wikipedia, designed for training and evaluation of automatic question answering systems", "paper_link": "https://aclanthology.org/Q19-1026/", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.088246+00:00", "updated_at": "2025-07-19T19:56:15.088246+00:00" } ================================================ FILE: data/benchmarks/ocrbench-v2-(en).json ================================================ { "benchmark_id": "ocrbench-v2-(en)", "name": "OCRBench-V2 (en)", "parent_benchmark_id": null, "categories": ["vision", "image-to-text"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "OCRBench v2 English subset: Enhanced benchmark for evaluating Large Multimodal Models on visual text localization and reasoning with English text content", "paper_link": "https://arxiv.org/abs/2501.00321", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.926330+00:00", "updated_at": "2025-07-19T19:56:14.926330+00:00" } ================================================ FILE: data/benchmarks/ocrbench-v2-(zh).json 
================================================ { "benchmark_id": "ocrbench-v2-(zh)", "name": "OCRBench-V2 (zh)", "parent_benchmark_id": null, "categories": ["vision", "image-to-text"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "zh", "description": "OCRBench v2 Chinese subset: Enhanced benchmark for evaluating Large Multimodal Models on visual text localization and reasoning with Chinese text content", "paper_link": "https://arxiv.org/abs/2501.00321", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.944963+00:00", "updated_at": "2025-07-19T19:56:14.944963+00:00" } ================================================ FILE: data/benchmarks/ocrbench-v2.json ================================================ { "benchmark_id": "ocrbench-v2", "name": "OCRBench_V2", "parent_benchmark_id": null, "categories": ["vision", "image-to-text"], "modality": "multimodal", "multilingual": true, "max_score": 1.0, "language": "en", "description": "OCRBench v2: Enhanced large-scale bilingual benchmark for evaluating Large Multimodal Models on visual text localization and reasoning with 10,000 human-verified question-answering pairs across 8 core OCR capabilities", "paper_link": "https://arxiv.org/abs/2501.00321", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.898625+00:00", "updated_at": "2025-07-19T19:56:14.898625+00:00" } ================================================ FILE: data/benchmarks/ocrbench.json ================================================ { "benchmark_id": "ocrbench", "name": "OCRBench", "parent_benchmark_id": null, "categories": ["vision", "image-to-text"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "OCRBench: Comprehensive evaluation benchmark for assessing Optical Character Recognition (OCR) capabilities in Large Multimodal Models across text recognition, scene text VQA, and document understanding tasks", 
"paper_link": "https://arxiv.org/abs/2305.07895", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.304601+00:00", "updated_at": "2025-07-19T19:56:14.304601+00:00" } ================================================ FILE: data/benchmarks/odinw.json ================================================ { "benchmark_id": "odinw", "name": "ODinW", "parent_benchmark_id": null, "categories": ["vision"], "modality": "image", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Object Detection in the Wild (ODinW) benchmark for evaluating object detection models' task-level transfer ability across diverse real-world datasets in terms of prediction accuracy and adaptation efficiency", "paper_link": "https://arxiv.org/abs/2112.03857", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.902703+00:00", "updated_at": "2025-07-19T19:56:14.902703+00:00" } ================================================ FILE: data/benchmarks/ojbench.json ================================================ { "benchmark_id": "ojbench", "name": "OJBench", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "OJBench is a competition-level code benchmark designed to assess the competitive-level code reasoning abilities of large language models. It comprises 232 programming competition problems from NOI and ICPC, categorized into Easy, Medium, and Hard difficulty levels. 
The benchmark evaluates models' ability to solve complex competitive programming challenges using Python and C++.", "paper_link": "https://arxiv.org/abs/2506.16395", "implementation_link": null, "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-05T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/olympiadbench.json ================================================ { "benchmark_id": "olympiadbench", "name": "OlympiadBench", "parent_benchmark_id": null, "categories": ["math", "reasoning", "physics", "multimodal"], "modality": "multimodal", "multilingual": true, "max_score": 1.0, "language": "en", "description": "A challenging benchmark for promoting AGI with Olympiad-level bilingual multimodal scientific problems. Comprises 8,476 math and physics problems from international and Chinese Olympiads and the Chinese college entrance exam, featuring expert-level annotations for step-by-step reasoning. Includes both text-only and multimodal problems in English and Chinese.", "paper_link": "https://arxiv.org/abs/2402.14008", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.821916+00:00", "updated_at": "2025-07-19T19:56:14.821916+00:00" } ================================================ FILE: data/benchmarks/omnibench-music.json ================================================ { "benchmark_id": "omnibench-music", "name": "OmniBench Music", "parent_benchmark_id": null, "categories": ["multimodal", "audio"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Music component of OmniBench, a comprehensive benchmark for evaluating omni-language models' ability to recognize, interpret, and reason across visual, acoustic, and textual inputs simultaneously. 
The music category includes various compositions and performances that require integrated understanding across text, image, and audio modalities.", "paper_link": "https://arxiv.org/abs/2409.15272", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.911093+00:00", "updated_at": "2025-07-19T19:56:14.911093+00:00" } ================================================ FILE: data/benchmarks/omnibench.json ================================================ { "benchmark_id": "omnibench", "name": "OmniBench", "parent_benchmark_id": null, "categories": ["multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A novel multimodal benchmark designed to evaluate large language models' ability to recognize, interpret, and reason across visual, acoustic, and textual inputs simultaneously. Comprises 1,142 question-answer pairs covering 8 task categories from basic perception to complex inference, with a unique constraint that accurate responses require integrated understanding of all three modalities.", "paper_link": "https://arxiv.org/abs/2409.15272", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.906402+00:00", "updated_at": "2025-07-19T19:56:14.906402+00:00" } ================================================ FILE: data/benchmarks/omnimath.json ================================================ { "benchmark_id": "omnimath", "name": "OmniMath", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A Universal Olympiad Level Mathematic Benchmark for Large Language Models containing 4,428 competition-level problems with rigorous human annotation, categorized into over 33 sub-domains and spanning more than 10 distinct difficulty levels", "paper_link": "https://arxiv.org/abs/2410.07985", "implementation_link": null, "verified": false, "created_at": 
"2025-07-19T19:56:14.271468+00:00", "updated_at": "2025-07-19T19:56:14.271468+00:00" } ================================================ FILE: data/benchmarks/open-rewrite.json ================================================ { "benchmark_id": "open-rewrite", "name": "Open-rewrite", "parent_benchmark_id": null, "categories": ["language", "writing"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "OpenRewriteEval is a benchmark for evaluating open-ended rewriting of long-form texts, covering a wide variety of rewriting types expressed through natural language instructions including formality, expansion, conciseness, paraphrasing, and tone and style transfer.", "paper_link": "https://arxiv.org/abs/2305.15685", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.435616+00:00", "updated_at": "2025-07-19T19:56:14.435616+00:00" } ================================================ FILE: data/benchmarks/openai-mmlu.json ================================================ { "benchmark_id": "openai-mmlu", "name": "OpenAI MMLU", "parent_benchmark_id": null, "categories": ["general", "reasoning", "math", "legal", "healthcare", "finance", "physics", "chemistry", "economics", "psychology"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MMLU (Massive Multitask Language Understanding) is a comprehensive benchmark that measures a text model's multitask accuracy across 57 diverse academic and professional subjects. The test covers elementary mathematics, US history, computer science, law, morality, business ethics, clinical knowledge, and many other domains spanning STEM, humanities, social sciences, and professional fields. 
To attain high accuracy, models must possess extensive world knowledge and problem-solving ability.", "paper_link": "https://arxiv.org/abs/2009.03300", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.043675+00:00", "updated_at": "2025-07-19T19:56:14.043675+00:00" } ================================================ FILE: data/benchmarks/openai-mrcr%3A-2-needle-128k.json ================================================ { "benchmark_id": "openai-mrcr:-2-needle-128k", "name": "OpenAI-MRCR: 2 needle 128k", "parent_benchmark_id": null, "categories": ["long_context", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Multi-round Co-reference Resolution (MRCR) benchmark for evaluating an LLM's ability to distinguish between multiple needles hidden in long context. Models are given a long, multi-turn synthetic conversation and must retrieve a specific instance of a repeated request, requiring reasoning and disambiguation skills beyond simple retrieval.", "paper_link": "https://arxiv.org/abs/2403.05530", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.266878+00:00", "updated_at": "2025-07-19T19:56:15.266878+00:00" } ================================================ FILE: data/benchmarks/openai-mrcr%3A-2-needle-1m.json ================================================ { "benchmark_id": "openai-mrcr:-2-needle-1m", "name": "OpenAI-MRCR: 2 needle 1M", "parent_benchmark_id": null, "categories": ["long_context", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Multi-Round Co-reference Resolution benchmark that tests an LLM's ability to distinguish between multiple similar needles hidden in long conversations. 
Models must reproduce specific instances of content (e.g., 'Return the 2nd poem about tapirs') from multi-turn synthetic conversations, requiring reasoning about context, ordering, and subtle differences between similar outputs.", "paper_link": "https://arxiv.org/abs/2409.12640", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.280285+00:00", "updated_at": "2025-07-19T19:56:15.280285+00:00" } ================================================ FILE: data/benchmarks/openai-mrcr%3A-2-needle-256k.json ================================================ { "benchmark_id": "openai-mrcr:-2-needle-256k", "name": "OpenAI-MRCR: 2 needle 256k", "parent_benchmark_id": null, "categories": ["long_context", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Multi-Round Co-reference Resolution (MRCR) benchmark that tests long-context reasoning by evaluating a model's ability to distinguish between similar outputs, reason about ordering, and reproduce specific content from multi-turn conversations containing multiple writing requests on overlapping topics at 256k tokens.", "paper_link": "https://arxiv.org/abs/2409.12640", "implementation_link": null, "verified": false, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/openbookqa.json ================================================ { "benchmark_id": "openbookqa", "name": "OpenBookQA", "parent_benchmark_id": null, "categories": ["reasoning", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "OpenBookQA is a question-answering dataset modeled after open book exams for assessing human understanding. 
It contains 5,957 multiple-choice elementary-level science questions that probe understanding of 1,326 core science facts and their application to novel situations, requiring combination of open book facts with broad common knowledge through multi-hop reasoning.", "paper_link": "https://arxiv.org/abs/1809.02789", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.129348+00:00", "updated_at": "2025-07-19T19:56:14.129348+00:00" } ================================================ FILE: data/benchmarks/osworld-extended.json ================================================ { "benchmark_id": "osworld-extended", "name": "OSWorld Extended", "parent_benchmark_id": null, "categories": ["general", "reasoning", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "OSWorld is a scalable, real computer environment benchmark for evaluating multimodal agents on open-ended tasks across Ubuntu, Windows, and macOS. It comprises 369 computer tasks involving real web and desktop applications, OS file I/O, and multi-application workflows. 
The benchmark evaluates agents' ability to interact with computer interfaces using screenshots and actions in realistic computing environments.", "paper_link": "https://arxiv.org/abs/2404.07972", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.113488+00:00", "updated_at": "2025-07-19T19:56:15.113488+00:00" } ================================================ FILE: data/benchmarks/osworld-screenshot-only.json ================================================ { "benchmark_id": "osworld-screenshot-only", "name": "OSWorld Screenshot-only", "parent_benchmark_id": null, "categories": ["multimodal", "vision", "general"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "OSWorld Screenshot-only: A variant of the OSWorld benchmark that evaluates multimodal AI agents using only screenshot observations to complete open-ended computer tasks across real operating systems (Ubuntu, Windows, macOS). Tests agents' ability to perform complex workflows involving web apps, desktop applications, file I/O, and multi-application tasks through visual interface understanding and GUI grounding.", "paper_link": "https://arxiv.org/abs/2404.07972", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.109647+00:00", "updated_at": "2025-07-19T19:56:15.109647+00:00" } ================================================ FILE: data/benchmarks/osworld.json ================================================ { "benchmark_id": "osworld", "name": "OSWorld", "parent_benchmark_id": null, "categories": ["multimodal", "general", "vision"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "OSWorld: The first-of-its-kind scalable, real computer environment for multimodal agents, supporting task setup, execution-based evaluation, and interactive learning across Ubuntu, Windows, and macOS with 369 computer tasks involving real web and desktop 
applications, OS file I/O, and multi-application workflows", "paper_link": "https://arxiv.org/abs/2404.07972", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.935426+00:00", "updated_at": "2025-07-19T19:56:14.935426+00:00" } ================================================ FILE: data/benchmarks/pathmcqa.json ================================================ { "benchmark_id": "pathmcqa", "name": "PathMCQA", "parent_benchmark_id": null, "categories": ["healthcare", "vision", "multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "PathMMU is a massive multimodal expert-level benchmark for understanding and reasoning in pathology, containing 33,428 multimodal multi-choice questions and 24,067 images validated by seven pathologists. It evaluates Large Multimodal Models (LMMs) performance on pathology tasks, with the top-performing model GPT-4V achieving only 49.8% zero-shot performance compared to 71.8% for human pathologists.", "paper_link": "https://arxiv.org/abs/2401.16355", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.036453+00:00", "updated_at": "2025-07-19T19:56:14.036453+00:00" } ================================================ FILE: data/benchmarks/perceptiontest.json ================================================ { "benchmark_id": "perceptiontest", "name": "PerceptionTest", "parent_benchmark_id": null, "categories": ["video", "multimodal", "reasoning", "physics", "spatial_reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A novel multimodal video benchmark designed to evaluate perception and reasoning skills of pre-trained models across video, audio, and text modalities. Contains 11.6k real-world videos (average 23 seconds) filmed by participants worldwide, densely annotated with six types of labels. 
Focuses on skills (Memory, Abstraction, Physics, Semantics) and reasoning types (descriptive, explanatory, predictive, counterfactual). Shows significant performance gap between human baseline (91.4%) and state-of-the-art video QA models (46.2%).", "paper_link": "https://arxiv.org/abs/2305.13786", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.708910+00:00", "updated_at": "2025-07-19T19:56:14.708910+00:00" } ================================================ FILE: data/benchmarks/phibench.json ================================================ { "benchmark_id": "phibench", "name": "PhiBench", "parent_benchmark_id": null, "categories": ["reasoning", "math", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "PhiBench is an internal benchmark designed to evaluate diverse skills and reasoning abilities of language models, covering a wide range of tasks including coding (debugging, extending incomplete code, explaining code snippets) and mathematics (identifying proof errors, generating related problems). 
Created by Microsoft's research team to address limitations of standard academic benchmarks and guide the development of the Phi-4 model.", "paper_link": "https://arxiv.org/abs/2412.08905", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.121593+00:00", "updated_at": "2025-07-19T19:56:14.121593+00:00" } ================================================ FILE: data/benchmarks/physicsfinals.json ================================================ { "benchmark_id": "physicsfinals", "name": "PhysicsFinals", "parent_benchmark_id": null, "categories": ["physics", "math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "PHYSICS is a comprehensive benchmark for university-level physics problem solving, containing 1,297 expert-annotated problems covering six core areas: classical mechanics, quantum mechanics, thermodynamics and statistical mechanics, electromagnetism, atomic physics, and optics. Each problem requires advanced physics knowledge and mathematical reasoning. Even advanced models like o3-mini achieve only 59.9% accuracy.", "paper_link": "https://arxiv.org/abs/2503.21821", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.981919+00:00", "updated_at": "2025-07-19T19:56:13.981919+00:00" } ================================================ FILE: data/benchmarks/piqa.json ================================================ { "benchmark_id": "piqa", "name": "PIQA", "parent_benchmark_id": null, "categories": ["reasoning", "physics", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "PIQA (Physical Interaction: Question Answering) is a benchmark dataset for physical commonsense reasoning in natural language. 
It tests AI systems' ability to answer questions requiring physical world knowledge through multiple choice questions with everyday situations, focusing on atypical solutions inspired by instructables.com. The dataset contains 21,000 multiple choice questions where models must choose the most appropriate solution for physical interactions.", "paper_link": "https://arxiv.org/abs/1911.11641", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.133817+00:00", "updated_at": "2025-07-19T19:56:13.133817+00:00" } ================================================ FILE: data/benchmarks/pointgrounding.json ================================================ { "benchmark_id": "pointgrounding", "name": "PointGrounding", "parent_benchmark_id": null, "categories": ["vision", "spatial_reasoning", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "PointArena is a comprehensive platform for evaluating multimodal pointing across diverse reasoning scenarios. It includes Point-Bench, a curated dataset of ~1,000 pointing tasks across five categories: Spatial (positional references), Affordance (functional part identification), Counting (attribute-based grouping), Steerable (relative pointing), and Reasoning (open-ended visual inference). 
The benchmark evaluates language-guided pointing capabilities in vision-language models.", "paper_link": "https://arxiv.org/abs/2505.09990", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.914897+00:00", "updated_at": "2025-07-19T19:56:14.914897+00:00" } ================================================ FILE: data/benchmarks/polymath-en.json ================================================ { "benchmark_id": "polymath-en", "name": "PolyMath-en", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "PolyMath is a multilingual mathematical reasoning benchmark covering 18 languages and 4 difficulty levels from easy to hard, ensuring difficulty comprehensiveness, language diversity, and high-quality translation. The benchmark evaluates mathematical reasoning capabilities of large language models across diverse linguistic contexts, making it a highly discriminative multilingual mathematical benchmark.", "paper_link": "https://arxiv.org/abs/2504.18428", "implementation_link": null, "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-05T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/polymath.json ================================================ { "benchmark_id": "polymath", "name": "PolyMATH", "parent_benchmark_id": null, "categories": ["math", "reasoning", "spatial_reasoning", "multimodal", "vision"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Polymath is a challenging multi-modal mathematical reasoning benchmark designed to evaluate the general cognitive reasoning abilities of Multi-modal Large Language Models (MLLMs). 
The benchmark comprises 5,000 manually collected high-quality images of cognitive textual and visual challenges across 10 distinct categories, including pattern recognition, spatial reasoning, and relative reasoning.", "paper_link": "https://arxiv.org/abs/2410.14702", "implementation_link": null, "verified": false, "created_at": "2025-08-03T22:06:11.108063+00:00", "updated_at": "2025-08-03T22:06:11.108063+00:00" } ================================================ FILE: data/benchmarks/pope.json ================================================ { "benchmark_id": "pope", "name": "POPE", "parent_benchmark_id": null, "categories": ["vision", "safety", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Polling-based Object Probing Evaluation (POPE) is a benchmark for evaluating object hallucination in Large Vision-Language Models (LVLMs). POPE addresses the problem where LVLMs generate objects inconsistent with target images by using a polling-based query method that asks yes/no questions about object presence in images, providing more stable and flexible evaluation of object hallucination.", "paper_link": "https://arxiv.org/abs/2305.10355", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.264312+00:00", "updated_at": "2025-07-19T19:56:14.264312+00:00" } ================================================ FILE: data/benchmarks/popqa.json ================================================ { "benchmark_id": "popqa", "name": "PopQA", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "PopQA is an entity-centric open-domain question-answering dataset consisting of 14,000 QA pairs designed to evaluate language models' ability to memorize and recall factual knowledge across entities with varying popularity levels. 
The dataset probes both parametric memory (stored in model parameters) and non-parametric memory effectiveness, with questions covering 16 diverse relationship types from Wikidata converted to natural language using templates. Created by sampling knowledge triples from Wikidata and converting them to natural language questions, focusing on long-tail entities to understand LMs' strengths and limitations in memorizing factual knowledge.", "paper_link": "https://arxiv.org/abs/2212.10511", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.072897+00:00", "updated_at": "2025-07-19T19:56:15.072897+00:00" } ================================================ FILE: data/benchmarks/qasper.json ================================================ { "benchmark_id": "qasper", "name": "Qasper", "parent_benchmark_id": null, "categories": ["reasoning", "long_context"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "QASPER is a dataset of 5,049 information-seeking questions and answers anchored in 1,585 NLP research papers. Questions are written by NLP practitioners who read only titles and abstracts, while answers require understanding the full paper text and provide supporting evidence. The dataset challenges models with complex reasoning across document sections for academic document question answering. 
Each question seeks information present in the full text and is answered by a separate set of NLP practitioners who also provide supporting evidence to answers.", "paper_link": "https://arxiv.org/abs/2105.03011", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.166932+00:00", "updated_at": "2025-07-19T19:56:14.166932+00:00" } ================================================ FILE: data/benchmarks/qmsum.json ================================================ { "benchmark_id": "qmsum", "name": "QMSum", "parent_benchmark_id": null, "categories": ["summarization", "long_context"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "QMSum is a benchmark for query-based multi-domain meeting summarization consisting of 1,808 query-summary pairs over 232 meetings across academic, product, and committee domains. The dataset enables models to select and summarize relevant spans of meetings in response to specific queries. Published at NAACL 2021, QMSum presents significant challenges in long meeting summarization where models must identify and summarize relevant content based on user queries.", "paper_link": "https://arxiv.org/abs/2104.05938", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.223595+00:00", "updated_at": "2025-07-19T19:56:14.223595+00:00" } ================================================ FILE: data/benchmarks/realworldqa.json ================================================ { "benchmark_id": "realworldqa", "name": "RealWorldQA", "parent_benchmark_id": null, "categories": ["vision", "spatial_reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "RealWorldQA is a benchmark designed to evaluate basic real-world spatial understanding capabilities of multimodal models. 
The initial release consists of over 700 anonymized images taken from vehicles and other real-world scenarios, each accompanied by a question and easily verifiable answer. It was released by xAI as part of its Grok-1.5 Vision preview to test models' ability to understand natural scenes and spatial relationships in everyday visual contexts.", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.595271+00:00", "updated_at": "2025-07-19T19:56:14.595271+00:00" } ================================================ FILE: data/benchmarks/repobench.json ================================================ { "benchmark_id": "repobench", "name": "RepoBench", "parent_benchmark_id": null, "categories": ["reasoning", "code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "RepoBench is a benchmark for evaluating repository-level code auto-completion systems through three interconnected tasks: RepoBench-R (retrieval of relevant code snippets across files), RepoBench-C (code completion with cross-file and in-file context), and RepoBench-P (pipeline combining retrieval and prediction). 
It supports Python and Java and addresses the gap in evaluating real-world, multi-file programming scenarios by providing a more complete comparison of performance in auto-completion systems.", "paper_link": "https://arxiv.org/abs/2306.03091", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.152588+00:00", "updated_at": "2025-07-19T19:56:15.152588+00:00" } ================================================ FILE: data/benchmarks/repoqa.json ================================================ { "benchmark_id": "repoqa", "name": "RepoQA", "parent_benchmark_id": null, "categories": ["long_context", "reasoning", "code"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "RepoQA is a benchmark for evaluating long-context code understanding capabilities of Large Language Models through the Searching Needle Function (SNF) task, where LLMs must locate specific functions in code repositories using natural language descriptions. The benchmark contains 500 code search tasks spanning 50 repositories across 5 modern programming languages (Python, Java, TypeScript, C++, and Rust), tested on 26 general and code-specific LLMs to assess their ability to comprehend and navigate code repositories.", "paper_link": "https://arxiv.org/abs/2406.06025", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.180278+00:00", "updated_at": "2025-07-19T19:56:14.180278+00:00" } ================================================ FILE: data/benchmarks/ruler.json ================================================ { "benchmark_id": "ruler", "name": "RULER", "parent_benchmark_id": null, "categories": ["long_context", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "RULER (What's the Real Context Size of Your Long-Context Language Models?) 
is a synthetic benchmark designed to comprehensively evaluate the long-context capabilities of language models. It expands on needle-in-a-haystack (NIAH) testing by introducing new task categories including multi-hop tracing and aggregation tasks. The benchmark provides flexible configurations for customized sequence length and task complexity, evaluating 17 long-context language models across 13 representative tasks to reveal that despite models claiming 32K+ token context sizes, only half maintain satisfactory performance at 32K length.", "paper_link": "https://arxiv.org/abs/2404.06654", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.175181+00:00", "updated_at": "2025-07-19T19:56:14.175181+00:00" } ================================================ FILE: data/benchmarks/sat-math.json ================================================ { "benchmark_id": "sat-math", "name": "SAT Math", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "SAT Math benchmark from AGIEval containing standardized mathematics questions from the College Board SAT examination, designed to evaluate mathematical reasoning capabilities of foundation models using human-centric assessment methods.", "paper_link": "https://arxiv.org/abs/2304.06364", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.414463+00:00", "updated_at": "2025-07-19T19:56:15.414463+00:00" } ================================================ FILE: data/benchmarks/scale-multichallenge.json ================================================ { "benchmark_id": "scale-multichallenge", "name": "Scale MultiChallenge", "parent_benchmark_id": null, "categories": ["reasoning", "communication", "general"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "MultiChallenge is a realistic multi-turn conversation evaluation 
benchmark developed by Scale AI that evaluates large language models on four challenging conversation categories: instruction retention, inference memory of user information, reliable versioned editing, and self-coherence. Each challenge requires accurate instruction-following, context allocation, and in-context reasoning. Despite achieving near-perfect scores on existing multi-turn evaluation benchmarks, all frontier models have less than 50% accuracy on MultiChallenge.", "paper_link": "https://arxiv.org/abs/2501.17399", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.205789+00:00", "updated_at": "2025-07-19T19:56:15.205789+00:00" } ================================================ FILE: data/benchmarks/scicode.json ================================================ { "benchmark_id": "scicode", "name": "SciCode", "parent_benchmark_id": null, "categories": ["reasoning", "math", "physics", "chemistry", "code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "SciCode is a research coding benchmark curated by scientists that challenges language models to code solutions for scientific problems. It contains 338 subproblems decomposed from 80 challenging main problems across 16 natural science sub-fields including mathematics, physics, chemistry, biology, and materials science. 
Problems require knowledge recall, reasoning, and code synthesis skills.", "paper_link": "https://arxiv.org/abs/2407.13168", "implementation_link": null, "verified": false, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/scienceqa-visual.json ================================================ { "benchmark_id": "scienceqa-visual", "name": "ScienceQA Visual", "parent_benchmark_id": null, "categories": ["vision", "reasoning", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "ScienceQA Visual is a multimodal science question answering benchmark consisting of 21,208 multiple-choice questions from elementary and high school science curricula. The dataset covers 3 subjects (natural science, language science, social science), 26 topics, 127 categories, and 379 skills. 48.7% of questions include image context requiring multimodal reasoning. Questions are annotated with lectures (83.9%) and explanations (90.5%) to support chain-of-thought reasoning for science question answering.", "paper_link": "https://arxiv.org/abs/2209.09513", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.300722+00:00", "updated_at": "2025-07-19T19:56:14.300722+00:00" } ================================================ FILE: data/benchmarks/scienceqa.json ================================================ { "benchmark_id": "scienceqa", "name": "ScienceQA", "parent_benchmark_id": null, "categories": ["reasoning", "math", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "ScienceQA is the first large-scale multimodal science question answering benchmark with 21,208 multiple-choice questions covering 3 subjects (natural science, language science, social science), 26 topics, 127 categories, and 379 skills. 
The benchmark includes both text and image modalities, featuring detailed explanations and Chain-of-Thought reasoning to diagnose multi-hop reasoning ability.", "paper_link": "https://arxiv.org/abs/2209.09513", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.255251+00:00", "updated_at": "2025-07-19T19:56:14.255251+00:00" } ================================================ FILE: data/benchmarks/screenspot-pro.json ================================================ { "benchmark_id": "screenspot-pro", "name": "ScreenSpot Pro", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "spatial_reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "ScreenSpot-Pro is a novel GUI grounding benchmark designed to rigorously evaluate the grounding capabilities of multimodal large language models (MLLMs) in professional high-resolution computing environments. The benchmark comprises 1,581 instructions across 23 applications spanning 5 industries and 3 operating systems, featuring authentic high-resolution images from professional domains with expert annotations. 
Unlike previous benchmarks that focus on cropped screenshots in consumer applications, ScreenSpot-Pro addresses the complexity and diversity of real-world professional software scenarios, revealing significant performance gaps in current MLLM GUI perception capabilities.", "paper_link": "https://arxiv.org/abs/2504.07981", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.776671+00:00", "updated_at": "2025-07-19T19:56:14.776671+00:00" } ================================================ FILE: data/benchmarks/screenspot.json ================================================ { "benchmark_id": "screenspot", "name": "ScreenSpot", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "spatial_reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "ScreenSpot is the first realistic GUI grounding benchmark that encompasses mobile, desktop, and web environments. The dataset comprises over 1,200 instructions from iOS, Android, macOS, Windows and Web environments, along with annotated element types (text and icon/widget), designed to evaluate visual GUI agents' ability to accurately locate screen elements based on natural language instructions.", "paper_link": "https://arxiv.org/abs/2401.10935", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.766976+00:00", "updated_at": "2025-07-19T19:56:14.766976+00:00" } ================================================ FILE: data/benchmarks/simpleqa.json ================================================ { "benchmark_id": "simpleqa", "name": "SimpleQA", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "SimpleQA is a factuality benchmark developed by OpenAI that measures the short-form factual accuracy of large language models. 
The benchmark contains 4,326 short, fact-seeking questions that are adversarially collected and designed to have single, indisputable answers. Questions cover diverse topics from science and technology to entertainment, and the benchmark also measures model calibration by evaluating whether models know what they know.", "paper_link": "https://arxiv.org/abs/2411.04368", "implementation_link": null, "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-05T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/slakevqa.json ================================================ { "benchmark_id": "slakevqa", "name": "SlakeVQA", "parent_benchmark_id": null, "categories": ["vision", "healthcare", "multimodal", "reasoning"], "modality": "multimodal", "multilingual": true, "max_score": 1.0, "language": "en", "description": "A semantically-labeled knowledge-enhanced dataset for medical visual question answering. Contains 642 radiology images (CT scans, MRI scans, X-rays) covering five body parts and 14,028 bilingual English-Chinese question-answer pairs annotated by experienced physicians. Features comprehensive semantic labels and a structural medical knowledge base with both vision-only and knowledge-based questions requiring external medical knowledge reasoning.", "paper_link": "https://arxiv.org/abs/2102.09542", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.027646+00:00", "updated_at": "2025-07-19T19:56:14.027646+00:00" } ================================================ FILE: data/benchmarks/social-iqa.json ================================================ { "benchmark_id": "social-iqa", "name": "Social IQa", "parent_benchmark_id": null, "categories": ["reasoning", "psychology"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "The first large-scale benchmark for commonsense reasoning about social situations. 
Contains 38,000 multiple choice questions probing emotional and social intelligence in everyday situations, testing commonsense understanding of social interactions and theory of mind reasoning about the implied emotions and behavior of others.", "paper_link": "https://arxiv.org/abs/1904.09728", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.155825+00:00", "updated_at": "2025-07-19T19:56:13.155825+00:00" } ================================================ FILE: data/benchmarks/spider.json ================================================ { "benchmark_id": "spider", "name": "Spider", "parent_benchmark_id": null, "categories": ["language", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A large-scale, complex and cross-domain semantic parsing and text-to-SQL dataset annotated by 11 college students. Contains 10,181 questions and 5,693 unique complex SQL queries on 200 databases with multiple tables, covering 138 different domains. Requires models to generalize to both new SQL queries and new database schemas, making it distinct from previous semantic parsing tasks that use single databases.", "paper_link": "https://arxiv.org/abs/1809.08887", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.156791+00:00", "updated_at": "2025-07-19T19:56:15.156791+00:00" } ================================================ FILE: data/benchmarks/squality.json ================================================ { "benchmark_id": "squality", "name": "SQuALITY", "parent_benchmark_id": null, "categories": ["summarization", "long_context", "language"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "SQuALITY (Summarization-format QUestion Answering with Long Input Texts, Yes!) 
is a long-document summarization dataset built by hiring highly-qualified contractors to read public-domain short stories (3000-6000 words) and write original summaries from scratch. Each document has five summaries: one overview and four question-focused summaries. Designed to address limitations in existing summarization datasets by providing high-quality, faithful summaries.", "paper_link": "https://arxiv.org/abs/2205.11465", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.712415+00:00", "updated_at": "2025-07-19T19:56:12.712415+00:00" } ================================================ FILE: data/benchmarks/stem.json ================================================ { "benchmark_id": "stem", "name": "STEM", "parent_benchmark_id": null, "categories": ["math", "reasoning", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A comprehensive multimodal benchmark dataset with 448 skills and 1,073,146 questions spanning all STEM subjects (Science, Technology, Engineering, Mathematics), designed to test neural models' vision-language STEM skills based on K-12 curriculum. 
Unlike existing datasets that focus on expert-level ability, this dataset includes fundamental skills designed around educational standards.", "paper_link": "https://arxiv.org/abs/2402.17205", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.559354+00:00", "updated_at": "2025-07-19T19:56:14.559354+00:00" } ================================================ FILE: data/benchmarks/summscreenfd.json ================================================ { "benchmark_id": "summscreenfd", "name": "SummScreenFD", "parent_benchmark_id": null, "categories": ["summarization", "long_context"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "SummScreenFD is the ForeverDreaming subset of the SummScreen dataset for abstractive screenplay summarization, comprising pairs of TV series transcripts and human-written recaps from 88 different shows. The dataset provides a challenging testbed for abstractive summarization where plot details are often expressed indirectly in character dialogues and scattered across the entirety of the transcript, requiring models to find and integrate these details to form succinct plot descriptions.", "paper_link": "https://arxiv.org/abs/2104.07091", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.229354+00:00", "updated_at": "2025-07-19T19:56:14.229354+00:00" } ================================================ FILE: data/benchmarks/superglue.json ================================================ { "benchmark_id": "superglue", "name": "SuperGLUE", "parent_benchmark_id": null, "categories": ["general", "language", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "SuperGLUE is a new benchmark styled after GLUE with a new set of more difficult language understanding tasks, improved resources, and a new public leaderboard. 
It includes 8 primary tasks: BoolQ (Boolean Questions), CB (CommitmentBank), COPA (Choice of Plausible Alternatives), MultiRC (Multi-Sentence Reading Comprehension), ReCoRD (Reading Comprehension with Commonsense Reasoning), RTE (Recognizing Textual Entailment), WiC (Word-in-Context), and WSC (Winograd Schema Challenge). The benchmark evaluates diverse language understanding capabilities including reading comprehension, commonsense reasoning, causal reasoning, coreference resolution, textual entailment, and word sense disambiguation across multiple domains.", "paper_link": "https://arxiv.org/abs/1905.00537", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.382590+00:00", "updated_at": "2025-07-19T19:56:15.382590+00:00" } ================================================ FILE: data/benchmarks/supergpqa.json ================================================ { "benchmark_id": "supergpqa", "name": "SuperGPQA", "parent_benchmark_id": null, "categories": ["reasoning", "general", "math", "legal", "healthcare", "finance", "chemistry", "economics", "physics"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "SuperGPQA is a comprehensive benchmark that evaluates large language models across 285 graduate-level academic disciplines. The benchmark contains 25,957 questions covering 13 broad disciplinary areas including Engineering, Medicine, Science, and Law, with specialized fields in light industry, agriculture, and service-oriented domains. 
It employs a Human-LLM collaborative filtering mechanism with over 80 expert annotators to create challenging questions that assess graduate-level knowledge and reasoning capabilities.", "paper_link": "https://arxiv.org/abs/2502.14739", "implementation_link": null, "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-05T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/swe-bench-multilingual.json ================================================ { "benchmark_id": "swe-bench-multilingual", "name": "SWE-bench Multilingual", "parent_benchmark_id": null, "categories": ["reasoning", "code"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "A multilingual benchmark for issue resolving in software engineering that covers Java, TypeScript, JavaScript, Go, Rust, C, and C++. Contains 1,632 high-quality instances carefully annotated from 2,456 candidates by 68 expert annotators, designed to evaluate Large Language Models across diverse software ecosystems beyond Python.", "paper_link": "https://arxiv.org/abs/2504.02605", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.340903+00:00", "updated_at": "2025-07-19T19:56:12.340903+00:00" } ================================================ FILE: data/benchmarks/swe-bench-verified-(agentic-coding).json ================================================ { "benchmark_id": "swe-bench-verified-(agentic-coding)", "name": "SWE-bench Verified (Agentic Coding)", "parent_benchmark_id": null, "categories": ["reasoning", "code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "SWE-bench Verified is a human-filtered subset of 500 software engineering problems drawn from real GitHub issues across 12 popular Python repositories. 
Given a codebase and an issue description, language models are tasked with generating patches that resolve the described problems. This benchmark evaluates AI's real-world agentic coding skills by requiring models to navigate complex codebases, understand software engineering problems, and coordinate changes across multiple functions, classes, and files to fix well-defined issues with clear descriptions.", "paper_link": "https://arxiv.org/abs/2310.06770", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.331440+00:00", "updated_at": "2025-07-19T19:56:12.331440+00:00" } ================================================ FILE: data/benchmarks/swe-bench-verified-(agentless).json ================================================ { "benchmark_id": "swe-bench-verified-(agentless)", "name": "SWE-bench Verified (Agentless)", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A human-validated subset of SWE-bench that evaluates language models' ability to resolve real-world GitHub issues using an agentless approach. 
The benchmark tests models on software engineering problems requiring understanding and coordinating changes across multiple functions, classes, and files simultaneously.", "paper_link": "https://arxiv.org/abs/2407.01489", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.328122+00:00", "updated_at": "2025-07-19T19:56:12.328122+00:00" } ================================================ FILE: data/benchmarks/swe-bench-verified-(multiple-attempts).json ================================================ { "benchmark_id": "swe-bench-verified-(multiple-attempts)", "name": "SWE-bench Verified (Multiple Attempts)", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "SWE-bench Verified is a human-validated subset of 500 test samples from the original SWE-bench dataset that evaluates AI systems' ability to automatically resolve real GitHub issues in Python repositories. Given a codebase and issue description, models must edit the code to successfully resolve the problem, requiring understanding and coordination of changes across multiple functions, classes, and files. 
The Verified version provides more reliable evaluation through manual validation of test samples.", "paper_link": "https://arxiv.org/abs/2310.06770", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.336780+00:00", "updated_at": "2025-07-19T19:56:12.336780+00:00" } ================================================ FILE: data/benchmarks/swe-bench-verified.json ================================================ { "benchmark_id": "swe-bench-verified", "name": "SWE-Bench Verified", "parent_benchmark_id": null, "categories": ["reasoning", "frontend_development", "code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A verified subset of 500 software engineering problems from real GitHub issues, validated by human annotators for evaluating language models' ability to resolve real-world coding issues by generating patches for Python codebases.", "paper_link": "https://arxiv.org/abs/2310.06770", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.812805+00:00", "updated_at": "2025-07-19T19:56:13.812805+00:00" } ================================================ FILE: data/benchmarks/swe-dev.json ================================================ { "benchmark_id": "swe-dev", "name": "SWE-Dev", "parent_benchmark_id": null, "categories": ["frontend_development"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "SWE-bench development split consisting of 225 software engineering problems drawn from real GitHub issues across 12 popular Python repositories. 
Language models are given a codebase along with a description of an issue to be resolved and must edit the codebase to address the issue, often requiring understanding and coordinating changes across multiple functions, classes, and files.", "paper_link": "https://arxiv.org/abs/2310.06770", "implementation_link": null, "verified": false, "created_at": "2025-09-15T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/swe-lancer-(ic-diamond-subset).json ================================================ { "benchmark_id": "swe-lancer-(ic-diamond-subset)", "name": "SWE-Lancer (IC-Diamond subset)", "parent_benchmark_id": null, "categories": ["reasoning", "code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "SWE-Lancer (IC-Diamond subset) is a benchmark of real-world freelance software engineering tasks from Upwork, ranging from $50 bug fixes to $32,000 feature implementations. It evaluates AI models on independent engineering tasks using end-to-end tests triple-verified by experienced software engineers, and includes managerial tasks where models choose between technical implementation proposals.", "paper_link": "https://arxiv.org/abs/2502.12115", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.359574+00:00", "updated_at": "2025-07-19T19:56:15.359574+00:00" } ================================================ FILE: data/benchmarks/swe-lancer.json ================================================ { "benchmark_id": "swe-lancer", "name": "SWE-Lancer", "parent_benchmark_id": null, "categories": ["reasoning", "code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A benchmark for evaluating large language models on real-world freelance software engineering tasks from Upwork. 
Contains over 1,400 tasks valued at $1 million USD total, ranging from $50 bug fixes to $32,000 feature implementations. Includes both independent engineering tasks graded via end-to-end tests and managerial tasks assessed against original engineering managers' choices.", "paper_link": "https://arxiv.org/abs/2502.12115", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.352660+00:00", "updated_at": "2025-07-19T19:56:15.352660+00:00" } ================================================ FILE: data/benchmarks/tau-bench-airline.json ================================================ { "benchmark_id": "tau-bench-airline", "name": "TAU-bench Airline", "parent_benchmark_id": null, "categories": ["reasoning", "communication"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Part of τ-bench (TAU-bench), a benchmark for Tool-Agent-User interaction in real-world domains. The airline domain evaluates language agents' ability to interact with users through dynamic conversations while following domain-specific rules and using API tools. Agents must handle airline-related tasks and policies reliably.", "paper_link": "https://arxiv.org/abs/2406.12045", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.993213+00:00", "updated_at": "2025-07-19T19:56:14.993213+00:00" } ================================================ FILE: data/benchmarks/tau-bench-retail.json ================================================ { "benchmark_id": "tau-bench-retail", "name": "TAU-bench Retail", "parent_benchmark_id": null, "categories": ["reasoning", "communication"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A benchmark for evaluating tool-agent-user interaction in retail environments. Tests language agents' ability to handle dynamic conversations with users while using domain-specific API tools and following policy guidelines. 
Evaluates agents on tasks like order cancellations, address changes, and order status checks through multi-turn conversations.", "paper_link": "https://arxiv.org/abs/2406.12045", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.965635+00:00", "updated_at": "2025-07-19T19:56:14.965635+00:00" } ================================================ FILE: data/benchmarks/tau-bench.json ================================================ { "benchmark_id": "tau-bench", "name": "Tau-bench", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "τ-bench: A benchmark for tool-agent-user interaction in real-world domains. Tests language agents' ability to interact with users and follow domain-specific rules through dynamic conversations using API tools and policy guidelines across retail and airline domains. Evaluates consistency and reliability of agent behavior over multiple trials.", "paper_link": "https://arxiv.org/abs/2406.12045", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.219001+00:00", "updated_at": "2025-07-19T19:56:15.219001+00:00" } ================================================ FILE: data/benchmarks/tau2-airline.json ================================================ { "benchmark_id": "tau2-airline", "name": "Tau2 Airline", "parent_benchmark_id": null, "categories": ["reasoning", "communication"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "TAU2 airline domain benchmark for evaluating conversational agents in dual-control environments where both AI agents and users interact with tools in airline customer service scenarios. 
Tests agent coordination, communication, and ability to guide user actions in tasks like flight booking, modifications, cancellations, and refunds.", "paper_link": "https://arxiv.org/abs/2506.07982", "implementation_link": null, "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-05T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/tau2-retail.json ================================================ { "benchmark_id": "tau2-retail", "name": "Tau2 Retail", "parent_benchmark_id": null, "categories": ["communication", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "τ²-bench retail domain evaluates conversational AI agents in customer service scenarios within a dual-control environment where both agent and user can interact with tools. Tests tool-agent-user interaction, rule adherence, and task consistency in retail customer support contexts.", "paper_link": "https://arxiv.org/abs/2506.07982", "implementation_link": null, "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-05T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/tau2-telecom.json ================================================ { "benchmark_id": "tau2-telecom", "name": "Tau2 Telecom", "parent_benchmark_id": null, "categories": ["communication", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "τ²-Bench telecom domain evaluates conversational agents in a dual-control environment modeled as a Dec-POMDP, where both agent and user use tools in shared telecommunications troubleshooting scenarios that test coordination and communication capabilities.", "paper_link": "https://arxiv.org/abs/2506.07982", "implementation_link": null, "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": 
"2025-09-05T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/tempcompass.json ================================================ { "benchmark_id": "tempcompass", "name": "TempCompass", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "TempCompass is a comprehensive benchmark for evaluating temporal perception capabilities of Video Large Language Models (Video LLMs). It constructs conflicting videos that share identical static content but differ in specific temporal aspects to prevent models from exploiting single-frame bias. The benchmark evaluates multiple temporal aspects including action, motion, speed, temporal order, and attribute changes across diverse task formats including multi-choice QA, yes/no QA, caption matching, and caption generation.", "paper_link": "https://arxiv.org/abs/2403.00476", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.748364+00:00", "updated_at": "2025-07-19T19:56:14.748364+00:00" } ================================================ FILE: data/benchmarks/terminal-bench.json ================================================ { "benchmark_id": "terminal-bench", "name": "Terminal-Bench", "parent_benchmark_id": null, "categories": ["reasoning", "code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Terminal-Bench is a benchmark for testing AI agents in real terminal environments. It evaluates how well agents can handle real-world, end-to-end tasks autonomously, including compiling code, training models, setting up servers, system administration, security tasks, data science workflows, and cybersecurity vulnerabilities. 
The benchmark consists of a dataset of ~100 hand-crafted, human-verified tasks and an execution harness that connects language models to a terminal sandbox.", "paper_link": null, "implementation_link": null, "verified": false, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00" } ================================================ FILE: data/benchmarks/terminus.json ================================================ { "benchmark_id": "terminus", "name": "Terminus", "parent_benchmark_id": null, "categories": ["reasoning", "code"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Terminal-Bench is a benchmark for testing AI agents in real terminal environments, evaluating how well agents can handle real-world, end-to-end tasks autonomously. The benchmark includes tasks spanning coding, system administration, security, data science, model training, file operations, version control, and web development. Terminus is the neutral test-bed agent designed to work with Terminal-Bench, operating purely through tmux sessions without dedicated tools.", "paper_link": "https://github.com/laude-institute/terminal-bench", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.355994+00:00", "updated_at": "2025-07-19T19:56:12.355994+00:00" } ================================================ FILE: data/benchmarks/textvqa.json ================================================ { "benchmark_id": "textvqa", "name": "TextVQA", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "image-to-text"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "TextVQA contains 45,336 questions on 28,408 images that require reasoning about text to answer. Introduced to benchmark VQA models' ability to read and reason about text within images, particularly for assistive technologies for visually impaired users. 
The dataset addresses the gap where existing VQA datasets had few text-based questions or were too small.", "paper_link": "https://arxiv.org/abs/1904.08920", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.875287+00:00", "updated_at": "2025-07-19T19:56:12.875287+00:00" } ================================================ FILE: data/benchmarks/theoremqa.json ================================================ { "benchmark_id": "theoremqa", "name": "TheoremQA", "parent_benchmark_id": null, "categories": ["math", "reasoning", "physics", "finance"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A theorem-driven question answering dataset containing 800 high-quality questions covering 350+ theorems from Math, Physics, EE&CS, and Finance. Designed to evaluate AI models' capabilities to apply theorems to solve challenging university-level science problems.", "paper_link": "https://arxiv.org/abs/2305.12524", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.479157+00:00", "updated_at": "2025-07-19T19:56:14.479157+00:00" } ================================================ FILE: data/benchmarks/tldr9+-(test).json ================================================ { "benchmark_id": "tldr9+-(test)", "name": "TLDR9+ (test)", "parent_benchmark_id": null, "categories": ["summarization", "language"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A large-scale summarization dataset containing over 9 million training instances extracted from Reddit, designed for extreme summarization (generating one-sentence summaries with high compression and abstraction). 
More than twice as large as previously proposed datasets.", "paper_link": "https://arxiv.org/abs/2110.01159", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.439927+00:00", "updated_at": "2025-07-19T19:56:14.439927+00:00" } ================================================ FILE: data/benchmarks/translation-en-to-set1-comet22.json ================================================ { "benchmark_id": "translation-en\u2192set1-comet22", "name": "Translation en\u2192Set1 COMET22", "parent_benchmark_id": null, "categories": ["language"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "COMET-22 is an ensemble machine translation evaluation metric combining a COMET estimator model trained with Direct Assessments and a multitask model that predicts sentence-level scores and word-level OK/BAD tags. It demonstrates improved correlations compared to state-of-the-art metrics and increased robustness to critical errors.", "paper_link": "https://aclanthology.org/2022.wmt-1.52/", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.959436+00:00", "updated_at": "2025-07-19T19:56:12.959436+00:00" } ================================================ FILE: data/benchmarks/translation-en-to-set1-spbleu.json ================================================ { "benchmark_id": "translation-en\u2192set1-spbleu", "name": "Translation en\u2192Set1 spBleu", "parent_benchmark_id": null, "categories": ["language"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "Translation evaluation using spBLEU (SentencePiece BLEU), a BLEU metric computed over text tokenized with a language-agnostic SentencePiece subword model.
Introduced in the FLORES-101 evaluation benchmark for low-resource and multilingual machine translation.", "paper_link": "https://arxiv.org/abs/2106.03193", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.936891+00:00", "updated_at": "2025-07-19T19:56:12.936891+00:00" } ================================================ FILE: data/benchmarks/translation-set1-to-en-comet22.json ================================================ { "benchmark_id": "translation-set1\u2192en-comet22", "name": "Translation Set1\u2192en COMET22", "parent_benchmark_id": null, "categories": ["language"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "COMET-22 is a neural machine translation evaluation metric that uses an ensemble of two models: a COMET estimator trained with Direct Assessments and a multitask model that predicts sentence-level scores and word-level OK/BAD tags. It provides improved correlations with human judgments and increased robustness to critical errors compared to previous metrics.", "paper_link": "https://aclanthology.org/2022.wmt-1.52/", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.974744+00:00", "updated_at": "2025-07-19T19:56:12.974744+00:00" } ================================================ FILE: data/benchmarks/translation-set1-to-en-spbleu.json ================================================ { "benchmark_id": "translation-set1\u2192en-spbleu", "name": "Translation Set1\u2192en spBleu", "parent_benchmark_id": null, "categories": ["language"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "spBLEU (SentencePiece BLEU) evaluation metric for machine translation quality assessment, using language-agnostic SentencePiece tokenization with BLEU scoring. 
Part of the FLORES-101 evaluation benchmark for low-resource and multilingual machine translation.", "paper_link": "https://arxiv.org/abs/2106.03193", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.967240+00:00", "updated_at": "2025-07-19T19:56:12.967240+00:00" } ================================================ FILE: data/benchmarks/triviaqa.json ================================================ { "benchmark_id": "triviaqa", "name": "TriviaQA", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A large-scale reading comprehension dataset containing over 650K question-answer-evidence triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts and independently gathered evidence documents (six per question on average) that provide high quality distant supervision for answering the questions. The dataset features relatively complex, compositional questions with considerable syntactic and lexical variability, requiring cross-sentence reasoning to find answers.", "paper_link": "https://arxiv.org/abs/1705.03551", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:11.563587+00:00", "updated_at": "2025-07-19T19:56:11.563587+00:00" } ================================================ FILE: data/benchmarks/truthfulqa.json ================================================ { "benchmark_id": "truthfulqa", "name": "TruthfulQA", "parent_benchmark_id": null, "categories": ["general", "reasoning", "legal", "healthcare", "finance"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "TruthfulQA is a benchmark to measure whether language models are truthful in generating answers to questions. It comprises 817 questions that span 38 categories, including health, law, finance and politics. 
The questions are crafted such that some humans would answer falsely due to a false belief or misconception, testing models' ability to avoid generating false answers learned from human texts.", "paper_link": "https://arxiv.org/abs/2109.07958", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:11.339268+00:00", "updated_at": "2025-07-19T19:56:11.339268+00:00" } ================================================ FILE: data/benchmarks/tydiqa.json ================================================ { "benchmark_id": "tydiqa", "name": "TydiQA", "parent_benchmark_id": null, "categories": ["language", "reasoning"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "A multilingual question answering benchmark covering 11 typologically diverse languages with 204K question-answer pairs. Questions are written by people seeking genuine information and data is collected directly in each language without translation to test model generalization across diverse linguistic structures.", "paper_link": "https://arxiv.org/abs/2003.05002", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.470500+00:00", "updated_at": "2025-07-19T19:56:14.470500+00:00" } ================================================ FILE: data/benchmarks/uniform-bar-exam.json ================================================ { "benchmark_id": "uniform-bar-exam", "name": "Uniform Bar Exam", "parent_benchmark_id": null, "categories": ["legal", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "The Uniform Bar Examination (UBE) benchmark evaluates language models on the complete bar exam including multiple-choice Multistate Bar Examination (MBE), open-ended Multistate Essay Exam (MEE), and Multistate Performance Test (MPT) components. 
Used to assess legal reasoning capabilities across seven subject areas including Evidence, Torts, Constitutional Law, Contracts, Criminal Law and Procedure, Real Property, and Civil Procedure.", "paper_link": "https://royalsocietypublishing.org/doi/10.1098/rsta.2023.0254", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.404860+00:00", "updated_at": "2025-07-19T19:56:15.404860+00:00" } ================================================ FILE: data/benchmarks/usamo25.json ================================================ { "benchmark_id": "usamo25", "name": "USAMO25", "parent_benchmark_id": null, "categories": ["math", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "The 2025 United States of America Mathematical Olympiad (USAMO) benchmark consists of six challenging mathematical problems requiring rigorous proof-based reasoning. USAMO is the most prestigious high school mathematics competition in the United States, serving as the final round of the American Mathematics Competitions series. This benchmark evaluates models on mathematical problem-solving capabilities beyond simple numerical computation, focusing on formal mathematical reasoning and proof generation.", "paper_link": "https://arxiv.org/abs/2503.21934", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.067604+00:00", "updated_at": "2025-07-19T19:56:15.067604+00:00" } ================================================ FILE: data/benchmarks/vatex.json ================================================ { "benchmark_id": "vatex", "name": "VATEX", "parent_benchmark_id": null, "categories": ["multimodal", "video", "language"], "modality": "multimodal", "multilingual": true, "max_score": 1.0, "language": "en", "description": "VaTeX: A Large-Scale, High-Quality Multilingual Dataset for Video-and-Language Research. 
Contains over 41,250 videos and 825,000 captions in both English and Chinese, with over 206,000 English-Chinese parallel translation pairs. Supports multilingual video captioning and video-guided machine translation tasks.", "paper_link": "https://arxiv.org/abs/1904.03493", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.909879+00:00", "updated_at": "2025-07-19T19:56:12.909879+00:00" } ================================================ FILE: data/benchmarks/vcr-en-easy.json ================================================ { "benchmark_id": "vcr-en-easy", "name": "VCR_en_easy", "parent_benchmark_id": null, "categories": ["vision", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Visual Commonsense Reasoning (VCR) benchmark that tests higher-order cognition and commonsense reasoning beyond simple object recognition. Models must answer challenging questions about images and provide rationales justifying their answers. The benchmark measures the ability to infer people's actions, goals, and mental states from visual context.", "paper_link": "https://arxiv.org/abs/1811.10830", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.592175+00:00", "updated_at": "2025-07-19T19:56:14.592175+00:00" } ================================================ FILE: data/benchmarks/vibe-eval.json ================================================ { "benchmark_id": "vibe-eval", "name": "Vibe-Eval", "parent_benchmark_id": null, "categories": ["multimodal", "vision", "general"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "VIBE-Eval is a hard evaluation suite for measuring progress of multimodal language models, consisting of 269 visual understanding prompts with gold-standard responses authored by experts. 
The benchmark has dual objectives: vibe checking multimodal chat models for day-to-day tasks and rigorously testing frontier models, with more than 50% of the hard-set questions answered incorrectly by all frontier models.", "paper_link": "https://arxiv.org/abs/2405.02287", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.871369+00:00", "updated_at": "2025-07-19T19:56:13.871369+00:00" } ================================================ FILE: data/benchmarks/video-mme-(long,-no-subtitles).json ================================================ { "benchmark_id": "video-mme-(long,-no-subtitles)", "name": "Video-MME (long, no subtitles)", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "video"], "modality": "multimodal", "multilingual": true, "max_score": 1.0, "language": "en", "description": "Video-MME is the first-ever comprehensive evaluation benchmark for Multi-modal Large Language Models (MLLMs) in video analysis. This variant focuses on long-term videos (30min-60min) without subtitle inputs, testing robust contextual dynamics across 6 primary visual domains with 30 subfields including knowledge, film & television, sports competition, life record, and multilingual content.", "paper_link": "https://arxiv.org/abs/2405.21075", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.374053+00:00", "updated_at": "2025-07-19T19:56:15.374053+00:00" } ================================================ FILE: data/benchmarks/video-mme.json ================================================ { "benchmark_id": "video-mme", "name": "Video-MME", "parent_benchmark_id": null, "categories": ["multimodal", "vision", "reasoning"], "modality": "multimodal", "multilingual": true, "max_score": 1.0, "language": "en", "description": "Video-MME is the first-ever comprehensive evaluation benchmark of Multi-modal Large Language Models (MLLMs) in video analysis.
It features 900 videos totaling 254 hours with 2,700 human-annotated question-answer pairs across 6 primary visual domains (Knowledge, Film & Television, Sports Competition, Life Record, Multilingual, and others) and 30 subfields. The benchmark evaluates models across diverse temporal dimensions (11 seconds to 1 hour), integrates multi-modal inputs including video frames, subtitles, and audio, and uses rigorous manual labeling by expert annotators for precise assessment.", "paper_link": "https://arxiv.org/abs/2405.21075", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.901883+00:00", "updated_at": "2025-07-19T19:56:13.901883+00:00" } ================================================ FILE: data/benchmarks/video-mmew-sub.json ================================================ { "benchmark_id": "video-mmew-sub", "name": "Video-MMEw sub", "parent_benchmark_id": null, "categories": ["multimodal", "reasoning", "vision"], "modality": "multimodal", "multilingual": true, "max_score": 1.0, "language": "en", "description": "Video-MME is the first comprehensive evaluation benchmark for multi-modal large language models in video analysis. It consists of 900 videos (254 hours total) across 6 domains and 30 sub-categories, with 2,700 high-quality multiple-choice questions. 
The benchmark evaluates MLLMs on diverse video types of varying durations (11 seconds to 1 hour) with multi-modal inputs including video frames, subtitles, and audio to assess perception, reasoning, and temporal understanding capabilities.", "paper_link": "https://arxiv.org/abs/2405.21075", "implementation_link": null, "verified": false, "created_at": "2025-08-03T22:06:11.276310+00:00", "updated_at": "2025-08-03T22:06:11.276310+00:00" } ================================================ FILE: data/benchmarks/videomme-w-o-sub..json ================================================ { "benchmark_id": "videomme-w-o-sub.", "name": "VideoMME w/o sub.", "parent_benchmark_id": null, "categories": ["multimodal", "video", "vision"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Video-MME is a comprehensive evaluation benchmark for multi-modal large language models in video analysis. It features 900 videos across 6 primary visual domains with 30 subfields, ranging from 11 seconds to 1 hour in duration, with 2,700 question-answer pairs. The benchmark evaluates MLLMs' capabilities in processing sequential visual data and multi-modal content including video frames, subtitles, and audio.", "paper_link": "https://arxiv.org/abs/2405.21075", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.715184+00:00", "updated_at": "2025-07-19T19:56:14.715184+00:00" } ================================================ FILE: data/benchmarks/videomme-w-sub..json ================================================ { "benchmark_id": "videomme-w-sub.", "name": "VideoMME w sub.", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "video"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "The first-ever comprehensive evaluation benchmark of Multi-modal LLMs in Video analysis. 
Features 900 videos (254 hours) with 2,700 question-answer pairs covering 6 primary visual domains and 30 subfields. Evaluates temporal understanding across short (11 seconds) to long (1 hour) videos with multi-modal inputs including video frames, subtitles, and audio.", "paper_link": "https://arxiv.org/abs/2405.21075", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.723259+00:00", "updated_at": "2025-07-19T19:56:14.723259+00:00" } ================================================ FILE: data/benchmarks/videommmu.json ================================================ { "benchmark_id": "videommmu", "name": "VideoMMMU", "parent_benchmark_id": null, "categories": ["multimodal", "vision", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "Video-MMMU evaluates Large Multimodal Models' ability to acquire knowledge from expert-level professional videos across six disciplines through three cognitive stages: perception, comprehension, and adaptation. Contains 300 videos and 900 human-annotated questions spanning Art, Business, Science, Medicine, Humanities, and Engineering.", "paper_link": "https://arxiv.org/abs/2501.13826", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.007381+00:00", "updated_at": "2025-07-19T19:56:14.007381+00:00" } ================================================ FILE: data/benchmarks/visualwebbench.json ================================================ { "benchmark_id": "visualwebbench", "name": "VisualWebBench", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "frontend_development"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A multimodal benchmark designed to assess the capabilities of multimodal large language models (MLLMs) across web page understanding and grounding tasks. 
Comprises 7 tasks (captioning, webpage QA, heading OCR, element OCR, element grounding, action prediction, and action grounding) with 1.5K human-curated instances from 139 real websites across 87 sub-domains.", "paper_link": "https://arxiv.org/abs/2404.05955", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:12.747583+00:00", "updated_at": "2025-07-19T19:56:12.747583+00:00" } ================================================ FILE: data/benchmarks/vocalsound.json ================================================ { "benchmark_id": "vocalsound", "name": "VocalSound", "parent_benchmark_id": null, "categories": ["audio"], "modality": "audio", "multilingual": false, "max_score": 1.0, "language": "en", "description": "A dataset for improving human vocal sounds recognition, containing over 21,000 crowdsourced recordings of laughter, sighs, coughs, throat clearing, sneezes, and sniffs from 3,365 unique subjects. Used for audio event classification and recognition of human non-speech vocalizations.", "paper_link": "https://arxiv.org/abs/2205.03433", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.919198+00:00", "updated_at": "2025-07-19T19:56:14.919198+00:00" } ================================================ FILE: data/benchmarks/voicebench-avg.json ================================================ { "benchmark_id": "voicebench-avg", "name": "VoiceBench Avg", "parent_benchmark_id": null, "categories": ["general", "reasoning", "safety", "communication"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "VoiceBench is the first benchmark designed to provide a multi-faceted evaluation of LLM-based voice assistants, evaluating capabilities including general knowledge, instruction-following, reasoning, and safety using both synthetic and real spoken instruction data with diverse speaker characteristics and environmental conditions.", "paper_link": 
"https://arxiv.org/abs/2410.17196", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.922519+00:00", "updated_at": "2025-07-19T19:56:14.922519+00:00" } ================================================ FILE: data/benchmarks/vqa-rad.json ================================================ { "benchmark_id": "vqa-rad", "name": "VQA-Rad", "parent_benchmark_id": null, "categories": ["vision", "healthcare", "multimodal"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "VQA-RAD (Visual Question Answering in Radiology) is the first manually constructed dataset of medical visual question answering containing 3,515 clinically generated visual questions and answers about radiology images. The dataset includes questions created by clinical trainees on 315 radiology images from MedPix covering head, chest, and abdominal scans, designed to support AI development for medical image analysis and improve patient care.", "paper_link": "https://doi.org/10.1038/sdata.2018.251", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.031802+00:00", "updated_at": "2025-07-19T19:56:14.031802+00:00" } ================================================ FILE: data/benchmarks/vqav2-(test).json ================================================ { "benchmark_id": "vqav2-(test)", "name": "VQAv2 (test)", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "VQA v2.0 (Visual Question Answering v2.0) is a balanced dataset designed to counter language priors in visual question answering. It consists of complementary image pairs where the same question yields different answers, forcing models to rely on visual understanding rather than language bias. 
The dataset contains 1,105,904 questions across 204,721 COCO images, requiring understanding of vision, language, and commonsense knowledge.", "paper_link": "https://arxiv.org/abs/1612.00837", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.430940+00:00", "updated_at": "2025-07-19T19:56:14.430940+00:00" } ================================================ FILE: data/benchmarks/vqav2-(val).json ================================================ { "benchmark_id": "vqav2-(val)", "name": "VQAv2 (val)", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "language", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "VQAv2 is a balanced Visual Question Answering dataset containing open-ended questions about images that require understanding of vision, language, and commonsense knowledge to answer. VQAv2 addresses bias issues from the original VQA dataset by collecting complementary images such that every question is associated with similar images that result in different answers, forcing models to actually understand visual content rather than relying on language priors.", "paper_link": "https://arxiv.org/abs/1612.00837", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.647852+00:00", "updated_at": "2025-07-19T19:56:13.647852+00:00" } ================================================ FILE: data/benchmarks/vqav2.json ================================================ { "benchmark_id": "vqav2", "name": "VQAv2", "parent_benchmark_id": null, "categories": ["vision", "multimodal", "reasoning"], "modality": "multimodal", "multilingual": false, "max_score": 1.0, "language": "en", "description": "VQAv2 is a balanced Visual Question Answering dataset that addresses language bias by providing complementary images for each question, forcing models to rely on visual understanding rather than language priors. 
It contains approximately twice the number of image-question pairs compared to the original VQA dataset.", "paper_link": "https://arxiv.org/abs/1612.00837", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:14.410411+00:00", "updated_at": "2025-07-19T19:56:14.410411+00:00" } ================================================ FILE: data/benchmarks/wild-bench.json ================================================ { "benchmark_id": "wild-bench", "name": "Wild Bench", "parent_benchmark_id": null, "categories": ["general", "reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "WildBench is an automated evaluation framework that benchmarks large language models using 1,024 challenging, real-world tasks selected from over one million human-chatbot conversation logs. It introduces two evaluation metrics (WB-Reward and WB-Score) that achieve high correlation with human preferences and uses task-specific checklists for systematic evaluation.", "paper_link": "https://arxiv.org/abs/2406.04770", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.122112+00:00", "updated_at": "2025-07-19T19:56:15.122112+00:00" } ================================================ FILE: data/benchmarks/winogrande.json ================================================ { "benchmark_id": "winogrande", "name": "Winogrande", "parent_benchmark_id": null, "categories": ["reasoning", "language"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "WinoGrande: An Adversarial Winograd Schema Challenge at Scale. A large-scale dataset of 44,000 pronoun resolution problems designed to test machine commonsense reasoning. Uses adversarial filtering to reduce spurious biases and provides a more robust evaluation of whether AI systems truly understand commonsense or exploit statistical shortcuts. 
Current best AI methods achieve 59.4-79.1% accuracy, significantly below human performance of 94.0%.", "paper_link": "https://arxiv.org/abs/1907.10641", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:11.370408+00:00", "updated_at": "2025-07-19T19:56:11.370408+00:00" } ================================================ FILE: data/benchmarks/wmt23.json ================================================ { "benchmark_id": "wmt23", "name": "WMT23", "parent_benchmark_id": null, "categories": ["language"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "The Eighth Conference on Machine Translation (WMT23) benchmark evaluating machine translation systems across 8 language pairs (14 translation directions) including general, biomedical, literary, and low-resource language translation tasks. Features specialized shared tasks for quality estimation, metrics evaluation, sign language translation, and discourse-level literary translation with professional human assessment.", "paper_link": "https://aclanthology.org/2023.wmt-1.1/", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.934606+00:00", "updated_at": "2025-07-19T19:56:13.934606+00:00" } ================================================ FILE: data/benchmarks/wmt24++.json ================================================ { "benchmark_id": "wmt24++", "name": "WMT24++", "parent_benchmark_id": null, "categories": ["language"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "WMT24++ is a comprehensive multilingual machine translation benchmark that expands the WMT24 dataset to cover 55 languages and dialects. 
It includes human-written references and post-edits across four domains (literary, news, social, and speech) to evaluate machine translation systems and large language models across diverse linguistic contexts.", "paper_link": "https://arxiv.org/abs/2502.12404", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.576712+00:00", "updated_at": "2025-07-19T19:56:13.576712+00:00" } ================================================ FILE: data/benchmarks/writingbench.json ================================================ { "benchmark_id": "writingbench", "name": "WritingBench", "parent_benchmark_id": null, "categories": ["writing", "creativity", "communication"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "A comprehensive benchmark for evaluating large language models' generative writing capabilities across 6 core writing domains (Academic & Engineering, Finance & Business, Politics & Law, Literature & Art, Education, Advertising & Marketing) and 100 subdomains. 
Contains 1,239 queries with a query-dependent evaluation framework that dynamically generates 5 instance-specific assessment criteria for each writing task, using a fine-tuned critic model to score responses on style, format, and length dimensions.", "paper_link": "https://arxiv.org/abs/2503.05244", "implementation_link": null, "verified": false, "created_at": "2025-08-03T22:06:11.074130+00:00", "updated_at": "2025-08-03T22:06:11.074130+00:00" } ================================================ FILE: data/benchmarks/xlsum-english.json ================================================ { "benchmark_id": "xlsum-english", "name": "XLSum English", "parent_benchmark_id": null, "categories": ["summarization", "language"], "modality": "text", "multilingual": true, "max_score": 1.0, "language": "en", "description": "Large-scale multilingual abstractive summarization dataset comprising 1 million professionally annotated article-summary pairs from BBC, covering 44 languages. XL-Sum is highly abstractive, concise, and of high quality, designed to encourage research on multilingual abstractive summarization tasks.", "paper_link": "https://arxiv.org/abs/2106.13822", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:15.092213+00:00", "updated_at": "2025-07-19T19:56:15.092213+00:00" } ================================================ FILE: data/benchmarks/xstest.json ================================================ { "benchmark_id": "xstest", "name": "XSTest", "parent_benchmark_id": null, "categories": ["safety"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "XSTest is a test suite designed to identify exaggerated safety behaviours in large language models. It comprises 450 prompts: 250 safe prompts across ten prompt types that well-calibrated models should not refuse to comply with, and 200 unsafe prompts as contrasts that models should refuse. 
The benchmark systematically evaluates whether models refuse to respond to clearly safe prompts due to overly cautious safety mechanisms.", "paper_link": "https://arxiv.org/abs/2308.01263", "implementation_link": null, "verified": false, "created_at": "2025-07-19T19:56:13.998594+00:00", "updated_at": "2025-07-19T19:56:13.998594+00:00" } ================================================ FILE: data/benchmarks/zebralogic.json ================================================ { "benchmark_id": "zebralogic", "name": "ZebraLogic", "parent_benchmark_id": null, "categories": ["reasoning"], "modality": "text", "multilingual": false, "max_score": 1.0, "language": "en", "description": "ZebraLogic is an evaluation framework for assessing large language models' logical reasoning capabilities through logic grid puzzles derived from constraint satisfaction problems (CSPs). The benchmark consists of 1,000 programmatically generated puzzles with controllable and quantifiable complexity, revealing a 'curse of complexity' where model accuracy declines significantly as problem complexity grows.", "paper_link": "https://arxiv.org/abs/2502.01100", "implementation_link": null, "verified": false, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-05T00:00:00.000000+00:00" } ================================================ FILE: data/licenses/apache_2_0.json ================================================ { "license_id": "apache_2_0", "name": "Apache 2.0", "allow_commercial": true, "description": "Apache License 2.0 - allows commercial use", "created_at": "2025-07-19T19:49:05.605369+00:00", "updated_at": "2025-07-19T19:49:05.605369+00:00" } ================================================ FILE: data/licenses/cc_by_nc.json ================================================ { "license_id": "cc_by_nc", "name": "CC BY-NC", "allow_commercial": false, "description": "Creative Commons Non-Commercial", "created_at": "2025-07-19T19:49:05.408956+00:00", "updated_at": 
"2025-07-19T19:49:05.408956+00:00" } ================================================ FILE: data/licenses/creative_commons_attribution_4_0_license.json ================================================ { "license_id": "creative_commons_attribution_4_0_license", "name": "Creative Commons Attribution 4.0 License", "allow_commercial": false, "description": "Creative Commons Attribution 4.0 License license", "created_at": "2025-07-19T19:49:05.471773+00:00", "updated_at": "2025-07-19T19:49:05.471773+00:00" } ================================================ FILE: data/licenses/deepseek.json ================================================ { "license_id": "deepseek", "name": "deepseek", "allow_commercial": false, "description": "deepseek license", "created_at": "2025-07-19T19:49:05.656652+00:00", "updated_at": "2025-07-19T19:49:05.656652+00:00" } ================================================ FILE: data/licenses/gemma.json ================================================ { "license_id": "gemma", "name": "Gemma", "allow_commercial": true, "description": "Google Gemma Terms of Use", "created_at": "2025-07-19T19:49:05.442645+00:00", "updated_at": "2025-07-19T19:49:05.442645+00:00" } ================================================ FILE: data/licenses/health_ai_developer_foundations_terms_of_use.json ================================================ { "license_id": "health_ai_developer_foundations_terms_of_use", "name": "Health AI Developer Foundations terms of use", "allow_commercial": false, "description": "Health AI Developer Foundations terms of use license", "created_at": "2025-07-19T19:49:05.510423+00:00", "updated_at": "2025-07-19T19:49:05.510423+00:00" } ================================================ FILE: data/licenses/jamba_open_model_license.json ================================================ { "license_id": "jamba_open_model_license", "name": "Jamba Open Model License", "allow_commercial": false, "description": "Jamba Open Model License license", "created_at": 
"2025-07-19T19:49:05.763778+00:00", "updated_at": "2025-07-19T19:49:05.763778+00:00" } ================================================ FILE: data/licenses/llama3_2.json ================================================ { "license_id": "llama3_2", "name": "Llama 3.2", "allow_commercial": true, "description": "Meta Llama 3.2 Community License", "created_at": "2025-07-19T19:49:05.578287+00:00", "updated_at": "2025-07-19T19:49:05.578287+00:00" } ================================================ FILE: data/licenses/llama_3_1_community_license.json ================================================ { "license_id": "llama_3_1_community_license", "name": "Llama 3.1 Community License", "allow_commercial": false, "description": "Llama 3.1 Community License license", "created_at": "2025-07-19T19:49:05.574080+00:00", "updated_at": "2025-07-19T19:49:05.574080+00:00" } ================================================ FILE: data/licenses/llama_3_2_community_license.json ================================================ { "license_id": "llama_3_2_community_license", "name": "Llama 3.2 Community License", "allow_commercial": false, "description": "Llama 3.2 Community License license", "created_at": "2025-07-19T19:49:05.587308+00:00", "updated_at": "2025-07-19T19:49:05.587308+00:00" } ================================================ FILE: data/licenses/llama_3_3_community_license_agreement.json ================================================ { "license_id": "llama_3_3_community_license_agreement", "name": "Llama 3.3 Community License Agreement", "allow_commercial": false, "description": "Llama 3.3 Community License Agreement license", "created_at": "2025-07-19T19:49:05.602167+00:00", "updated_at": "2025-07-19T19:49:05.602167+00:00" } ================================================ FILE: data/licenses/llama_4_community_license_agreement.json ================================================ { "license_id": "llama_4_community_license_agreement", "name": "Llama 4 Community License 
Agreement", "allow_commercial": false, "description": "Llama 4 Community License Agreement license", "created_at": "2025-07-19T19:49:05.593881+00:00", "updated_at": "2025-07-19T19:49:05.593881+00:00" } ================================================ FILE: data/licenses/mistral_research_license.json ================================================ { "license_id": "mistral_research_license", "name": "Mistral Research License", "allow_commercial": false, "description": "Mistral Research License license", "created_at": "2025-07-19T19:49:05.785093+00:00", "updated_at": "2025-07-19T19:49:05.785093+00:00" } ================================================ FILE: data/licenses/mistral_research_license_(mrl)_for_research;_mistral_commercial_license_for_commercial_use.json ================================================ { "license_id": "mistral_research_license_(mrl)_for_research;_mistral_commercial_license_for_commercial_use", "name": "Mistral Research License (MRL) for research; Mistral Commercial License for commercial use", "allow_commercial": false, "description": "Mistral Research License (MRL) for research; Mistral Commercial License for commercial use license", "created_at": "2025-07-19T19:49:05.911442+00:00", "updated_at": "2025-07-19T19:49:05.911442+00:00" } ================================================ FILE: data/licenses/mit.json ================================================ { "license_id": "mit", "name": "MIT", "allow_commercial": true, "description": "MIT License - allows commercial use", "created_at": "2025-07-19T19:49:05.544627+00:00", "updated_at": "2025-07-19T19:49:05.544627+00:00" } ================================================ FILE: data/licenses/mit_+_model_license_(commercial_use_allowed).json ================================================ { "license_id": "mit_+_model_license_(commercial_use_allowed)", "name": "MIT + Model License (Commercial use allowed)", "allow_commercial": false, "description": "MIT + Model License (Commercial use 
allowed) license", "created_at": "2025-07-19T19:49:05.676049+00:00", "updated_at": "2025-07-19T19:49:05.676049+00:00" } ================================================ FILE: data/licenses/mit_license.json ================================================ { "license_id": "mit_license", "name": "MIT License", "allow_commercial": false, "description": "MIT License license", "created_at": "2025-07-19T19:49:05.897679+00:00", "updated_at": "2025-07-19T19:49:05.897679+00:00" } ================================================ FILE: data/licenses/mnpl_0_1.json ================================================ { "license_id": "mnpl_0_1", "name": "MNPL-0.1", "allow_commercial": false, "description": "MNPL-0.1 license", "created_at": "2025-07-19T19:49:05.804469+00:00", "updated_at": "2025-07-19T19:49:05.804469+00:00" } ================================================ FILE: data/licenses/modified_mit_license.json ================================================ { "license_id": "modified_mit_license", "name": "Modified MIT License", "allow_commercial": false, "description": "Modified MIT License license", "created_at": "2025-07-19T19:49:05.420757+00:00", "updated_at": "2025-07-19T19:49:05.420757+00:00" } ================================================ FILE: data/licenses/nvidia_open_model_license_agreement.json ================================================ { "license_id": "nvidia_open_model_license_agreement", "name": "NVIDIA Open Model License Agreement ", "allow_commercial": true, "description": "NVIDIA Open Model License Agreement ", "created_at": "2025-10-02T21:51:16.835+00:00", "updated_at": "2025-10-02T21:51:16.835+00:00" } ================================================ FILE: data/licenses/proprietary.json ================================================ { "license_id": "proprietary", "name": "Proprietary", "allow_commercial": false, "description": "Proprietary license - usage restrictions apply", "created_at": "2025-07-19T19:49:05.425183+00:00", "updated_at": 
"2025-07-19T19:49:05.425183+00:00" } ================================================ FILE: data/licenses/qwen.json ================================================ { "license_id": "qwen", "name": "Qwen", "allow_commercial": true, "description": "Alibaba Qwen License", "created_at": "2025-07-19T19:49:05.626726+00:00", "updated_at": "2025-07-19T19:49:05.626726+00:00" } ================================================ FILE: data/licenses/tongyi_qianwen.json ================================================ { "license_id": "tongyi_qianwen", "name": "tongyi-qianwen", "allow_commercial": false, "description": "tongyi-qianwen license", "created_at": "2025-07-19T19:49:05.618579+00:00", "updated_at": "2025-07-19T19:49:05.618579+00:00" } ================================================ FILE: data/licenses/unknown.json ================================================ { "license_id": "unknown", "name": "Unknown", "allow_commercial": false, "description": "Unknown license", "created_at": "2025-08-03T22:06:10.793734+00:00", "updated_at": "2025-08-03T22:06:10.793734+00:00" } ================================================ FILE: data/organizations/ai21/models/jamba-1.5-large/benchmarks.json ================================================ [ { "model_benchmark_id": 28, "benchmark_id": "arc-c", "model_id": "jamba-1.5-large", "score": 0.93, "normalized_score": 0.93, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.139664+00:00", "updated_at": "2025-07-19T19:56:11.139664+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1462, "benchmark_id": "arena-hard", "model_id": "jamba-1.5-large", "score": 0.654, "normalized_score": 0.654, "is_self_reported": true, "self_reported_source_link": 
"https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.114965+00:00", "updated_at": "2025-07-19T19:56:14.114965+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 338, "benchmark_id": "gpqa", "model_id": "jamba-1.5-large", "score": 0.369, "normalized_score": 0.369, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.736664+00:00", "updated_at": "2025-07-19T19:56:11.736664+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1011, "benchmark_id": "gsm8k", "model_id": "jamba-1.5-large", "score": 0.87, "normalized_score": 0.87, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.109009+00:00", "updated_at": "2025-07-19T19:56:13.109009+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 108, "benchmark_id": "mmlu", "model_id": "jamba-1.5-large", "score": 0.812, "normalized_score": 0.812, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large", "verified_by_llmstats": false, "analysis_method": "Chain-of-Thought accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.302578+00:00", "updated_at": 
"2025-07-19T19:56:11.302578+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 213, "benchmark_id": "mmlu-pro", "model_id": "jamba-1.5-large", "score": 0.535, "normalized_score": 0.535, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large", "verified_by_llmstats": false, "analysis_method": "Chain-of-Thought accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.505024+00:00", "updated_at": "2025-07-19T19:56:11.505024+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 144, "benchmark_id": "truthfulqa", "model_id": "jamba-1.5-large", "score": 0.583, "normalized_score": 0.583, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.365684+00:00", "updated_at": "2025-07-19T19:56:11.365684+00:00", "benchmark_name": "TruthfulQA" }, { "model_benchmark_id": 1816, "benchmark_id": "wild-bench", "model_id": "jamba-1.5-large", "score": 0.485, "normalized_score": 0.485, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.125090+00:00", "updated_at": "2025-07-19T19:56:15.125090+00:00", "benchmark_name": "Wild Bench" } ] ================================================ FILE: data/organizations/ai21/models/jamba-1.5-large/model.json ================================================ { "model_id": "jamba-1.5-large", "name": "Jamba 1.5 Large", "organization_id": 
"ai21", "fine_tuned_from_model_id": null, "description": "State-of-the-art hybrid SSM-Transformer instruction following foundation model, offering superior long context handling, speed, and quality.", "release_date": "2024-08-22", "announcement_date": "2024-08-22", "license_id": "jamba_open_model_license", "multimodal": false, "knowledge_cutoff": "2024-03-05", "param_count": 398000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.ai21.com/reference/jamba-15-api-ref", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://www.ai21.com/blog/announcing-jamba-model-family", "source_repo_link": null, "source_weights_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large", "created_at": "2025-07-19T19:49:05.764734+00:00", "updated_at": "2025-07-19T19:49:05.764734+00:00", "model_family_id": null } ================================================ FILE: data/organizations/ai21/models/jamba-1.5-mini/benchmarks.json ================================================ [ { "model_benchmark_id": 29, "benchmark_id": "arc-c", "model_id": "jamba-1.5-mini", "score": 0.857, "normalized_score": 0.857, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.141043+00:00", "updated_at": "2025-07-19T19:56:11.141043+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1463, "benchmark_id": "arena-hard", "model_id": "jamba-1.5-mini", "score": 0.461, "normalized_score": 0.461, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:14.117178+00:00", "updated_at": "2025-07-19T19:56:14.117178+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 339, "benchmark_id": "gpqa", "model_id": "jamba-1.5-mini", "score": 0.323, "normalized_score": 0.323, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.739037+00:00", "updated_at": "2025-07-19T19:56:11.739037+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1012, "benchmark_id": "gsm8k", "model_id": "jamba-1.5-mini", "score": 0.758, "normalized_score": 0.758, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.110443+00:00", "updated_at": "2025-07-19T19:56:13.110443+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 109, "benchmark_id": "mmlu", "model_id": "jamba-1.5-mini", "score": 0.697, "normalized_score": 0.697, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini", "verified_by_llmstats": false, "analysis_method": "Chain-of-Thought accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.304017+00:00", "updated_at": "2025-07-19T19:56:11.304017+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 214, "benchmark_id": "mmlu-pro", "model_id": "jamba-1.5-mini", "score": 0.425, "normalized_score": 0.425, "is_self_reported": true, 
"self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini", "verified_by_llmstats": false, "analysis_method": "Chain-of-Thought accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.506893+00:00", "updated_at": "2025-07-19T19:56:11.506893+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 145, "benchmark_id": "truthfulqa", "model_id": "jamba-1.5-mini", "score": 0.541, "normalized_score": 0.541, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.367476+00:00", "updated_at": "2025-07-19T19:56:11.367476+00:00", "benchmark_name": "TruthfulQA" }, { "model_benchmark_id": 1817, "benchmark_id": "wild-bench", "model_id": "jamba-1.5-mini", "score": 0.424, "normalized_score": 0.424, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.127075+00:00", "updated_at": "2025-07-19T19:56:15.127075+00:00", "benchmark_name": "Wild Bench" } ] ================================================ FILE: data/organizations/ai21/models/jamba-1.5-mini/model.json ================================================ { "model_id": "jamba-1.5-mini", "name": "Jamba 1.5 Mini", "organization_id": "ai21", "fine_tuned_from_model_id": null, "description": "Part of the Jamba 1.5 family, a state-of-the-art hybrid SSM-Transformer instruction following foundation model offering superior long context handling, speed, and 
quality.", "release_date": "2024-08-22", "announcement_date": "2024-08-22", "license_id": "jamba_open_model_license", "multimodal": false, "knowledge_cutoff": "2024-03-05", "param_count": 52000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.ai21.com/reference/jamba-15-api-ref", "source_playground": null, "source_paper": "https://arxiv.org/abs/2408.12570", "source_scorecard_blog_link": "https://www.ai21.com/blog/announcing-jamba-model-family", "source_repo_link": null, "source_weights_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini", "created_at": "2025-07-19T19:49:05.767535+00:00", "updated_at": "2025-07-19T19:49:05.767535+00:00", "model_family_id": null } ================================================ FILE: data/organizations/ai21/organization.json ================================================ { "organization_id": "ai21", "name": "AI21 Labs", "website": "https://ai21.com", "description": "NLP AI company", "country": null, "created_at": "2025-07-19T19:49:05.762555+00:00", "updated_at": "2025-07-19T19:49:05.762555+00:00" } ================================================ FILE: data/organizations/amazon/models/nova-lite/benchmarks.json ================================================ [ { "model_benchmark_id": 2, "benchmark_id": "arc-c", "model_id": "nova-lite", "score": 0.924, "normalized_score": 0.924, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot chain-of-thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.080108+00:00", "updated_at": "2025-07-19T19:56:11.080108+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 967, "benchmark_id": "bbh", "model_id": "nova-lite", "score": 0.824, "normalized_score": 0.824, 
"is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "3-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.034481+00:00", "updated_at": "2025-07-19T19:56:13.034481+00:00", "benchmark_name": "BBH" }, { "model_benchmark_id": 843, "benchmark_id": "bfcl", "model_id": "nova-lite", "score": 0.666, "normalized_score": 0.666, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.766776+00:00", "updated_at": "2025-07-19T19:56:12.766776+00:00", "benchmark_name": "BFCL" }, { "model_benchmark_id": 853, "benchmark_id": "chartqa", "model_id": "nova-lite", "score": 0.868, "normalized_score": 0.868, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "relaxed accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.786772+00:00", "updated_at": "2025-07-19T19:56:12.786772+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 834, "benchmark_id": "crag", "model_id": "nova-lite", "score": 0.438, "normalized_score": 0.438, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, 
"analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.743484+00:00", "updated_at": "2025-07-19T19:56:12.743484+00:00", "benchmark_name": "CRAG" }, { "model_benchmark_id": 876, "benchmark_id": "docvqa", "model_id": "nova-lite", "score": 0.924, "normalized_score": 0.924, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "ANLS", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.827478+00:00", "updated_at": "2025-07-19T19:56:12.827478+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 939, "benchmark_id": "drop", "model_id": "nova-lite", "score": 0.802, "normalized_score": 0.802, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.984716+00:00", "updated_at": "2025-07-19T19:56:12.984716+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 918, "benchmark_id": "egoschema", "model_id": "nova-lite", "score": 0.714, "normalized_score": 0.714, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.918221+00:00", "updated_at": 
"2025-07-19T19:56:12.918221+00:00", "benchmark_name": "EgoSchema" }, { "model_benchmark_id": 831, "benchmark_id": "finqa", "model_id": "nova-lite", "score": 0.736, "normalized_score": 0.736, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.736609+00:00", "updated_at": "2025-07-19T19:56:12.736609+00:00", "benchmark_name": "FinQA" }, { "model_benchmark_id": 258, "benchmark_id": "gpqa", "model_id": "nova-lite", "score": 0.42, "normalized_score": 0.42, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "6-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.594691+00:00", "updated_at": "2025-07-19T19:56:11.594691+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 841, "benchmark_id": "groundui-1k", "model_id": "nova-lite", "score": 0.802, "normalized_score": 0.802, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.761300+00:00", "updated_at": "2025-07-19T19:56:12.761300+00:00", "benchmark_name": "GroundUI-1K" }, { "model_benchmark_id": 160, "benchmark_id": "gsm8k", "model_id": "nova-lite", "score": 0.945, "normalized_score": 0.945, 
"is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.407299+00:00", "updated_at": "2025-07-19T19:56:11.407299+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 759, "benchmark_id": "humaneval", "model_id": "nova-lite", "score": 0.854, "normalized_score": 0.854, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.601822+00:00", "updated_at": "2025-07-19T19:56:12.601822+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 604, "benchmark_id": "ifeval", "model_id": "nova-lite", "score": 0.897, "normalized_score": 0.897, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.248959+00:00", "updated_at": "2025-07-19T19:56:12.248959+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 826, "benchmark_id": "lvbench", "model_id": "nova-lite", "score": 0.404, "normalized_score": 0.404, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, 
"analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.726573+00:00", "updated_at": "2025-07-19T19:56:12.726573+00:00", "benchmark_name": "LVBench" }, { "model_benchmark_id": 374, "benchmark_id": "math", "model_id": "nova-lite", "score": 0.733, "normalized_score": 0.733, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.810622+00:00", "updated_at": "2025-07-19T19:56:11.810622+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 60, "benchmark_id": "mmlu", "model_id": "nova-lite", "score": 0.805, "normalized_score": 0.805, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot chain-of-thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.212315+00:00", "updated_at": "2025-07-19T19:56:11.212315+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 839, "benchmark_id": "mm-mind2web", "model_id": "nova-lite", "score": 0.607, "normalized_score": 0.607, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:12.755878+00:00", "updated_at": "2025-07-19T19:56:12.755878+00:00", "benchmark_name": "MM-Mind2Web" }, { "model_benchmark_id": 550, "benchmark_id": "mmmu", "model_id": "nova-lite", "score": 0.562, "normalized_score": 0.562, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "CoT accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.134288+00:00", "updated_at": "2025-07-19T19:56:12.134288+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 821, "benchmark_id": "squality", "model_id": "nova-lite", "score": 0.192, "normalized_score": 0.192, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "rouge-l", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.715662+00:00", "updated_at": "2025-07-19T19:56:12.715662+00:00", "benchmark_name": "SQuALITY" }, { "model_benchmark_id": 901, "benchmark_id": "textvqa", "model_id": "nova-lite", "score": 0.802, "normalized_score": 0.802, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "weighted accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.878076+00:00", "updated_at": "2025-07-19T19:56:12.878076+00:00", "benchmark_name": "TextVQA" }, { "model_benchmark_id": 930, "benchmark_id": 
"translation-en\u2192set1-comet22", "model_id": "nova-lite", "score": 0.888, "normalized_score": 0.888, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "COMET22", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.962491+00:00", "updated_at": "2025-07-19T19:56:12.962491+00:00", "benchmark_name": "Translation en\u2192Set1 COMET22" }, { "model_benchmark_id": 927, "benchmark_id": "translation-en\u2192set1-spbleu", "model_id": "nova-lite", "score": 0.415, "normalized_score": 0.415, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "spBleu", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.942744+00:00", "updated_at": "2025-07-19T19:56:12.942744+00:00", "benchmark_name": "Translation en\u2192Set1 spBleu" }, { "model_benchmark_id": 936, "benchmark_id": "translation-set1\u2192en-comet22", "model_id": "nova-lite", "score": 0.888, "normalized_score": 0.888, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "COMET22", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.977060+00:00", "updated_at": "2025-07-19T19:56:12.977060+00:00", "benchmark_name": "Translation Set1\u2192en COMET22" }, { "model_benchmark_id": 933, "benchmark_id": "translation-set1\u2192en-spbleu", "model_id": 
"nova-lite", "score": 0.431, "normalized_score": 0.431, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "spBleu", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.969524+00:00", "updated_at": "2025-07-19T19:56:12.969524+00:00", "benchmark_name": "Translation Set1\u2192en spBleu" }, { "model_benchmark_id": 916, "benchmark_id": "vatex", "model_id": "nova-lite", "score": 0.778, "normalized_score": 0.778, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "CIDEr", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.912261+00:00", "updated_at": "2025-07-19T19:56:12.912261+00:00", "benchmark_name": "VATEX" }, { "model_benchmark_id": 837, "benchmark_id": "visualwebbench", "model_id": "nova-lite", "score": 0.777, "normalized_score": 0.777, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "composite step accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.750738+00:00", "updated_at": "2025-07-19T19:56:12.750738+00:00", "benchmark_name": "VisualWebBench" } ] ================================================ FILE: data/organizations/amazon/models/nova-lite/model.json ================================================ { "model_id": "nova-lite", "name": "Nova Lite", 
"organization_id": "amazon", "fine_tuned_from_model_id": null, "description": "A low-cost multimodal model that is lightning fast for processing images, video, documents, and text.", "release_date": "2024-11-20", "announcement_date": "2024-11-20", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://aws.amazon.com/bedrock/amazon-nova-lite", "source_playground": null, "source_paper": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "source_scorecard_blog_link": null, "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.429271+00:00", "updated_at": "2025-07-19T19:49:05.429271+00:00", "model_family_id": null } ================================================ FILE: data/organizations/amazon/models/nova-micro/benchmarks.json ================================================ [ { "model_benchmark_id": 4, "benchmark_id": "arc-c", "model_id": "nova-micro", "score": 0.902, "normalized_score": 0.902, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.088301+00:00", "updated_at": "2025-07-19T19:56:11.088301+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 969, "benchmark_id": "bbh", "model_id": "nova-micro", "score": 0.795, "normalized_score": 0.795, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "3-shot Chain-of-Thought", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.038288+00:00", "updated_at": "2025-07-19T19:56:13.038288+00:00", "benchmark_name": "BBH" }, { "model_benchmark_id": 845, "benchmark_id": "bfcl", "model_id": "nova-micro", "score": 0.562, "normalized_score": 0.562, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.770319+00:00", "updated_at": "2025-07-19T19:56:12.770319+00:00", "benchmark_name": "BFCL" }, { "model_benchmark_id": 836, "benchmark_id": "crag", "model_id": "nova-micro", "score": 0.431, "normalized_score": 0.431, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.746657+00:00", "updated_at": "2025-07-19T19:56:12.746657+00:00", "benchmark_name": "CRAG" }, { "model_benchmark_id": 941, "benchmark_id": "drop", "model_id": "nova-micro", "score": 0.793, "normalized_score": 0.793, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "6-shot Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.987950+00:00", "updated_at": "2025-07-19T19:56:12.987950+00:00", 
"benchmark_name": "DROP" }, { "model_benchmark_id": 833, "benchmark_id": "finqa", "model_id": "nova-micro", "score": 0.652, "normalized_score": 0.652, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.740201+00:00", "updated_at": "2025-07-19T19:56:12.740201+00:00", "benchmark_name": "FinQA" }, { "model_benchmark_id": 260, "benchmark_id": "gpqa", "model_id": "nova-micro", "score": 0.4, "normalized_score": 0.4, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.598530+00:00", "updated_at": "2025-07-19T19:56:11.598530+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 976, "benchmark_id": "gsm8k", "model_id": "nova-micro", "score": 0.923, "normalized_score": 0.923, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.051041+00:00", "updated_at": "2025-07-19T19:56:13.051041+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 761, "benchmark_id": "humaneval", "model_id": "nova-micro", "score": 0.811, "normalized_score": 0.811, "is_self_reported": 
true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.605066+00:00", "updated_at": "2025-07-19T19:56:12.605066+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 606, "benchmark_id": "ifeval", "model_id": "nova-micro", "score": 0.872, "normalized_score": 0.872, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.252589+00:00", "updated_at": "2025-07-19T19:56:12.252589+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 376, "benchmark_id": "math", "model_id": "nova-micro", "score": 0.693, "normalized_score": 0.693, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.814150+00:00", "updated_at": "2025-07-19T19:56:11.814150+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 62, "benchmark_id": "mmlu", "model_id": "nova-micro", "score": 0.776, "normalized_score": 0.776, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": 
"0-shot Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.217284+00:00", "updated_at": "2025-07-19T19:56:11.217284+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 823, "benchmark_id": "squality", "model_id": "nova-micro", "score": 0.188, "normalized_score": 0.188, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "rouge-l", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.719314+00:00", "updated_at": "2025-07-19T19:56:12.719314+00:00", "benchmark_name": "SQuALITY" }, { "model_benchmark_id": 932, "benchmark_id": "translation-en\u2192set1-comet22", "model_id": "nova-micro", "score": 0.885, "normalized_score": 0.885, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "COMET22", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.966157+00:00", "updated_at": "2025-07-19T19:56:12.966157+00:00", "benchmark_name": "Translation en\u2192Set1 COMET22" }, { "model_benchmark_id": 929, "benchmark_id": "translation-en\u2192set1-spbleu", "model_id": "nova-micro", "score": 0.402, "normalized_score": 0.402, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "spBleu", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:12.958167+00:00", "updated_at": "2025-07-19T19:56:12.958167+00:00", "benchmark_name": "Translation en\u2192Set1 spBleu" }, { "model_benchmark_id": 938, "benchmark_id": "translation-set1\u2192en-comet22", "model_id": "nova-micro", "score": 0.887, "normalized_score": 0.887, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "COMET22", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.980365+00:00", "updated_at": "2025-07-19T19:56:12.980365+00:00", "benchmark_name": "Translation Set1\u2192en COMET22" }, { "model_benchmark_id": 935, "benchmark_id": "translation-set1\u2192en-spbleu", "model_id": "nova-micro", "score": 0.426, "normalized_score": 0.426, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "spBleu", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.973209+00:00", "updated_at": "2025-07-19T19:56:12.973209+00:00", "benchmark_name": "Translation Set1\u2192en spBleu" } ] ================================================ FILE: data/organizations/amazon/models/nova-micro/model.json ================================================ { "model_id": "nova-micro", "name": "Nova Micro", "organization_id": "amazon", "fine_tuned_from_model_id": null, "description": "A text-only model that delivers lowest-latency responses at very low cost while maintaining strong performance on core language tasks. 
Optimized for speed and efficiency while preserving high accuracy on key benchmarks.", "release_date": "2024-11-20", "announcement_date": "2024-11-20", "license_id": "proprietary", "multimodal": false, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-nova.html", "source_playground": null, "source_paper": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "source_scorecard_blog_link": null, "source_repo_link": "https://huggingface.co/amazon-agi", "source_weights_link": null, "created_at": "2025-07-19T19:49:05.435386+00:00", "updated_at": "2025-07-19T19:49:05.435386+00:00", "model_family_id": null } ================================================ FILE: data/organizations/amazon/models/nova-pro/benchmarks.json ================================================ [ { "model_benchmark_id": 3, "benchmark_id": "arc-c", "model_id": "nova-pro", "score": 0.948, "normalized_score": 0.948, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.085849+00:00", "updated_at": "2025-07-19T19:56:11.085849+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 968, "benchmark_id": "bbh", "model_id": "nova-pro", "score": 0.869, "normalized_score": 0.869, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "3-shot Chain-of-Thought", "verification_provider_id": null, "verification_hardware": 
null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.036192+00:00", "updated_at": "2025-07-19T19:56:13.036192+00:00", "benchmark_name": "BBH" }, { "model_benchmark_id": 844, "benchmark_id": "bfcl", "model_id": "nova-pro", "score": 0.684, "normalized_score": 0.684, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.768714+00:00", "updated_at": "2025-07-19T19:56:12.768714+00:00", "benchmark_name": "BFCL" }, { "model_benchmark_id": 854, "benchmark_id": "chartqa", "model_id": "nova-pro", "score": 0.892, "normalized_score": 0.892, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "relaxed accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.788270+00:00", "updated_at": "2025-07-19T19:56:12.788270+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 835, "benchmark_id": "crag", "model_id": "nova-pro", "score": 0.503, "normalized_score": 0.503, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.744994+00:00", "updated_at": "2025-07-19T19:56:12.744994+00:00", "benchmark_name": "CRAG" }, { 
"model_benchmark_id": 877, "benchmark_id": "docvqa", "model_id": "nova-pro", "score": 0.935, "normalized_score": 0.935, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "ANLS", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.829064+00:00", "updated_at": "2025-07-19T19:56:12.829064+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 940, "benchmark_id": "drop", "model_id": "nova-pro", "score": 0.854, "normalized_score": 0.854, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.986311+00:00", "updated_at": "2025-07-19T19:56:12.986311+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 919, "benchmark_id": "egoschema", "model_id": "nova-pro", "score": 0.721, "normalized_score": 0.721, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.920400+00:00", "updated_at": "2025-07-19T19:56:12.920400+00:00", "benchmark_name": "EgoSchema" }, { "model_benchmark_id": 832, "benchmark_id": "finqa", "model_id": "nova-pro", "score": 0.772, "normalized_score": 0.772, "is_self_reported": true, "self_reported_source_link": 
"https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.738456+00:00", "updated_at": "2025-07-19T19:56:12.738456+00:00", "benchmark_name": "FinQA" }, { "model_benchmark_id": 259, "benchmark_id": "gpqa", "model_id": "nova-pro", "score": 0.469, "normalized_score": 0.469, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "6-shot Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.596541+00:00", "updated_at": "2025-07-19T19:56:11.596541+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 842, "benchmark_id": "groundui-1k", "model_id": "nova-pro", "score": 0.814, "normalized_score": 0.814, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.762846+00:00", "updated_at": "2025-07-19T19:56:12.762846+00:00", "benchmark_name": "GroundUI-1K" }, { "model_benchmark_id": 975, "benchmark_id": "gsm8k", "model_id": "nova-pro", "score": 0.948, "normalized_score": 0.948, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot 
Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.049455+00:00", "updated_at": "2025-07-19T19:56:13.049455+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 760, "benchmark_id": "humaneval", "model_id": "nova-pro", "score": 0.89, "normalized_score": 0.89, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.603428+00:00", "updated_at": "2025-07-19T19:56:12.603428+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 605, "benchmark_id": "ifeval", "model_id": "nova-pro", "score": 0.921, "normalized_score": 0.921, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.250818+00:00", "updated_at": "2025-07-19T19:56:12.250818+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 827, "benchmark_id": "lvbench", "model_id": "nova-pro", "score": 0.416, "normalized_score": 0.416, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.728104+00:00", 
"updated_at": "2025-07-19T19:56:12.728104+00:00", "benchmark_name": "LVBench" }, { "model_benchmark_id": 375, "benchmark_id": "math", "model_id": "nova-pro", "score": 0.766, "normalized_score": 0.766, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.812663+00:00", "updated_at": "2025-07-19T19:56:11.812663+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 61, "benchmark_id": "mmlu", "model_id": "nova-pro", "score": 0.859, "normalized_score": 0.859, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "0-shot Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.214544+00:00", "updated_at": "2025-07-19T19:56:11.214544+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 840, "benchmark_id": "mm-mind2web", "model_id": "nova-pro", "score": 0.637, "normalized_score": 0.637, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "step accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.757670+00:00", "updated_at": "2025-07-19T19:56:12.757670+00:00", "benchmark_name": "MM-Mind2Web" }, { "model_benchmark_id": 551, "benchmark_id": "mmmu", "model_id": "nova-pro", "score": 
0.617, "normalized_score": 0.617, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.135953+00:00", "updated_at": "2025-07-19T19:56:12.135953+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 822, "benchmark_id": "squality", "model_id": "nova-pro", "score": 0.198, "normalized_score": 0.198, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "ROUGE-L", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.717624+00:00", "updated_at": "2025-07-19T19:56:12.717624+00:00", "benchmark_name": "SQuALITY" }, { "model_benchmark_id": 902, "benchmark_id": "textvqa", "model_id": "nova-pro", "score": 0.815, "normalized_score": 0.815, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "weighted accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.880228+00:00", "updated_at": "2025-07-19T19:56:12.880228+00:00", "benchmark_name": "TextVQA" }, { "model_benchmark_id": 931, "benchmark_id": "translation-en\u2192set1-comet22", "model_id": "nova-pro", "score": 0.891, "normalized_score": 0.891, "is_self_reported": true, "self_reported_source_link": 
"https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "COMET22", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.964047+00:00", "updated_at": "2025-07-19T19:56:12.964047+00:00", "benchmark_name": "Translation en\u2192Set1 COMET22" }, { "model_benchmark_id": 928, "benchmark_id": "translation-en\u2192set1-spbleu", "model_id": "nova-pro", "score": 0.434, "normalized_score": 0.434, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "spBleu", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.950458+00:00", "updated_at": "2025-07-19T19:56:12.950458+00:00", "benchmark_name": "Translation en\u2192Set1 spBleu" }, { "model_benchmark_id": 937, "benchmark_id": "translation-set1\u2192en-comet22", "model_id": "nova-pro", "score": 0.89, "normalized_score": 0.89, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "COMET22", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.978787+00:00", "updated_at": "2025-07-19T19:56:12.978787+00:00", "benchmark_name": "Translation Set1\u2192en COMET22" }, { "model_benchmark_id": 934, "benchmark_id": "translation-set1\u2192en-spbleu", "model_id": "nova-pro", "score": 0.444, "normalized_score": 0.444, "is_self_reported": true, "self_reported_source_link": 
"https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "spBleu", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.971295+00:00", "updated_at": "2025-07-19T19:56:12.971295+00:00", "benchmark_name": "Translation Set1\u2192en spBleu" }, { "model_benchmark_id": 917, "benchmark_id": "vatex", "model_id": "nova-pro", "score": 0.778, "normalized_score": 0.778, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "CIDEr", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.913837+00:00", "updated_at": "2025-07-19T19:56:12.913837+00:00", "benchmark_name": "VATEX" }, { "model_benchmark_id": 838, "benchmark_id": "visualwebbench", "model_id": "nova-pro", "score": 0.797, "normalized_score": 0.797, "is_self_reported": true, "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "verified_by_llmstats": false, "analysis_method": "composite", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.752533+00:00", "updated_at": "2025-07-19T19:56:12.752533+00:00", "benchmark_name": "VisualWebBench" } ] ================================================ FILE: data/organizations/amazon/models/nova-pro/model.json ================================================ { "model_id": "nova-pro", "name": "Nova Pro", "organization_id": "amazon", "fine_tuned_from_model_id": null, "description": "Amazon Nova Pro is a highly-capable multimodal model with 
state-of-the-art performance across text, image, and video understanding. It excels at core capabilities like language understanding, mathematical reasoning, and multimodal tasks while offering industry-leading speed and cost efficiency.", "release_date": "2024-11-20", "announcement_date": "2024-11-20", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-nova.html", "source_playground": null, "source_paper": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card", "source_scorecard_blog_link": null, "source_repo_link": "https://huggingface.co/amazon-agi", "source_weights_link": null, "created_at": "2025-07-19T19:49:05.431675+00:00", "updated_at": "2025-07-19T19:49:05.431675+00:00", "model_family_id": null } ================================================ FILE: data/organizations/amazon/organization.json ================================================ { "organization_id": "amazon", "name": "Amazon", "website": "https://aws.amazon.com", "description": "Cloud and AI services", "country": null, "created_at": "2025-07-19T19:49:05.427427+00:00", "updated_at": "2025-07-19T19:49:05.427427+00:00" } ================================================ FILE: data/organizations/anthropic/models/claude-3-5-haiku-20241022/benchmarks.json ================================================ [ { "model_benchmark_id": 958, "benchmark_id": "drop", "model_id": "claude-3-5-haiku-20241022", "score": 0.831, "normalized_score": 0.831, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku", "verified_by_llmstats": false, "analysis_method": "3-shot F1 Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:13.017079+00:00", "updated_at": "2025-07-19T19:56:13.017079+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 331, "benchmark_id": "gpqa", "model_id": "claude-3-5-haiku-20241022", "score": 0.416, "normalized_score": 0.416, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.725835+00:00", "updated_at": "2025-07-19T19:56:11.725835+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 801, "benchmark_id": "humaneval", "model_id": "claude-3-5-haiku-20241022", "score": 0.881, "normalized_score": 0.881, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.671817+00:00", "updated_at": "2025-07-19T19:56:12.671817+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 417, "benchmark_id": "math", "model_id": "claude-3-5-haiku-20241022", "score": 0.694, "normalized_score": 0.694, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.885732+00:00", "updated_at": "2025-07-19T19:56:11.885732+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1292, "benchmark_id": "mgsm", "model_id": "claude-3-5-haiku-20241022", "score": 0.856, "normalized_score": 0.856, "is_self_reported": true, "self_reported_source_link": 
"https://www.anthropic.com/news/claude-3-5-haiku", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.705114+00:00", "updated_at": "2025-07-19T19:56:13.705114+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 210, "benchmark_id": "mmlu-pro", "model_id": "claude-3-5-haiku-20241022", "score": 0.65, "normalized_score": 0.65, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.499754+00:00", "updated_at": "2025-07-19T19:56:11.499754+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1347, "benchmark_id": "swe-bench-verified", "model_id": "claude-3-5-haiku-20241022", "score": 0.406, "normalized_score": 0.406, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.836974+00:00", "updated_at": "2025-07-19T19:56:13.836974+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1771, "benchmark_id": "tau-bench-airline", "model_id": "claude-3-5-haiku-20241022", "score": 0.228, "normalized_score": 0.228, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.997081+00:00", 
"updated_at": "2025-07-19T19:56:14.997081+00:00", "benchmark_name": "TAU-bench Airline" }, { "model_benchmark_id": 1757, "benchmark_id": "tau-bench-retail", "model_id": "claude-3-5-haiku-20241022", "score": 0.51, "normalized_score": 0.51, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.970473+00:00", "updated_at": "2025-07-19T19:56:14.970473+00:00", "benchmark_name": "TAU-bench Retail" } ] ================================================ FILE: data/organizations/anthropic/models/claude-3-5-haiku-20241022/model.json ================================================ { "model_id": "claude-3-5-haiku-20241022", "name": "Claude 3.5 Haiku", "organization_id": "anthropic", "fine_tuned_from_model_id": null, "description": "Claude 3.5 Haiku is Anthropic's fastest model, delivering advanced coding, tool use, and reasoning capabilities at an accessible price. It excels at user-facing products, specialized sub-agent tasks, and generating personalized experiences from large data volumes. 
The model is particularly well-suited for code completions, interactive chatbots, data extraction, and real-time content moderation.", "release_date": "2024-10-22", "announcement_date": "2024-10-22", "license_id": "proprietary", "multimodal": false, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.anthropic.com/en/docs/intro-to-claude#claude-3-5-family", "source_playground": "https://claude.ai", "source_paper": null, "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-3-5-haiku", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.744002+00:00", "updated_at": "2025-07-19T19:49:05.744002+00:00", "model_family_id": null } ================================================ FILE: data/organizations/anthropic/models/claude-3-5-sonnet-20240620/benchmarks.json ================================================ [ { "model_benchmark_id": 1086, "benchmark_id": "big-bench-hard", "model_id": "claude-3-5-sonnet-20240620", "score": 0.931, "normalized_score": 0.931, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-sonnet", "verified_by_llmstats": false, "analysis_method": "3-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.259482+00:00", "updated_at": "2025-07-19T19:56:13.259482+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 961, "benchmark_id": "drop", "model_id": "claude-3-5-sonnet-20240620", "score": 0.871, "normalized_score": 0.871, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-sonnet", "verified_by_llmstats": false, "analysis_method": "3-shot F1 Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:13.021997+00:00", "updated_at": "2025-07-19T19:56:13.021997+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 336, "benchmark_id": "gpqa", "model_id": "claude-3-5-sonnet-20240620", "score": 0.594, "normalized_score": 0.594, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-sonnet", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.733246+00:00", "updated_at": "2025-07-19T19:56:11.733246+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1010, "benchmark_id": "gsm8k", "model_id": "claude-3-5-sonnet-20240620", "score": 0.964, "normalized_score": 0.964, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-sonnet", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.107479+00:00", "updated_at": "2025-07-19T19:56:13.107479+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 804, "benchmark_id": "humaneval", "model_id": "claude-3-5-sonnet-20240620", "score": 0.92, "normalized_score": 0.92, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-sonnet", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.676235+00:00", "updated_at": "2025-07-19T19:56:12.676235+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 420, "benchmark_id": "math", "model_id": "claude-3-5-sonnet-20240620", "score": 0.711, "normalized_score": 0.711, "is_self_reported": true, "self_reported_source_link": 
"https://www.anthropic.com/news/claude-3-5-sonnet", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.891344+00:00", "updated_at": "2025-07-19T19:56:11.891344+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1295, "benchmark_id": "mgsm", "model_id": "claude-3-5-sonnet-20240620", "score": 0.916, "normalized_score": 0.916, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-sonnet", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.710814+00:00", "updated_at": "2025-07-19T19:56:13.710814+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 107, "benchmark_id": "mmlu", "model_id": "claude-3-5-sonnet-20240620", "score": 0.904, "normalized_score": 0.904, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-sonnet", "verified_by_llmstats": false, "analysis_method": "5-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.300996+00:00", "updated_at": "2025-07-19T19:56:11.300996+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 212, "benchmark_id": "mmlu-pro", "model_id": "claude-3-5-sonnet-20240620", "score": 0.761, "normalized_score": 0.761, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.503274+00:00", "updated_at": 
"2025-07-19T19:56:11.503274+00:00", "benchmark_name": "MMLU-Pro" } ] ================================================ FILE: data/organizations/anthropic/models/claude-3-5-sonnet-20240620/model.json ================================================ { "model_id": "claude-3-5-sonnet-20240620", "name": "Claude 3.5 Sonnet", "organization_id": "anthropic", "fine_tuned_from_model_id": null, "description": "Claude 3.5 Sonnet is a powerful AI model. It excels in graduate-level reasoning, undergraduate-level knowledge, and coding proficiency, with improved understanding of nuance, humor, and complex instructions.", "release_date": "2024-06-21", "announcement_date": "2024-06-21", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.anthropic.com/en/docs/intro-to-claude#claude-3-5-family", "source_playground": "https://claude.ai", "source_paper": null, "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-3-5-sonnet", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.757926+00:00", "updated_at": "2025-07-19T19:49:05.757926+00:00", "model_family_id": null } ================================================ FILE: data/organizations/anthropic/models/claude-3-5-sonnet-20241022/benchmarks.json ================================================ [ { "model_benchmark_id": 1260, "benchmark_id": "ai2d", "model_id": "claude-3-5-sonnet-20241022", "score": 0.947, "normalized_score": 0.947, "is_self_reported": true, "self_reported_source_link": "https://www-cdn.anthropic.com/fed9cc193a14b84131812372d8d5857f8f304c52/Model_Card_Claude_3_Addendum.pdf", "verified_by_llmstats": false, "analysis_method": "test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.643744+00:00", "updated_at": 
"2025-07-19T19:56:13.643744+00:00", "benchmark_name": "AI2D" }, { "model_benchmark_id": 1084, "benchmark_id": "big-bench-hard", "model_id": "claude-3-5-sonnet-20241022", "score": 0.931, "normalized_score": 0.931, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use", "verified_by_llmstats": false, "analysis_method": "3-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.256021+00:00", "updated_at": "2025-07-19T19:56:13.256021+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 872, "benchmark_id": "chartqa", "model_id": "claude-3-5-sonnet-20241022", "score": 0.908, "normalized_score": 0.908, "is_self_reported": true, "self_reported_source_link": "https://www-cdn.anthropic.com/fed9cc193a14b84131812372d8d5857f8f304c52/Model_Card_Claude_3_Addendum.pdf", "verified_by_llmstats": false, "analysis_method": "test, relaxed accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.819413+00:00", "updated_at": "2025-07-19T19:56:12.819413+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 897, "benchmark_id": "docvqa", "model_id": "claude-3-5-sonnet-20241022", "score": 0.952, "normalized_score": 0.952, "is_self_reported": true, "self_reported_source_link": "https://www-cdn.anthropic.com/fed9cc193a14b84131812372d8d5857f8f304c52/Model_Card_Claude_3_Addendum.pdf", "verified_by_llmstats": false, "analysis_method": "test, ANLS score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.867423+00:00", "updated_at": "2025-07-19T19:56:12.867423+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 959, "benchmark_id": "drop", "model_id": "claude-3-5-sonnet-20241022", 
"score": 0.871, "normalized_score": 0.871, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use", "verified_by_llmstats": false, "analysis_method": "3-shot F1 Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.018623+00:00", "updated_at": "2025-07-19T19:56:13.018623+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 334, "benchmark_id": "gpqa", "model_id": "claude-3-5-sonnet-20241022", "score": 0.672, "normalized_score": 0.672, "is_self_reported": true, "self_reported_source_link": "https://www-cdn.anthropic.com/fed9cc193a14b84131812372d8d5857f8f304c52/Model_Card_Claude_3_Addendum.pdf", "verified_by_llmstats": false, "analysis_method": "Maj@32 5-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.730271+00:00", "updated_at": "2025-07-19T19:56:11.730271+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1008, "benchmark_id": "gsm8k", "model_id": "claude-3-5-sonnet-20241022", "score": 0.964, "normalized_score": 0.964, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.104248+00:00", "updated_at": "2025-07-19T19:56:13.104248+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 802, "benchmark_id": "humaneval", "model_id": "claude-3-5-sonnet-20241022", "score": 0.937, "normalized_score": 0.937, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use", "verified_by_llmstats": false, "analysis_method": "0-shot", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.673295+00:00", "updated_at": "2025-07-19T19:56:12.673295+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 418, "benchmark_id": "math", "model_id": "claude-3-5-sonnet-20241022", "score": 0.783, "normalized_score": 0.783, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.887521+00:00", "updated_at": "2025-07-19T19:56:11.887521+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 535, "benchmark_id": "mathvista", "model_id": "claude-3-5-sonnet-20241022", "score": 0.677, "normalized_score": 0.677, "is_self_reported": true, "self_reported_source_link": "https://www-cdn.anthropic.com/fed9cc193a14b84131812372d8d5857f8f304c52/Model_Card_Claude_3_Addendum.pdf", "verified_by_llmstats": false, "analysis_method": "testmini", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.108158+00:00", "updated_at": "2025-07-19T19:56:12.108158+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1293, "benchmark_id": "mgsm", "model_id": "claude-3-5-sonnet-20241022", "score": 0.916, "normalized_score": 0.916, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.707042+00:00", "updated_at": "2025-07-19T19:56:13.707042+00:00", "benchmark_name": 
"MGSM" }, { "model_benchmark_id": 105, "benchmark_id": "mmlu", "model_id": "claude-3-5-sonnet-20241022", "score": 0.904, "normalized_score": 0.904, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use", "verified_by_llmstats": false, "analysis_method": "5-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.298011+00:00", "updated_at": "2025-07-19T19:56:11.298011+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 211, "benchmark_id": "mmlu-pro", "model_id": "claude-3-5-sonnet-20241022", "score": 0.776, "normalized_score": 0.776, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.501331+00:00", "updated_at": "2025-07-19T19:56:11.501331+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 584, "benchmark_id": "mmmu", "model_id": "claude-3-5-sonnet-20241022", "score": 0.683, "normalized_score": 0.683, "is_self_reported": true, "self_reported_source_link": "https://www-cdn.anthropic.com/fed9cc193a14b84131812372d8d5857f8f304c52/Model_Card_Claude_3_Addendum.pdf", "verified_by_llmstats": false, "analysis_method": "validation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.201491+00:00", "updated_at": "2025-07-19T19:56:12.201491+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1814, "benchmark_id": "osworld-extended", "model_id": "claude-3-5-sonnet-20241022", "score": 0.22, "normalized_score": 0.22, "is_self_reported": true, "self_reported_source_link": 
"https://www.anthropic.com/news/3-5-models-and-computer-use", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.117020+00:00", "updated_at": "2025-07-19T19:56:15.117020+00:00", "benchmark_name": "OSWorld Extended" }, { "model_benchmark_id": 1813, "benchmark_id": "osworld-screenshot-only", "model_id": "claude-3-5-sonnet-20241022", "score": 0.149, "normalized_score": 0.149, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.112291+00:00", "updated_at": "2025-07-19T19:56:15.112291+00:00", "benchmark_name": "OSWorld Screenshot-only" }, { "model_benchmark_id": 1350, "benchmark_id": "swe-bench-verified", "model_id": "claude-3-5-sonnet-20241022", "score": 0.49, "normalized_score": 0.49, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.842061+00:00", "updated_at": "2025-07-19T19:56:13.842061+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1774, "benchmark_id": "tau-bench-airline", "model_id": "claude-3-5-sonnet-20241022", "score": 0.46, "normalized_score": 0.46, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": 
null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.003886+00:00", "updated_at": "2025-07-19T19:56:15.003886+00:00", "benchmark_name": "TAU-bench Airline" }, { "model_benchmark_id": 1760, "benchmark_id": "tau-bench-retail", "model_id": "claude-3-5-sonnet-20241022", "score": 0.692, "normalized_score": 0.692, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.975456+00:00", "updated_at": "2025-07-19T19:56:14.975456+00:00", "benchmark_name": "TAU-bench Retail" } ] ================================================ FILE: data/organizations/anthropic/models/claude-3-5-sonnet-20241022/model.json ================================================ { "model_id": "claude-3-5-sonnet-20241022", "name": "Claude 3.5 Sonnet", "organization_id": "anthropic", "fine_tuned_from_model_id": null, "description": "Claude 3.5 Sonnet is a powerful AI model with industry-leading software engineering skills. It excels in coding, planning, and problem-solving, with significant improvements in agentic coding and tool use tasks. 
The model includes computer use capabilities in public beta, allowing it to interact with computer interfaces like a human user.", "release_date": "2024-10-22", "announcement_date": "2024-10-22", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.anthropic.com/en/docs/intro-to-claude#claude-3-5-family", "source_playground": "https://claude.ai", "source_paper": "https://www-cdn.anthropic.com/fed9cc193a14b84131812372d8d5857f8f304c52/Model_Card_Claude_3_Addendum.pdf", "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-3-5-sonnet", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.752534+00:00", "updated_at": "2025-07-19T19:49:05.752534+00:00", "model_family_id": null } ================================================ FILE: data/organizations/anthropic/models/claude-3-7-sonnet-20250219/benchmarks.json ================================================ [ { "model_benchmark_id": 478, "benchmark_id": "aime-2024", "model_id": "claude-3-7-sonnet-20250219", "score": 0.8, "normalized_score": 0.8, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet", "verified_by_llmstats": false, "analysis_method": "", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.007831+00:00", "updated_at": "2025-07-19T19:56:12.007831+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 700, "benchmark_id": "aime-2025", "model_id": "claude-3-7-sonnet-20250219", "score": 0.548, "normalized_score": 0.548, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Parallel test-time compute (footnotes 4, 5)", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.464908+00:00", "updated_at": "2025-07-19T19:56:12.464908+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 332, "benchmark_id": "gpqa", "model_id": "claude-3-7-sonnet-20250219", "score": 0.848, "normalized_score": 0.848, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet", "verified_by_llmstats": false, "analysis_method": "Diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.727330+00:00", "updated_at": "2025-07-19T19:56:11.727330+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 629, "benchmark_id": "ifeval", "model_id": "claude-3-7-sonnet-20250219", "score": 0.932, "normalized_score": 0.932, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet", "verified_by_llmstats": false, "analysis_method": "", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.294010+00:00", "updated_at": "2025-07-19T19:56:12.294010+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 512, "benchmark_id": "math-500", "model_id": "claude-3-7-sonnet-20250219", "score": 0.962, "normalized_score": 0.962, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet", "verified_by_llmstats": false, "analysis_method": "", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.063685+00:00", "updated_at": "2025-07-19T19:56:12.063685+00:00", "benchmark_name": "MATH-500" }, { "model_benchmark_id": 1478, "benchmark_id": "mmmlu", "model_id": "claude-3-7-sonnet-20250219", "score": 0.861, 
"normalized_score": 0.861, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet", "verified_by_llmstats": false, "analysis_method": "Average over 14 non-English languages (footnote 3)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.152773+00:00", "updated_at": "2025-07-19T19:56:14.152773+00:00", "benchmark_name": "MMMLU" }, { "model_benchmark_id": 582, "benchmark_id": "mmmu", "model_id": "claude-3-7-sonnet-20250219", "score": 0.75, "normalized_score": 0.75, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet", "verified_by_llmstats": false, "analysis_method": "validation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.197283+00:00", "updated_at": "2025-07-19T19:56:12.197283+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1348, "benchmark_id": "swe-bench-verified", "model_id": "claude-3-7-sonnet-20250219", "score": 0.703, "normalized_score": 0.703, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet", "verified_by_llmstats": false, "analysis_method": "With multiple parallel attempts and advanced scaffolding", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.838599+00:00", "updated_at": "2025-07-19T19:56:13.838599+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1772, "benchmark_id": "tau-bench-airline", "model_id": "claude-3-7-sonnet-20250219", "score": 0.584, "normalized_score": 0.584, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet", "verified_by_llmstats": false, "analysis_method": "With prompt 
addendum to better utilize planning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.999875+00:00", "updated_at": "2025-07-19T19:56:14.999875+00:00", "benchmark_name": "TAU-bench Airline" }, { "model_benchmark_id": 1758, "benchmark_id": "tau-bench-retail", "model_id": "claude-3-7-sonnet-20250219", "score": 0.812, "normalized_score": 0.812, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet", "verified_by_llmstats": false, "analysis_method": "With prompt addendum to better utilize planning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.971988+00:00", "updated_at": "2025-07-19T19:56:14.971988+00:00", "benchmark_name": "TAU-bench Retail" }, { "model_benchmark_id": 653, "benchmark_id": "terminal-bench", "model_id": "claude-3-7-sonnet-20250219", "score": 0.352, "normalized_score": 0.352, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Parallel test-time compute, Claude Code agent framework (footnotes 2, 5)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.350298+00:00", "updated_at": "2025-07-19T19:56:12.350298+00:00", "benchmark_name": "Terminal-bench" } ] ================================================ FILE: data/organizations/anthropic/models/claude-3-7-sonnet-20250219/model.json ================================================ { "model_id": "claude-3-7-sonnet-20250219", "name": "Claude 3.7 Sonnet", "organization_id": "anthropic", "fine_tuned_from_model_id": null, "description": "The most intelligent Claude model and the first hybrid reasoning model on the market. 
Claude 3.7 Sonnet can produce near-instant responses or extended, step-by-step thinking that is made visible to the user. Shows particularly strong improvements in coding and front-end web development.", "release_date": "2025-02-24", "announcement_date": "2025-02-24", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.anthropic.com/en/docs/about-claude/models/all-models", "source_playground": "https://claude.ai", "source_paper": null, "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-3-7-sonnet", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.747775+00:00", "updated_at": "2025-07-19T19:49:05.747775+00:00", "model_family_id": null } ================================================ FILE: data/organizations/anthropic/models/claude-3-haiku-20240307/benchmarks.json ================================================ [ { "model_benchmark_id": 27, "benchmark_id": "arc-c", "model_id": "claude-3-haiku-20240307", "score": 0.892, "normalized_score": 0.892, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku", "verified_by_llmstats": false, "analysis_method": "25-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.137830+00:00", "updated_at": "2025-07-19T19:56:11.137830+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1085, "benchmark_id": "big-bench-hard", "model_id": "claude-3-haiku-20240307", "score": 0.737, "normalized_score": 0.737, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku", "verified_by_llmstats": false, "analysis_method": "3-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, 
"created_at": "2025-07-19T19:56:13.257814+00:00", "updated_at": "2025-07-19T19:56:13.257814+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 960, "benchmark_id": "drop", "model_id": "claude-3-haiku-20240307", "score": 0.784, "normalized_score": 0.784, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku", "verified_by_llmstats": false, "analysis_method": "3-shot, F1 score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.020609+00:00", "updated_at": "2025-07-19T19:56:13.020609+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 335, "benchmark_id": "gpqa", "model_id": "claude-3-haiku-20240307", "score": 0.333, "normalized_score": 0.333, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.731729+00:00", "updated_at": "2025-07-19T19:56:11.731729+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1009, "benchmark_id": "gsm8k", "model_id": "claude-3-haiku-20240307", "score": 0.889, "normalized_score": 0.889, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.105970+00:00", "updated_at": "2025-07-19T19:56:13.105970+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 53, "benchmark_id": "hellaswag", "model_id": "claude-3-haiku-20240307", "score": 0.859, "normalized_score": 0.859, "is_self_reported": true, "self_reported_source_link": 
"https://www.anthropic.com/news/claude-3-haiku", "verified_by_llmstats": false, "analysis_method": "10-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.195028+00:00", "updated_at": "2025-07-19T19:56:11.195028+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 803, "benchmark_id": "humaneval", "model_id": "claude-3-haiku-20240307", "score": 0.759, "normalized_score": 0.759, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.674804+00:00", "updated_at": "2025-07-19T19:56:12.674804+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 419, "benchmark_id": "math", "model_id": "claude-3-haiku-20240307", "score": 0.389, "normalized_score": 0.389, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.889123+00:00", "updated_at": "2025-07-19T19:56:11.889123+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1294, "benchmark_id": "mgsm", "model_id": "claude-3-haiku-20240307", "score": 0.751, "normalized_score": 0.751, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.709200+00:00", "updated_at": "2025-07-19T19:56:13.709200+00:00", 
"benchmark_name": "MGSM" }, { "model_benchmark_id": 106, "benchmark_id": "mmlu", "model_id": "claude-3-haiku-20240307", "score": 0.752, "normalized_score": 0.752, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.299416+00:00", "updated_at": "2025-07-19T19:56:11.299416+00:00", "benchmark_name": "MMLU" } ] ================================================ FILE: data/organizations/anthropic/models/claude-3-haiku-20240307/model.json ================================================ { "model_id": "claude-3-haiku-20240307", "name": "Claude 3 Haiku", "organization_id": "anthropic", "fine_tuned_from_model_id": null, "description": "Claude 3 Haiku is the fastest and most compact model in the Claude 3 family, designed for near-instant responsiveness. 
It excels at answering simple queries and requests with unmatched speed, making it ideal for seamless AI experiences that mimic human interactions.", "release_date": "2024-03-13", "announcement_date": "2024-03-13", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://www.anthropic.com/claude", "source_playground": "https://claude.ai", "source_paper": "https://www-cdn.anthropic.com/de8ba9b01c9ab7cbabf5c33b80b7bbc618857627/Model_Card_Claude_3.pdf", "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-3-haiku", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.755159+00:00", "updated_at": "2025-07-19T19:49:05.755159+00:00", "model_family_id": null } ================================================ FILE: data/organizations/anthropic/models/claude-3-opus-20240229/benchmarks.json ================================================ [ { "model_benchmark_id": 25, "benchmark_id": "arc-c", "model_id": "claude-3-opus-20240229", "score": 0.964, "normalized_score": 0.964, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "25-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.134917+00:00", "updated_at": "2025-07-19T19:56:11.134917+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1082, "benchmark_id": "big-bench-hard", "model_id": "claude-3-opus-20240229", "score": 0.868, "normalized_score": 0.868, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "3-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": 
null, "created_at": "2025-07-19T19:56:13.252820+00:00", "updated_at": "2025-07-19T19:56:13.252820+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 956, "benchmark_id": "drop", "model_id": "claude-3-opus-20240229", "score": 0.831, "normalized_score": 0.831, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "3-shot, F1 Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.013702+00:00", "updated_at": "2025-07-19T19:56:13.013702+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 329, "benchmark_id": "gpqa", "model_id": "claude-3-opus-20240229", "score": 0.504, "normalized_score": 0.504, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "0-shot CoT - Diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.722913+00:00", "updated_at": "2025-07-19T19:56:11.722913+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1006, "benchmark_id": "gsm8k", "model_id": "claude-3-opus-20240229", "score": 0.95, "normalized_score": 0.95, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.101310+00:00", "updated_at": "2025-07-19T19:56:13.101310+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 51, "benchmark_id": "hellaswag", "model_id": "claude-3-opus-20240229", "score": 0.954, "normalized_score": 0.954, "is_self_reported": true, 
"self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "10-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.190975+00:00", "updated_at": "2025-07-19T19:56:11.190975+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 799, "benchmark_id": "humaneval", "model_id": "claude-3-opus-20240229", "score": 0.849, "normalized_score": 0.849, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.668395+00:00", "updated_at": "2025-07-19T19:56:12.668395+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 415, "benchmark_id": "math", "model_id": "claude-3-opus-20240229", "score": 0.601, "normalized_score": 0.601, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.882261+00:00", "updated_at": "2025-07-19T19:56:11.882261+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1290, "benchmark_id": "mgsm", "model_id": "claude-3-opus-20240229", "score": 0.907, "normalized_score": 0.907, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.701952+00:00", "updated_at": 
"2025-07-19T19:56:13.701952+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 103, "benchmark_id": "mmlu", "model_id": "claude-3-opus-20240229", "score": 0.868, "normalized_score": 0.868, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.294591+00:00", "updated_at": "2025-07-19T19:56:11.294591+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 208, "benchmark_id": "mmlu-pro", "model_id": "claude-3-opus-20240229", "score": 0.685, "normalized_score": 0.685, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2406.01574", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.496438+00:00", "updated_at": "2025-07-19T19:56:11.496438+00:00", "benchmark_name": "MMLU-Pro" } ] ================================================ FILE: data/organizations/anthropic/models/claude-3-opus-20240229/model.json ================================================ { "model_id": "claude-3-opus-20240229", "name": "Claude 3 Opus", "organization_id": "anthropic", "fine_tuned_from_model_id": null, "description": "Claude 3 Opus is Anthropic's most intelligent model, with best-in-market performance on highly complex tasks. 
It can navigate open-ended prompts and sight-unseen scenarios with remarkable fluency and human-like understanding, showing the outer limits of what's possible with generative AI.", "release_date": "2024-02-29", "announcement_date": "2024-02-29", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://www.anthropic.com/claude", "source_playground": "https://claude.ai", "source_paper": "https://www-cdn.anthropic.com/de8ba9b01c9ab7cbabf5c33b80b7bbc618857627/Model_Card_Claude_3.pdf", "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-3-family", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.738279+00:00", "updated_at": "2025-07-19T19:49:05.738279+00:00", "model_family_id": null } ================================================ FILE: data/organizations/anthropic/models/claude-3-sonnet-20240229/benchmarks.json ================================================ [ { "model_benchmark_id": 26, "benchmark_id": "arc-c", "model_id": "claude-3-sonnet-20240229", "score": 0.932, "normalized_score": 0.932, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "25-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.136363+00:00", "updated_at": "2025-07-19T19:56:11.136363+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1083, "benchmark_id": "big-bench-hard", "model_id": "claude-3-sonnet-20240229", "score": 0.829, "normalized_score": 0.829, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "3-shot CoT", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.254531+00:00", "updated_at": "2025-07-19T19:56:13.254531+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 957, "benchmark_id": "drop", "model_id": "claude-3-sonnet-20240229", "score": 0.789, "normalized_score": 0.789, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "3-shot, F1 score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.015601+00:00", "updated_at": "2025-07-19T19:56:13.015601+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 330, "benchmark_id": "gpqa", "model_id": "claude-3-sonnet-20240229", "score": 0.404, "normalized_score": 0.404, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "0-shot CoT - Diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.724379+00:00", "updated_at": "2025-07-19T19:56:11.724379+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1007, "benchmark_id": "gsm8k", "model_id": "claude-3-sonnet-20240229", "score": 0.923, "normalized_score": 0.923, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.102758+00:00", "updated_at": "2025-07-19T19:56:13.102758+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 52, "benchmark_id": "hellaswag", "model_id": "claude-3-sonnet-20240229", "score": 0.89, 
"normalized_score": 0.89, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "10-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.193193+00:00", "updated_at": "2025-07-19T19:56:11.193193+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 800, "benchmark_id": "humaneval", "model_id": "claude-3-sonnet-20240229", "score": 0.73, "normalized_score": 0.73, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.670119+00:00", "updated_at": "2025-07-19T19:56:12.670119+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 416, "benchmark_id": "math", "model_id": "claude-3-sonnet-20240229", "score": 0.431, "normalized_score": 0.431, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.884160+00:00", "updated_at": "2025-07-19T19:56:11.884160+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1291, "benchmark_id": "mgsm", "model_id": "claude-3-sonnet-20240229", "score": 0.835, "normalized_score": 0.835, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:13.703593+00:00", "updated_at": "2025-07-19T19:56:13.703593+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 104, "benchmark_id": "mmlu", "model_id": "claude-3-sonnet-20240229", "score": 0.79, "normalized_score": 0.79, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.296409+00:00", "updated_at": "2025-07-19T19:56:11.296409+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 209, "benchmark_id": "mmlu-pro", "model_id": "claude-3-sonnet-20240229", "score": 0.568, "normalized_score": 0.568, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2406.01574", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.498008+00:00", "updated_at": "2025-07-19T19:56:11.498008+00:00", "benchmark_name": "MMLU-Pro" } ] ================================================ FILE: data/organizations/anthropic/models/claude-3-sonnet-20240229/model.json ================================================ { "model_id": "claude-3-sonnet-20240229", "name": "Claude 3 Sonnet", "organization_id": "anthropic", "fine_tuned_from_model_id": null, "description": "Claude 3 Sonnet strikes the ideal balance between intelligence and speed\u2014particularly for enterprise workloads. 
It delivers strong performance at a lower cost compared to its peers, and is engineered for high endurance in large-scale AI deployments.", "release_date": "2024-02-29", "announcement_date": "2024-02-29", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://www.anthropic.com/claude", "source_playground": "https://claude.ai", "source_paper": "https://www-cdn.anthropic.com/de8ba9b01c9ab7cbabf5c33b80b7bbc618857627/Model_Card_Claude_3.pdf", "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-3-family", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.740647+00:00", "updated_at": "2025-07-19T19:49:05.740647+00:00", "model_family_id": null } ================================================ FILE: data/organizations/anthropic/models/claude-haiku-4-5-20251015/benchmarks.json ================================================ [ { "model_benchmark_id": 22228, "benchmark_id": "swe-bench-verified", "model_id": "claude-haiku-4-5-20251015", "score": 0.733, "normalized_score": 0.733, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-15T00:00:00.000000+00:00", "updated_at": "2025-10-15T00:00:00.000000+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 22229, "benchmark_id": "terminal-bench", "model_id": "claude-haiku-4-5-20251015", "score": 0.41, "normalized_score": 0.41, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": 
null, "verification_notes": null, "created_at": "2025-10-15T00:00:00.000000+00:00", "updated_at": "2025-10-15T00:00:00.000000+00:00", "benchmark_name": "Terminal-Bench" }, { "model_benchmark_id": 22230, "benchmark_id": "tau2-retail", "model_id": "claude-haiku-4-5-20251015", "score": 0.832, "normalized_score": 0.832, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-15T00:00:00.000000+00:00", "updated_at": "2025-10-15T00:00:00.000000+00:00", "benchmark_name": "Tau2 Retail" }, { "model_benchmark_id": 22231, "benchmark_id": "tau2-airline", "model_id": "claude-haiku-4-5-20251015", "score": 0.636, "normalized_score": 0.636, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-15T00:00:00.000000+00:00", "updated_at": "2025-10-15T00:00:00.000000+00:00", "benchmark_name": "Tau2 Airline" }, { "model_benchmark_id": 22232, "benchmark_id": "tau2-telecom", "model_id": "claude-haiku-4-5-20251015", "score": 0.83, "normalized_score": 0.83, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-15T00:00:00.000000+00:00", "updated_at": "2025-10-15T00:00:00.000000+00:00", "benchmark_name": "Tau2 Telecom" }, { "model_benchmark_id": 22233, "benchmark_id": "osworld", "model_id": "claude-haiku-4-5-20251015", "score": 0.507, 
"normalized_score": 0.507, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-15T00:00:00.000000+00:00", "updated_at": "2025-10-15T00:00:00.000000+00:00", "benchmark_name": "OSWorld" }, { "model_benchmark_id": 22234, "benchmark_id": "aime-2025", "model_id": "claude-haiku-4-5-20251015", "score": 0.963, "normalized_score": 0.963, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5", "verified_by_llmstats": false, "analysis_method": "python", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-15T00:00:00.000000+00:00", "updated_at": "2025-10-15T00:00:00.000000+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 22235, "benchmark_id": "aime-2025", "model_id": "claude-haiku-4-5-20251015", "score": 0.807, "normalized_score": 0.807, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5", "verified_by_llmstats": false, "analysis_method": "no tools", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-15T00:00:00.000000+00:00", "updated_at": "2025-10-15T00:00:00.000000+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 22236, "benchmark_id": "gpqa", "model_id": "claude-haiku-4-5-20251015", "score": 0.73, "normalized_score": 0.73, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5", "verified_by_llmstats": false, "analysis_method": "Diamond subset", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": 
null, "created_at": "2025-10-15T00:00:00.000000+00:00", "updated_at": "2025-10-15T00:00:00.000000+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 22237, "benchmark_id": "mmmlu", "model_id": "claude-haiku-4-5-20251015", "score": 0.83, "normalized_score": 0.83, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-15T00:00:00.000000+00:00", "updated_at": "2025-10-15T00:00:00.000000+00:00", "benchmark_name": "MMMLU" }, { "model_benchmark_id": 22238, "benchmark_id": "mmmu-(validation)", "model_id": "claude-haiku-4-5-20251015", "score": 0.732, "normalized_score": 0.732, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-15T00:00:00.000000+00:00", "updated_at": "2025-10-15T00:00:00.000000+00:00", "benchmark_name": "MMMU (validation)" }, { "model_benchmark_id": 22239, "benchmark_id": "cybersecurity-ctfs", "model_id": "claude-haiku-4-5-20251015", "score": 0.46875, "normalized_score": 0.46875, "is_self_reported": true, "self_reported_source_link": "https://assets.anthropic.com/m/99128ddd009bdcb/original/Claude-Haiku-4-5-System-Card.pdf", "verified_by_llmstats": false, "analysis_method": "32-challenge subset", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "15/32 challenges solved (pass@30)", "created_at": "2025-10-15T00:00:00.000000+00:00", "updated_at": "2025-10-15T00:00:00.000000+00:00", "benchmark_name": "Cybersecurity CTFs" } ] ================================================ FILE: 
data/organizations/anthropic/models/claude-haiku-4-5-20251015/model.json ================================================ { "model_id": "claude-haiku-4-5-20251015", "name": "Claude Haiku 4.5", "organization_id": "anthropic", "fine_tuned_from_model_id": null, "description": "Claude Haiku 4.5 is Anthropic's fastest, most cost-efficient model, matching Sonnet 4's performance on coding, computer use, and agent tasks. It offers similar performance to Sonnet 4 at one-third the cost and more than twice the speed, making it ideal for high-volume, latency-sensitive applications and multi-agent orchestration.", "release_date": "2025-10-15", "announcement_date": "2025-10-15", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2025-02-01", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.anthropic.com/en/docs/about-claude/models", "source_playground": "https://claude.ai", "source_paper": "https://assets.anthropic.com/m/99128ddd009bdcb/original/Claude-Haiku-4-5-System-Card.pdf", "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-haiku-4-5", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-10-15T00:00:00.000000+00:00", "updated_at": "2025-10-15T00:00:00.000000+00:00", "model_family_id": null } ================================================ FILE: data/organizations/anthropic/models/claude-opus-4-1-20250805/benchmarks.json ================================================ [ { "model_benchmark_id": 2001, "benchmark_id": "swe-bench-verified", "model_id": "claude-opus-4-1-20250805", "score": 0.745, "normalized_score": 0.745, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-opus-4-1", "verified_by_llmstats": false, "analysis_method": "No extended thinking. Simple scaffold with bash tool and file editing tool via string replacements. 
Scores reported out of full 500 problems.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T00:00:00.000000+00:00", "updated_at": "2025-08-05T00:00:00.000000+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 2002, "benchmark_id": "terminal-bench", "model_id": "claude-opus-4-1-20250805", "score": 0.433, "normalized_score": 0.433, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-opus-4-1", "verified_by_llmstats": false, "analysis_method": "No extended thinking. Terminus 1 averaged over 5 trials.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T00:00:00.000000+00:00", "updated_at": "2025-08-05T00:00:00.000000+00:00", "benchmark_name": "Terminal-bench" }, { "model_benchmark_id": 2003, "benchmark_id": "gpqa", "model_id": "claude-opus-4-1-20250805", "score": 0.809, "normalized_score": 0.809, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-opus-4-1", "verified_by_llmstats": false, "analysis_method": "Diamond: Extended thinking (up to 64K tokens)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T00:00:00.000000+00:00", "updated_at": "2025-08-05T00:00:00.000000+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 2004, "benchmark_id": "tau-bench-retail", "model_id": "claude-opus-4-1-20250805", "score": 0.824, "normalized_score": 0.824, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-opus-4-1", "verified_by_llmstats": false, "analysis_method": "Extended thinking with tool use (up to 64K tokens, prompt addendum, increased max steps from 30 to 100).", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-08-05T00:00:00.000000+00:00", "updated_at": "2025-08-05T00:00:00.000000+00:00", "benchmark_name": "TAU-bench Retail" }, { "model_benchmark_id": 2005, "benchmark_id": "tau-bench-airline", "model_id": "claude-opus-4-1-20250805", "score": 0.56, "normalized_score": 0.56, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-opus-4-1", "verified_by_llmstats": false, "analysis_method": "Extended thinking with tool use (up to 64K tokens, prompt addendum, increased max steps from 30 to 100).", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T00:00:00.000000+00:00", "updated_at": "2025-08-05T00:00:00.000000+00:00", "benchmark_name": "TAU-bench Airline" }, { "model_benchmark_id": 2006, "benchmark_id": "mmmlu", "model_id": "claude-opus-4-1-20250805", "score": 0.895, "normalized_score": 0.895, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-opus-4-1", "verified_by_llmstats": false, "analysis_method": "Extended thinking (up to 64K tokens). 
Average over 14 non-English languages.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T00:00:00.000000+00:00", "updated_at": "2025-08-05T00:00:00.000000+00:00", "benchmark_name": "MMMLU" }, { "model_benchmark_id": 2007, "benchmark_id": "mmmu-(validation)", "model_id": "claude-opus-4-1-20250805", "score": 0.771, "normalized_score": 0.771, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-opus-4-1", "verified_by_llmstats": false, "analysis_method": "Extended thinking (up to 64K tokens)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T00:00:00.000000+00:00", "updated_at": "2025-08-05T00:00:00.000000+00:00", "benchmark_name": "MMMU (validation)" }, { "model_benchmark_id": 2008, "benchmark_id": "aime-2025", "model_id": "claude-opus-4-1-20250805", "score": 0.78, "normalized_score": 0.78, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-opus-4-1", "verified_by_llmstats": false, "analysis_method": "Extended thinking (up to 64K tokens). AIME 2025 using nucleus sampling with a top_p of 0.95.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T00:00:00.000000+00:00", "updated_at": "2025-08-05T00:00:00.000000+00:00", "benchmark_name": "AIME 2025" } ] ================================================ FILE: data/organizations/anthropic/models/claude-opus-4-1-20250805/model.json ================================================ { "model_id": "claude-opus-4-1-20250805", "name": "Claude Opus 4.1", "organization_id": "anthropic", "fine_tuned_from_model_id": null, "description": "Claude Opus 4.1 is a hybrid reasoning model that pushes the frontier for coding and AI agents, featuring a 200K context window. 
It delivers superior performance and precision for real-world coding and agentic tasks, handling complex multi-step problems with rigor and attention to detail. With extended thinking capabilities, it offers instant responses or extended step-by-step thinking visible through user-friendly summaries. It advances state-of-the-art coding performance to 74.5% on SWE-bench Verified, excels at agentic search and research, and produces human-quality content with exceptional writing abilities. It supports 32K output tokens and adapts to specific coding styles while delivering exceptional quality for extensive generation and refactoring projects.", "release_date": "2025-08-05", "announcement_date": "2025-08-05", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.anthropic.com/en/docs/about-claude/models/all-models", "source_playground": "https://claude.ai", "source_paper": null, "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-opus-4-1", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-08-05T00:00:00.000000+00:00", "updated_at": "2025-08-05T00:00:00.000000+00:00", "model_family_id": null } ================================================ FILE: data/organizations/anthropic/models/claude-opus-4-20250514/benchmarks.json ================================================ [ { "model_benchmark_id": 702, "benchmark_id": "aime-2025", "model_id": "claude-opus-4-20250514", "score": 0.755, "normalized_score": 0.755, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Extended thinking (up to 64K tokens) with parallel test-time compute (multiple attempts, internal scoring model selection). Nucleus sampling (top_p 0.95). 
Based on footnotes 4, 5 and blog appendix.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.468994+00:00", "updated_at": "2025-07-19T19:56:12.468994+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1388, "benchmark_id": "arc-agi-v2", "model_id": "claude-opus-4-20250514", "score": 0.086, "normalized_score": 0.086, "is_self_reported": false, "self_reported_source_link": "https://x.com/xai/status/1943158495588815072", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.923803+00:00", "updated_at": "2025-07-19T19:56:13.923803+00:00", "benchmark_name": "ARC-AGI v2" }, { "model_benchmark_id": 337, "benchmark_id": "gpqa", "model_id": "claude-opus-4-20250514", "score": 0.796, "normalized_score": 0.796, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Diamond: Extended thinking (up to 64K tokens) with parallel test-time compute (multiple attempts, internal scoring model selection). Based on footnote 5 and blog appendix.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.734764+00:00", "updated_at": "2025-07-19T19:56:11.734764+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1480, "benchmark_id": "mmmlu", "model_id": "claude-opus-4-20250514", "score": 0.888, "normalized_score": 0.888, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Extended thinking (up to 64K tokens). Average over 14 non-English languages. 
Based on blog appendix and footnote 3.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.155829+00:00", "updated_at": "2025-07-19T19:56:14.155829+00:00", "benchmark_name": "MMMLU" }, { "model_benchmark_id": 1815, "benchmark_id": "mmmu-(validation)", "model_id": "claude-opus-4-20250514", "score": 0.765, "normalized_score": 0.765, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Extended thinking (up to 64K tokens). Based on blog appendix.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.120938+00:00", "updated_at": "2025-07-19T19:56:15.120938+00:00", "benchmark_name": "MMMU (validation)" }, { "model_benchmark_id": 1351, "benchmark_id": "swe-bench-verified", "model_id": "claude-opus-4-20250514", "score": 0.725, "normalized_score": 0.725, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Parallel test-time compute (multiple attempts, internal scoring model selection). No extended thinking. 
Based on footnote 5 and SWE-bench methodology for high compute.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.843719+00:00", "updated_at": "2025-07-19T19:56:13.843719+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1775, "benchmark_id": "tau-bench-airline", "model_id": "claude-opus-4-20250514", "score": 0.596, "normalized_score": 0.596, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Extended thinking with tool use (up to 64K tokens, prompt addendum, increased max steps). Based on blog appendix and TAU-bench methodology.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.005622+00:00", "updated_at": "2025-07-19T19:56:15.005622+00:00", "benchmark_name": "TAU-bench Airline" }, { "model_benchmark_id": 1761, "benchmark_id": "tau-bench-retail", "model_id": "claude-opus-4-20250514", "score": 0.814, "normalized_score": 0.814, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Extended thinking with tool use (up to 64K tokens, prompt addendum, increased max steps). 
Based on blog appendix and TAU-bench methodology.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.977090+00:00", "updated_at": "2025-07-19T19:56:14.977090+00:00", "benchmark_name": "TAU-bench Retail" }, { "model_benchmark_id": 655, "benchmark_id": "terminal-bench", "model_id": "claude-opus-4-20250514", "score": 0.392, "normalized_score": 0.392, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Parallel test-time compute (multiple attempts, internal scoring model selection). No extended thinking. Claude Code as agent framework. Based on footnotes 2 and 5.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.354970+00:00", "updated_at": "2025-07-19T19:56:12.354970+00:00", "benchmark_name": "Terminal-bench" } ] ================================================ FILE: data/organizations/anthropic/models/claude-opus-4-20250514/model.json ================================================ { "model_id": "claude-opus-4-20250514", "name": "Claude Opus 4", "organization_id": "anthropic", "fine_tuned_from_model_id": null, "description": "Claude Opus 4 is Anthropic's most powerful model and the world's best coding model, part of the Claude 4 family. It delivers sustained performance on complex, long-running tasks and agent workflows. Opus 4 excels at coding, advanced reasoning, and can use tools (like web search) during extended thinking. 
It supports parallel tool execution and has improved memory capabilities.", "release_date": "2025-05-22", "announcement_date": "2025-05-22", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.anthropic.com/en/docs/about-claude/models/all-models", "source_playground": "https://claude.ai", "source_paper": null, "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-4", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.760983+00:00", "updated_at": "2025-07-19T19:49:05.760983+00:00", "model_family_id": null } ================================================ FILE: data/organizations/anthropic/models/claude-sonnet-4-20250514/benchmarks.json ================================================ [ { "model_benchmark_id": 701, "benchmark_id": "aime-2025", "model_id": "claude-sonnet-4-20250514", "score": 0.705, "normalized_score": 0.705, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Extended thinking (up to 64K tokens) with parallel test-time compute (multiple attempts, internal scoring model selection). Nucleus sampling (top_p 0.95). 
Based on footnotes 4, 5 and blog appendix.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.466833+00:00", "updated_at": "2025-07-19T19:56:12.466833+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 333, "benchmark_id": "gpqa", "model_id": "claude-sonnet-4-20250514", "score": 0.754, "normalized_score": 0.754, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Diamond: Extended thinking (up to 64K tokens) with parallel test-time compute (multiple attempts, internal scoring model selection). Based on footnote 5 and blog appendix.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.728759+00:00", "updated_at": "2025-07-19T19:56:11.728759+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1479, "benchmark_id": "mmmlu", "model_id": "claude-sonnet-4-20250514", "score": 0.865, "normalized_score": 0.865, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Extended thinking (up to 64K tokens). Average over 14 non-English languages. Based on blog appendix and footnote 3.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.154357+00:00", "updated_at": "2025-07-19T19:56:14.154357+00:00", "benchmark_name": "MMMLU" }, { "model_benchmark_id": 583, "benchmark_id": "mmmu", "model_id": "claude-sonnet-4-20250514", "score": 0.744, "normalized_score": 0.744, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Extended thinking (up to 64K tokens). 
Based on blog appendix.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.199608+00:00", "updated_at": "2025-07-19T19:56:12.199608+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1349, "benchmark_id": "swe-bench-verified", "model_id": "claude-sonnet-4-20250514", "score": 0.727, "normalized_score": 0.727, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Parallel test-time compute (multiple attempts, internal scoring model selection). No extended thinking. Based on footnote 5 and SWE-bench methodology for high compute.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.840540+00:00", "updated_at": "2025-07-19T19:56:13.840540+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1773, "benchmark_id": "tau-bench-airline", "model_id": "claude-sonnet-4-20250514", "score": 0.6, "normalized_score": 0.6, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Extended thinking with tool use (up to 64K tokens, prompt addendum, increased max steps). 
Based on blog appendix and TAU-bench methodology.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.002282+00:00", "updated_at": "2025-07-19T19:56:15.002282+00:00", "benchmark_name": "TAU-bench Airline" }, { "model_benchmark_id": 1759, "benchmark_id": "tau-bench-retail", "model_id": "claude-sonnet-4-20250514", "score": 0.805, "normalized_score": 0.805, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Extended thinking with tool use (up to 64K tokens, prompt addendum, increased max steps). Based on blog appendix and TAU-bench methodology.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.973668+00:00", "updated_at": "2025-07-19T19:56:14.973668+00:00", "benchmark_name": "TAU-bench Retail" }, { "model_benchmark_id": 654, "benchmark_id": "terminal-bench", "model_id": "claude-sonnet-4-20250514", "score": 0.355, "normalized_score": 0.355, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-4", "verified_by_llmstats": false, "analysis_method": "Parallel test-time compute (multiple attempts, internal scoring model selection). No extended thinking. Claude Code as agent framework. 
Based on footnotes 2 and 5.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.353338+00:00", "updated_at": "2025-07-19T19:56:12.353338+00:00", "benchmark_name": "Terminal-bench" } ] ================================================ FILE: data/organizations/anthropic/models/claude-sonnet-4-20250514/model.json ================================================ { "model_id": "claude-sonnet-4-20250514", "name": "Claude Sonnet 4", "organization_id": "anthropic", "fine_tuned_from_model_id": null, "description": "Claude Sonnet 4, part of the Claude 4 family, is a significant upgrade to Claude Sonnet 3.7. It excels in coding (72.7% on SWE-bench) and reasoning, responding more precisely to instructions. Sonnet 4 offers an optimal mix of capability and practicality, with enhanced steerability, and supports extended thinking with tool use.", "release_date": "2025-05-22", "announcement_date": "2025-05-22", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.anthropic.com/en/docs/about-claude/models/all-models", "source_playground": "https://claude.ai", "source_paper": null, "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-4", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.750182+00:00", "updated_at": "2025-07-19T19:49:05.750182+00:00", "model_family_id": null } ================================================ FILE: data/organizations/anthropic/models/claude-sonnet-4-5-20250929/benchmarks.json ================================================ [ { "model_benchmark_id": 701, "benchmark_id": "swe-bench-verified-(agentic-coding)", "model_id": "claude-sonnet-4-5-20250929", "score": 0.772, "normalized_score": 0.772, "is_self_reported": true, "self_reported_source_link": 
"https://www.anthropic.com/news/claude-sonnet-4-5", "verified_by_llmstats": false, "analysis_method": "Agentic coding", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T19:56:12.466833+00:00", "updated_at": "2025-09-29T19:56:12.466833+00:00", "benchmark_name": "SWE-bench Verified (Agentic Coding)" }, { "model_benchmark_id": 702, "benchmark_id": "terminal-bench", "model_id": "claude-sonnet-4-5-20250929", "score": 0.5, "normalized_score": 0.5, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5", "verified_by_llmstats": false, "analysis_method": "Agentic terminal coding", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T19:56:12.466833+00:00", "updated_at": "2025-09-29T19:56:12.466833+00:00", "benchmark_name": "Terminal-Bench" }, { "model_benchmark_id": 703, "benchmark_id": "osworld", "model_id": "claude-sonnet-4-5-20250929", "score": 0.614, "normalized_score": 0.614, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5", "verified_by_llmstats": false, "analysis_method": "Computer use", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T19:56:12.466833+00:00", "updated_at": "2025-09-29T19:56:12.466833+00:00", "benchmark_name": "OSWorld" }, { "model_benchmark_id": 704, "benchmark_id": "aime-2025", "model_id": "claude-sonnet-4-5-20250929", "score": 0.87, "normalized_score": 0.87, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5", "verified_by_llmstats": false, "analysis_method": "High school math competition", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, 
"created_at": "2025-09-29T19:56:12.466833+00:00", "updated_at": "2025-09-29T19:56:12.466833+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 705, "benchmark_id": "gpqa", "model_id": "claude-sonnet-4-5-20250929", "score": 0.834, "normalized_score": 0.834, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5", "verified_by_llmstats": false, "analysis_method": "Graduate-level reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T19:56:12.466833+00:00", "updated_at": "2025-09-29T19:56:12.466833+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 706, "benchmark_id": "mmmlu", "model_id": "claude-sonnet-4-5-20250929", "score": 0.891, "normalized_score": 0.891, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5", "verified_by_llmstats": false, "analysis_method": "Multilingual Q&A", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T19:56:12.466833+00:00", "updated_at": "2025-09-29T19:56:12.466833+00:00", "benchmark_name": "MMMLU" }, { "model_benchmark_id": 707, "benchmark_id": "tau-bench-retail", "model_id": "claude-sonnet-4-5-20250929", "score": 0.862, "normalized_score": 0.862, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5", "verified_by_llmstats": false, "analysis_method": "Agentic tool use", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T19:56:12.466833+00:00", "updated_at": "2025-09-29T19:56:12.466833+00:00", "benchmark_name": "TAU-bench Retail" }, { "model_benchmark_id": 708, "benchmark_id": "tau-bench-airline", "model_id": "claude-sonnet-4-5-20250929", "score": 0.7, "normalized_score": 
0.7, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5", "verified_by_llmstats": false, "analysis_method": "Agentic tool use", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T19:56:12.466833+00:00", "updated_at": "2025-09-29T19:56:12.466833+00:00", "benchmark_name": "TAU-bench Airline" }, { "model_benchmark_id": 710, "benchmark_id": "mmmuval", "model_id": "claude-sonnet-4-5-20250929", "score": 0.778, "normalized_score": 0.778, "is_self_reported": true, "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5", "verified_by_llmstats": false, "analysis_method": "Visual reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T19:56:12.466833+00:00", "updated_at": "2025-09-29T19:56:12.466833+00:00", "benchmark_name": "MMMUval" } ] ================================================ FILE: data/organizations/anthropic/models/claude-sonnet-4-5-20250929/model.json ================================================ { "model_id": "claude-sonnet-4-5-20250929", "name": "Claude Sonnet 4.5", "organization_id": "anthropic", "fine_tuned_from_model_id": null, "description": "Claude Sonnet 4.5 is the best coding model in the world. It's the strongest model for building complex agents. It’s the best model at using computers. And it shows substantial gains in reasoning and math. 
Highest intelligence across most tasks with exceptional agent and coding capabilities.", "release_date": "2025-09-29", "announcement_date": "2025-09-29", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2025-01-31", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.anthropic.com/en/docs/about-claude/models/all-models", "source_playground": "https://claude.ai", "source_paper": null, "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-sonnet-4-5", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.750182+00:00", "updated_at": "2025-07-19T19:49:05.750182+00:00", "model_family_id": null } ================================================ FILE: data/organizations/anthropic/organization.json ================================================ { "organization_id": "anthropic", "name": "Anthropic", "website": "https://anthropic.com", "description": "AI safety company", "country": "US", "created_at": "2025-07-19T19:49:05.736520+00:00", "updated_at": "2025-07-19T19:49:05.736520+00:00" } ================================================ FILE: data/organizations/cohere/models/command-r-plus-04-2024/benchmarks.json ================================================ [ { "model_benchmark_id": 1, "benchmark_id": "arc-c", "model_id": "command-r-plus-04-2024", "score": 0.7099, "normalized_score": 0.7099, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus", "verified_by_llmstats": false, "analysis_method": "Standardized Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.062949+00:00", "updated_at": "2025-07-19T19:56:11.062949+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 157, "benchmark_id": "gsm8k", "model_id": "command-r-plus-04-2024", "score": 0.707, 
"normalized_score": 0.707, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus", "verified_by_llmstats": false, "analysis_method": "Standardized Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.401017+00:00", "updated_at": "2025-07-19T19:56:11.401017+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 32, "benchmark_id": "hellaswag", "model_id": "command-r-plus-04-2024", "score": 0.886, "normalized_score": 0.886, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus", "verified_by_llmstats": false, "analysis_method": "Standardized Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.149067+00:00", "updated_at": "2025-07-19T19:56:11.149067+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 56, "benchmark_id": "mmlu", "model_id": "command-r-plus-04-2024", "score": 0.757, "normalized_score": 0.757, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus", "verified_by_llmstats": false, "analysis_method": "Standardized Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.202939+00:00", "updated_at": "2025-07-19T19:56:11.202939+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 131, "benchmark_id": "truthfulqa", "model_id": "command-r-plus-04-2024", "score": 0.563, "normalized_score": 0.563, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus", "verified_by_llmstats": false, "analysis_method": "Standardized Evaluation", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.341733+00:00", "updated_at": "2025-07-19T19:56:11.341733+00:00", "benchmark_name": "TruthfulQA" }, { "model_benchmark_id": 147, "benchmark_id": "winogrande", "model_id": "command-r-plus-04-2024", "score": 0.854, "normalized_score": 0.854, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus", "verified_by_llmstats": false, "analysis_method": "Standardized Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.378573+00:00", "updated_at": "2025-07-19T19:56:11.378573+00:00", "benchmark_name": "Winogrande" } ] ================================================ FILE: data/organizations/cohere/models/command-r-plus-04-2024/model.json ================================================ { "model_id": "command-r-plus-04-2024", "name": "Command R+", "organization_id": "cohere", "fine_tuned_from_model_id": null, "description": "C4AI Command R+ is a 104 billion parameter model with advanced capabilities, including Retrieval Augmented Generation (RAG) and multi-step tool use, optimized for multilingual tasks.", "release_date": "2024-08-30", "announcement_date": "2024-08-30", "license_id": "cc_by_nc", "multimodal": false, "knowledge_cutoff": null, "param_count": 104000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.cohere.com/v2/docs/command-r-plus", "source_playground": "https://huggingface.co/CohereForAI/c4ai-command-r-plus", "source_paper": null, "source_scorecard_blog_link": null, "source_repo_link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus", "source_weights_link": "", "created_at": "2025-07-19T19:49:05.415748+00:00", "updated_at": "2025-07-19T19:49:05.415748+00:00", "model_family_id": null } 
================================================ FILE: data/organizations/cohere/organization.json ================================================ { "organization_id": "cohere", "name": "Cohere", "website": "https://cohere.ai", "description": "Enterprise AI company", "country": "CA", "created_at": "2025-07-19T19:49:05.404836+00:00", "updated_at": "2025-07-19T19:49:05.404836+00:00" } ================================================ FILE: data/organizations/deepseek/models/deepseek-r1/benchmarks.json ================================================ [] ================================================ FILE: data/organizations/deepseek/models/deepseek-r1/model.json ================================================ { "model_id": "deepseek-r1", "name": "DeepSeek-R1", "organization_id": "deepseek", "model_family_id": null, "fine_tuned_from_model_id": null, "description": "DeepSeek-R1 is a reasoning-focused language model from DeepSeek that features advanced thinking capabilities. It serves as the foundation for DeepSeek's reasoning model family and pioneered their thinking mode approach for complex problem-solving tasks.", "release_date": "2025-01-20", "announcement_date": "2025-01-20", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 671000000000, "training_tokens": null, "available_in_zeroeval": false, "source_api_ref": "https://api.deepseek.com/docs", "source_playground": "https://chat.deepseek.com/", "source_paper": null, "source_scorecard_blog_link": "https://www.deepseek.com/", "source_repo_link": null, "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1", "created_at": "2025-01-20T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00" } ================================================ FILE: data/organizations/deepseek/models/deepseek-r1-0528/benchmarks.json ================================================ [ { "model_benchmark_id": 9601, "benchmark_id": "mmlu-redux", "model_id": 
"deepseek-r1-0528", "score": 0.934, "normalized_score": 0.934, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-05-28T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MMLU-Redux" }, { "model_benchmark_id": 9602, "benchmark_id": "mmlu-pro", "model_id": "deepseek-r1-0528", "score": 0.85, "normalized_score": 0.85, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-05-28T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 9603, "benchmark_id": "gpqa", "model_id": "deepseek-r1-0528", "score": 0.81, "normalized_score": 0.81, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Pass@1, Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-05-28T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 9604, "benchmark_id": "humanity's-last-exam", "model_id": "deepseek-r1-0528", "score": 0.177, "normalized_score": 0.177, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Pass@1, Thinking mode, text-only subset", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": "Text-only subset evaluation", "created_at": "2025-05-28T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 9605, "benchmark_id": "browsecomp", "model_id": "deepseek-r1-0528", "score": 0.089, "normalized_score": 0.089, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Search agent with pre-defined workflow", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Evaluated with pre-defined workflow", "created_at": "2025-05-28T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "BrowseComp" }, { "model_benchmark_id": 9606, "benchmark_id": "browsecomp-zh", "model_id": "deepseek-r1-0528", "score": 0.357, "normalized_score": 0.357, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Search agent with pre-defined workflow", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Evaluated with pre-defined workflow", "created_at": "2025-05-28T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "BrowseComp-zh" }, { "model_benchmark_id": 9607, "benchmark_id": "simpleqa", "model_id": "deepseek-r1-0528", "score": 0.923, "normalized_score": 0.923, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Search agent evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-05-28T00:00:00.000000+00:00", 
"updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 9608, "benchmark_id": "livecodebench", "model_id": "deepseek-r1-0528", "score": 0.733, "normalized_score": 0.733, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Pass@1, 2408-2505, Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-05-28T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 9609, "benchmark_id": "codeforces", "model_id": "deepseek-r1-0528", "score": 0.6433, "normalized_score": 0.6433, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Div1 Rating, Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-05-28T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Codeforces" }, { "model_benchmark_id": 9610, "benchmark_id": "aider-polyglot", "model_id": "deepseek-r1-0528", "score": 0.716, "normalized_score": 0.716, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-05-28T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 9611, "benchmark_id": "swe-bench-verified", "model_id": "deepseek-r1-0528", "score": 0.446, "normalized_score": 0.446, "is_self_reported": true, 
"self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Agent mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Evaluated with internal code agent framework", "created_at": "2025-05-28T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 9612, "benchmark_id": "swe-bench-multilingual", "model_id": "deepseek-r1-0528", "score": 0.305, "normalized_score": 0.305, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Agent mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Evaluated with internal code agent framework", "created_at": "2025-05-28T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "SWE-Bench Multilingual" }, { "model_benchmark_id": 9613, "benchmark_id": "terminal-bench", "model_id": "deepseek-r1-0528", "score": 0.057, "normalized_score": 0.057, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Terminus 1 framework", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-05-28T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Terminal-Bench" }, { "model_benchmark_id": 9614, "benchmark_id": "aime-2024", "model_id": "deepseek-r1-0528", "score": 0.914, "normalized_score": 0.914, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Pass@1, Thinking mode", "verification_provider_id": 
null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-05-28T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 9615, "benchmark_id": "aime-2025", "model_id": "deepseek-r1-0528", "score": 0.875, "normalized_score": 0.875, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Pass@1, Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-05-28T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 9616, "benchmark_id": "hmmt-2025", "model_id": "deepseek-r1-0528", "score": 0.794, "normalized_score": 0.794, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Pass@1, Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-05-28T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "HMMT 2025" } ] ================================================ FILE: data/organizations/deepseek/models/deepseek-r1-0528/model.json ================================================ { "model_id": "deepseek-r1-0528", "name": "DeepSeek-R1-0528", "organization_id": "deepseek", "model_family_id": null, "fine_tuned_from_model_id": "deepseek-r1", "description": "DeepSeek-R1-0528 is the May 28, 2025 version of DeepSeek's reasoning model. It features advanced thinking capabilities and serves as a benchmark comparison for newer models like DeepSeek-V3.1. 
This model excels in complex reasoning tasks, mathematical problem-solving, and code generation through its thinking mode approach.", "release_date": "2025-05-28", "announcement_date": "2025-05-28", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 671000000000, "training_tokens": null, "available_in_zeroeval": false, "source_api_ref": "https://api.deepseek.com/docs", "source_playground": "https://chat.deepseek.com/", "source_paper": null, "source_scorecard_blog_link": "https://www.deepseek.com/", "source_repo_link": null, "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1", "created_at": "2025-05-28T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00" } ================================================ FILE: data/organizations/deepseek/models/deepseek-r1-distill-llama-70b/benchmarks.json ================================================ [ { "model_benchmark_id": 467, "benchmark_id": "aime-2024", "model_id": "deepseek-r1-distill-llama-70b", "score": 0.867, "normalized_score": 0.867, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B", "verified_by_llmstats": false, "analysis_method": "Cons@64", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.987242+00:00", "updated_at": "2025-07-19T19:56:11.989505+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 315, "benchmark_id": "gpqa", "model_id": "deepseek-r1-distill-llama-70b", "score": 0.652, "normalized_score": 0.652, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B", "verified_by_llmstats": false, "analysis_method": "Diamond, Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:11.700874+00:00", "updated_at": "2025-07-19T19:56:11.700874+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1135, "benchmark_id": "livecodebench", "model_id": "deepseek-r1-distill-llama-70b", "score": 0.575, "normalized_score": 0.575, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.386337+00:00", "updated_at": "2025-07-19T19:56:13.386337+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 503, "benchmark_id": "math-500", "model_id": "deepseek-r1-distill-llama-70b", "score": 0.945, "normalized_score": 0.945, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.048302+00:00", "updated_at": "2025-07-19T19:56:12.048302+00:00", "benchmark_name": "MATH-500" } ] ================================================ FILE: data/organizations/deepseek/models/deepseek-r1-distill-llama-70b/model.json ================================================ { "model_id": "deepseek-r1-distill-llama-70b", "name": "DeepSeek R1 Distill Llama 70B", "organization_id": "deepseek", "fine_tuned_from_model_id": null, "description": "DeepSeek-R1 is the first-generation reasoning model built atop DeepSeek-V3 (671B total parameters, 37B activated per token). 
It incorporates large-scale reinforcement learning (RL) to enhance its chain-of-thought and reasoning capabilities, delivering strong performance in math, code, and multi-step reasoning tasks.", "release_date": "2025-01-20", "announcement_date": "2025-01-20", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 70600000000, "training_tokens": 14800000000000, "available_in_zeroeval": true, "source_api_ref": "https://api-docs.deepseek.com/news/news250120", "source_playground": "https://chat.deepseek.com", "source_paper": "https://arxiv.org/pdf/2501.12948", "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-R1", "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B", "created_at": "2025-07-19T19:49:05.685839+00:00", "updated_at": "2025-07-19T19:49:05.685839+00:00", "model_family_id": null } ================================================ FILE: data/organizations/deepseek/models/deepseek-r1-distill-llama-8b/benchmarks.json ================================================ [ { "model_benchmark_id": 465, "benchmark_id": "aime-2024", "model_id": "deepseek-r1-distill-llama-8b", "score": 0.8, "normalized_score": 0.8, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "verified_by_llmstats": false, "analysis_method": "Cons@64", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.984093+00:00", "updated_at": "2025-07-19T19:56:11.985582+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 314, "benchmark_id": "gpqa", "model_id": "deepseek-r1-distill-llama-8b", "score": 0.49, "normalized_score": 0.49, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "verified_by_llmstats": false, "analysis_method": "Diamond, 
Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.699365+00:00", "updated_at": "2025-07-19T19:56:11.699365+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1134, "benchmark_id": "livecodebench", "model_id": "deepseek-r1-distill-llama-8b", "score": 0.396, "normalized_score": 0.396, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.384499+00:00", "updated_at": "2025-07-19T19:56:13.384499+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 502, "benchmark_id": "math-500", "model_id": "deepseek-r1-distill-llama-8b", "score": 0.891, "normalized_score": 0.891, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.046427+00:00", "updated_at": "2025-07-19T19:56:12.046427+00:00", "benchmark_name": "MATH-500" } ] ================================================ FILE: data/organizations/deepseek/models/deepseek-r1-distill-llama-8b/model.json ================================================ { "model_id": "deepseek-r1-distill-llama-8b", "name": "DeepSeek R1 Distill Llama 8B", "organization_id": "deepseek", "fine_tuned_from_model_id": null, "description": "DeepSeek-R1 is the first-generation reasoning model built atop DeepSeek-V3 (671B total parameters, 37B activated per token). 
It incorporates large-scale reinforcement learning (RL) to enhance its chain-of-thought and reasoning capabilities, delivering strong performance in math, code, and multi-step reasoning tasks.", "release_date": "2025-01-20", "announcement_date": "2025-01-20", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 8030000000, "training_tokens": 14800000000000, "available_in_zeroeval": true, "source_api_ref": "https://api-docs.deepseek.com/news/news250120", "source_playground": "https://chat.deepseek.com", "source_paper": "https://arxiv.org/pdf/2501.12948", "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-R1", "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "created_at": "2025-07-19T19:49:05.683265+00:00", "updated_at": "2025-07-19T19:49:05.683265+00:00", "model_family_id": null } ================================================ FILE: data/organizations/deepseek/models/deepseek-r1-distill-qwen-1.5b/benchmarks.json ================================================ [ { "model_benchmark_id": 461, "benchmark_id": "aime-2024", "model_id": "deepseek-r1-distill-qwen-1.5b", "score": 0.527, "normalized_score": 0.527, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "verified_by_llmstats": false, "analysis_method": "Cons@64", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.976978+00:00", "updated_at": "2025-07-19T19:56:11.978475+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 311, "benchmark_id": "gpqa", "model_id": "deepseek-r1-distill-qwen-1.5b", "score": 0.338, "normalized_score": 0.338, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "verified_by_llmstats": false, "analysis_method": 
"Diamond, Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.694071+00:00", "updated_at": "2025-07-19T19:56:11.694071+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1130, "benchmark_id": "livecodebench", "model_id": "deepseek-r1-distill-qwen-1.5b", "score": 0.169, "normalized_score": 0.169, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.362673+00:00", "updated_at": "2025-07-19T19:56:13.362673+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 499, "benchmark_id": "math-500", "model_id": "deepseek-r1-distill-qwen-1.5b", "score": 0.839, "normalized_score": 0.839, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.041592+00:00", "updated_at": "2025-07-19T19:56:12.041592+00:00", "benchmark_name": "MATH-500" } ] ================================================ FILE: data/organizations/deepseek/models/deepseek-r1-distill-qwen-1.5b/model.json ================================================ { "model_id": "deepseek-r1-distill-qwen-1.5b", "name": "DeepSeek R1 Distill Qwen 1.5B", "organization_id": "deepseek", "fine_tuned_from_model_id": null, "description": "DeepSeek-R1 is the first-generation reasoning model built atop DeepSeek-V3 (671B total parameters, 37B activated per token). 
It incorporates large-scale reinforcement learning (RL) to enhance its chain-of-thought and reasoning capabilities, delivering strong performance in math, code, and multi-step reasoning tasks.", "release_date": "2025-01-20", "announcement_date": "2025-01-20", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 1780000000, "training_tokens": 14800000000000, "available_in_zeroeval": true, "source_api_ref": "https://api-docs.deepseek.com/news/news250120", "source_playground": "https://chat.deepseek.com", "source_paper": "https://arxiv.org/pdf/2501.12948", "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-R1", "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "created_at": "2025-07-19T19:49:05.672853+00:00", "updated_at": "2025-07-19T19:49:05.672853+00:00", "model_family_id": null } ================================================ FILE: data/organizations/deepseek/models/deepseek-r1-distill-qwen-14b/benchmarks.json ================================================ [ { "model_benchmark_id": 469, "benchmark_id": "aime-2024", "model_id": "deepseek-r1-distill-qwen-14b", "score": 0.8, "normalized_score": 0.8, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", "verified_by_llmstats": false, "analysis_method": "Cons@64", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.991646+00:00", "updated_at": "2025-07-19T19:56:11.993518+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 316, "benchmark_id": "gpqa", "model_id": "deepseek-r1-distill-qwen-14b", "score": 0.591, "normalized_score": 0.591, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", "verified_by_llmstats": false, "analysis_method": "Diamond, 
Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.702334+00:00", "updated_at": "2025-07-19T19:56:11.702334+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1136, "benchmark_id": "livecodebench", "model_id": "deepseek-r1-distill-qwen-14b", "score": 0.531, "normalized_score": 0.531, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.387993+00:00", "updated_at": "2025-07-19T19:56:13.387993+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 504, "benchmark_id": "math-500", "model_id": "deepseek-r1-distill-qwen-14b", "score": 0.939, "normalized_score": 0.939, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.050287+00:00", "updated_at": "2025-07-19T19:56:12.050287+00:00", "benchmark_name": "MATH-500" } ] ================================================ FILE: data/organizations/deepseek/models/deepseek-r1-distill-qwen-14b/model.json ================================================ { "model_id": "deepseek-r1-distill-qwen-14b", "name": "DeepSeek R1 Distill Qwen 14B", "organization_id": "deepseek", "fine_tuned_from_model_id": null, "description": "DeepSeek-R1 is the first-generation reasoning model built atop DeepSeek-V3 (671B total parameters, 37B activated per token). 
It incorporates large-scale reinforcement learning (RL) to enhance its chain-of-thought and reasoning capabilities, delivering strong performance in math, code, and multi-step reasoning tasks.", "release_date": "2025-01-20", "announcement_date": "2025-01-20", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 14800000000, "training_tokens": 14800000000000, "available_in_zeroeval": true, "source_api_ref": "https://api-docs.deepseek.com/news/news250120", "source_playground": "https://chat.deepseek.com", "source_paper": "https://arxiv.org/pdf/2501.12948", "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-R1", "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", "created_at": "2025-07-19T19:49:05.688267+00:00", "updated_at": "2025-07-19T19:49:05.688267+00:00", "model_family_id": null } ================================================ FILE: data/organizations/deepseek/models/deepseek-r1-distill-qwen-32b/benchmarks.json ================================================ [ { "model_benchmark_id": 471, "benchmark_id": "aime-2024", "model_id": "deepseek-r1-distill-qwen-32b", "score": 0.833, "normalized_score": 0.833, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "verified_by_llmstats": false, "analysis_method": "Cons@64", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.995645+00:00", "updated_at": "2025-07-19T19:56:11.997517+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 317, "benchmark_id": "gpqa", "model_id": "deepseek-r1-distill-qwen-32b", "score": 0.621, "normalized_score": 0.621, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "verified_by_llmstats": false, "analysis_method": "Diamond, 
Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.703902+00:00", "updated_at": "2025-07-19T19:56:11.703902+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1137, "benchmark_id": "livecodebench", "model_id": "deepseek-r1-distill-qwen-32b", "score": 0.572, "normalized_score": 0.572, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.389729+00:00", "updated_at": "2025-07-19T19:56:13.389729+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 505, "benchmark_id": "math-500", "model_id": "deepseek-r1-distill-qwen-32b", "score": 0.943, "normalized_score": 0.943, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.051744+00:00", "updated_at": "2025-07-19T19:56:12.051744+00:00", "benchmark_name": "MATH-500" } ] ================================================ FILE: data/organizations/deepseek/models/deepseek-r1-distill-qwen-32b/model.json ================================================ { "model_id": "deepseek-r1-distill-qwen-32b", "name": "DeepSeek R1 Distill Qwen 32B", "organization_id": "deepseek", "fine_tuned_from_model_id": null, "description": "DeepSeek-R1 is the first-generation reasoning model built atop DeepSeek-V3 (671B total parameters, 37B activated per token). 
It incorporates large-scale reinforcement learning (RL) to enhance its chain-of-thought and reasoning capabilities, delivering strong performance in math, code, and multi-step reasoning tasks.", "release_date": "2025-01-20", "announcement_date": "2025-01-20", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 32800000000, "training_tokens": 14800000000000, "available_in_zeroeval": true, "source_api_ref": "https://api-docs.deepseek.com/news/news250120", "source_playground": "https://chat.deepseek.com", "source_paper": "https://arxiv.org/pdf/2501.12948", "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-R1", "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "created_at": "2025-07-19T19:49:05.690560+00:00", "updated_at": "2025-07-19T19:49:05.690560+00:00", "model_family_id": null } ================================================ FILE: data/organizations/deepseek/models/deepseek-r1-distill-qwen-7b/benchmarks.json ================================================ [ { "model_benchmark_id": 459, "benchmark_id": "aime-2024", "model_id": "deepseek-r1-distill-qwen-7b", "score": 0.833, "normalized_score": 0.833, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", "verified_by_llmstats": false, "analysis_method": "Cons@64", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.973870+00:00", "updated_at": "2025-07-19T19:56:11.975371+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 310, "benchmark_id": "gpqa", "model_id": "deepseek-r1-distill-qwen-7b", "score": 0.491, "normalized_score": 0.491, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", "verified_by_llmstats": false, "analysis_method": "Diamond, 
Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.692702+00:00", "updated_at": "2025-07-19T19:56:11.692702+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1129, "benchmark_id": "livecodebench", "model_id": "deepseek-r1-distill-qwen-7b", "score": 0.376, "normalized_score": 0.376, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.360567+00:00", "updated_at": "2025-07-19T19:56:13.360567+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 498, "benchmark_id": "math-500", "model_id": "deepseek-r1-distill-qwen-7b", "score": 0.928, "normalized_score": 0.928, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.039853+00:00", "updated_at": "2025-07-19T19:56:12.039853+00:00", "benchmark_name": "MATH-500" } ] ================================================ FILE: data/organizations/deepseek/models/deepseek-r1-distill-qwen-7b/model.json ================================================ { "model_id": "deepseek-r1-distill-qwen-7b", "name": "DeepSeek R1 Distill Qwen 7B", "organization_id": "deepseek", "fine_tuned_from_model_id": null, "description": "DeepSeek-R1 is the first-generation reasoning model built atop DeepSeek-V3 (671B total parameters, 37B activated per token). 
It incorporates large-scale reinforcement learning (RL) to enhance its chain-of-thought and reasoning capabilities, delivering strong performance in math, code, and multi-step reasoning tasks.", "release_date": "2025-01-20", "announcement_date": "2025-01-20", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 7620000000, "training_tokens": 14800000000000, "available_in_zeroeval": true, "source_api_ref": "https://api-docs.deepseek.com/news/news250120", "source_playground": "https://chat.deepseek.com", "source_paper": "https://arxiv.org/pdf/2501.12948", "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-R1", "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", "created_at": "2025-07-19T19:49:05.669926+00:00", "updated_at": "2025-07-19T19:49:05.669926+00:00", "model_family_id": null } ================================================ FILE: data/organizations/deepseek/models/deepseek-r1-zero/benchmarks.json ================================================ [ { "model_benchmark_id": 457, "benchmark_id": "aime-2024", "model_id": "deepseek-r1-zero", "score": 0.867, "normalized_score": 0.867, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2501.12948", "verified_by_llmstats": false, "analysis_method": "Cons@64", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.970600+00:00", "updated_at": "2025-07-19T19:56:11.972162+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 309, "benchmark_id": "gpqa", "model_id": "deepseek-r1-zero", "score": 0.733, "normalized_score": 0.733, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2501.12948", "verified_by_llmstats": false, "analysis_method": "Pass@1 Diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:11.691175+00:00", "updated_at": "2025-07-19T19:56:11.691175+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1128, "benchmark_id": "livecodebench", "model_id": "deepseek-r1-zero", "score": 0.5, "normalized_score": 0.5, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2501.12948", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.357962+00:00", "updated_at": "2025-07-19T19:56:13.357962+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 497, "benchmark_id": "math-500", "model_id": "deepseek-r1-zero", "score": 0.959, "normalized_score": 0.959, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2501.12948", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.038172+00:00", "updated_at": "2025-07-19T19:56:12.038172+00:00", "benchmark_name": "MATH-500" } ] ================================================ FILE: data/organizations/deepseek/models/deepseek-r1-zero/model.json ================================================ { "model_id": "deepseek-r1-zero", "name": "DeepSeek R1 Zero", "organization_id": "deepseek", "fine_tuned_from_model_id": "deepseek-v3", "description": "DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrated remarkable performance on reasoning. With RL, DeepSeek-R1-Zero naturally emerged with numerous powerful and interesting reasoning behaviors. However, DeepSeek-R1-Zero encounters challenges such as endless repetition, poor readability, and language mixing. 
To address these issues and further enhance reasoning performance, we introduce DeepSeek-R1, which incorporates cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1 across math, code, and reasoning tasks.", "release_date": "2025-01-20", "announcement_date": "2025-01-20", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 671000000000, "training_tokens": 14800000000000, "available_in_zeroeval": true, "source_api_ref": "https://api-docs.deepseek.com/news/news250120", "source_playground": "https://chat.deepseek.com", "source_paper": "https://arxiv.org/abs/2501.12948", "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-R1", "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1", "created_at": "2025-07-19T19:49:05.902496+00:00", "updated_at": "2025-07-19T19:49:05.902496+00:00", "model_family_id": null } ================================================ FILE: data/organizations/deepseek/models/deepseek-v2.5/benchmarks.json ================================================ [ { "model_benchmark_id": 1627, "benchmark_id": "aider", "model_id": "deepseek-v2.5", "score": 0.722, "normalized_score": 0.722, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.574890+00:00", "updated_at": "2025-07-19T19:56:14.574890+00:00", "benchmark_name": "Aider" }, { "model_benchmark_id": 1619, "benchmark_id": "alignbench", "model_id": "deepseek-v2.5", "score": 0.804, "normalized_score": 0.804, "is_self_reported": true, "self_reported_source_link": "https://www.deepseek.com/", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.550691+00:00", "updated_at": "2025-07-19T19:56:14.550691+00:00", "benchmark_name": "AlignBench" }, { "model_benchmark_id": 1790, "benchmark_id": "alpacaeval-2.0", "model_id": "deepseek-v2.5", "score": 0.505, "normalized_score": 0.505, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.041535+00:00", "updated_at": "2025-07-19T19:56:15.041535+00:00", "benchmark_name": "AlpacaEval 2.0" }, { "model_benchmark_id": 1456, "benchmark_id": "arena-hard", "model_id": "deepseek-v2.5", "score": 0.762, "normalized_score": 0.762, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.104170+00:00", "updated_at": "2025-07-19T19:56:14.104170+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 974, "benchmark_id": "bbh", "model_id": "deepseek-v2.5", "score": 0.843, "normalized_score": 0.843, "is_self_reported": true, "self_reported_source_link": "https://www.deepseek.com/", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.046694+00:00", "updated_at": "2025-07-19T19:56:13.046694+00:00", "benchmark_name": "BBH" }, { "model_benchmark_id": 1797, "benchmark_id": "ds-arena-code", "model_id": "deepseek-v2.5", "score": 0.631, "normalized_score": 0.631, "is_self_reported": true, 
"self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.060324+00:00", "updated_at": "2025-07-19T19:56:15.060324+00:00", "benchmark_name": "DS-Arena-Code" }, { "model_benchmark_id": 1796, "benchmark_id": "ds-fim-eval", "model_id": "deepseek-v2.5", "score": 0.783, "normalized_score": 0.783, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.056487+00:00", "updated_at": "2025-07-19T19:56:15.056487+00:00", "benchmark_name": "DS-FIM-Eval" }, { "model_benchmark_id": 1000, "benchmark_id": "gsm8k", "model_id": "deepseek-v2.5", "score": 0.951, "normalized_score": 0.951, "is_self_reported": true, "self_reported_source_link": "https://www.deepseek.com/", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.091340+00:00", "updated_at": "2025-07-19T19:56:13.091340+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 792, "benchmark_id": "humaneval", "model_id": "deepseek-v2.5", "score": 0.89, "normalized_score": 0.89, "is_self_reported": true, "self_reported_source_link": "https://www.deepseek.com/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.656959+00:00", "updated_at": "2025-07-19T19:56:12.656959+00:00", "benchmark_name": "HumanEval" }, { 
"model_benchmark_id": 1789, "benchmark_id": "humaneval-mul", "model_id": "deepseek-v2.5", "score": 0.738, "normalized_score": 0.738, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.037209+00:00", "updated_at": "2025-07-19T19:56:15.037209+00:00", "benchmark_name": "HumanEval-Mul" }, { "model_benchmark_id": 1795, "benchmark_id": "livecodebench(01-09)", "model_id": "deepseek-v2.5", "score": 0.418, "normalized_score": 0.418, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.052983+00:00", "updated_at": "2025-07-19T19:56:15.052983+00:00", "benchmark_name": "LiveCodeBench(01-09)" }, { "model_benchmark_id": 411, "benchmark_id": "math", "model_id": "deepseek-v2.5", "score": 0.747, "normalized_score": 0.747, "is_self_reported": true, "self_reported_source_link": "https://www.deepseek.com/", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.874944+00:00", "updated_at": "2025-07-19T19:56:11.874944+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 94, "benchmark_id": "mmlu", "model_id": "deepseek-v2.5", "score": 0.804, "normalized_score": 0.804, "is_self_reported": true, "self_reported_source_link": "https://www.deepseek.com/", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": 
null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.277903+00:00", "updated_at": "2025-07-19T19:56:11.277903+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1608, "benchmark_id": "mt-bench", "model_id": "deepseek-v2.5", "score": 0.902, "normalized_score": 0.902, "is_self_reported": true, "self_reported_source_link": "https://www.deepseek.com/", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.525856+00:00", "updated_at": "2025-07-19T19:56:14.525856+00:00", "benchmark_name": "MT-Bench" }, { "model_benchmark_id": 1345, "benchmark_id": "swe-bench-verified", "model_id": "deepseek-v2.5", "score": 0.168, "normalized_score": 0.168, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.830793+00:00", "updated_at": "2025-07-19T19:56:13.830793+00:00", "benchmark_name": "SWE-Bench Verified" } ] ================================================ FILE: data/organizations/deepseek/models/deepseek-v2.5/model.json ================================================ { "model_id": "deepseek-v2.5", "name": "DeepSeek-V2.5", "organization_id": "deepseek", "fine_tuned_from_model_id": null, "description": "DeepSeek-V2.5 is an upgraded version that combines DeepSeek-V2-Chat and DeepSeek-Coder-V2-Instruct, integrating general and coding abilities. 
It better aligns with human preferences and has been optimized in various aspects, including writing and instruction following.", "release_date": "2024-05-08", "announcement_date": "2024-05-08", "license_id": "deepseek", "multimodal": false, "knowledge_cutoff": null, "param_count": 236000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://www.deepseek.com/", "source_playground": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5", "source_paper": "https://arxiv.org/abs/2405.04434", "source_scorecard_blog_link": null, "source_repo_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5", "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5", "created_at": "2025-07-19T19:49:05.680851+00:00", "updated_at": "2025-07-19T19:49:05.680851+00:00", "model_family_id": null } ================================================ FILE: data/organizations/deepseek/models/deepseek-v3/benchmarks.json ================================================ [ { "model_benchmark_id": 663, "benchmark_id": "aider-polyglot", "model_id": "deepseek-v3", "score": 0.496, "normalized_score": 0.496, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.374175+00:00", "updated_at": "2025-07-19T19:56:12.374175+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 1330, "benchmark_id": "aider-polyglot-edit", "model_id": "deepseek-v3", "score": 0.797, "normalized_score": 0.797, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:13.796886+00:00", "updated_at": "2025-07-19T19:56:13.796886+00:00", "benchmark_name": "Aider-Polyglot Edit" }, { "model_benchmark_id": 463, "benchmark_id": "aime-2024", "model_id": "deepseek-v3", "score": 0.392, "normalized_score": 0.392, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.980196+00:00", "updated_at": "2025-07-19T19:56:11.980196+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 438, "benchmark_id": "c-eval", "model_id": "deepseek-v3", "score": 0.865, "normalized_score": 0.865, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Exact Match", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.928060+00:00", "updated_at": "2025-07-19T19:56:11.928060+00:00", "benchmark_name": "C-Eval" }, { "model_benchmark_id": 600, "benchmark_id": "cluewsc", "model_id": "deepseek-v3", "score": 0.909, "normalized_score": 0.909, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Exact Match", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.237991+00:00", "updated_at": "2025-07-19T19:56:12.237991+00:00", "benchmark_name": "CLUEWSC" }, { "model_benchmark_id": 711, "benchmark_id": "cnmo-2024", "model_id": "deepseek-v3", "score": 0.432, "normalized_score": 0.432, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", 
"verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.493124+00:00", "updated_at": "2025-07-19T19:56:12.493124+00:00", "benchmark_name": "CNMO 2024" }, { "model_benchmark_id": 442, "benchmark_id": "csimpleqa", "model_id": "deepseek-v3", "score": 0.648, "normalized_score": 0.648, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Correct", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.937598+00:00", "updated_at": "2025-07-19T19:56:11.937598+00:00", "benchmark_name": "CSimpleQA" }, { "model_benchmark_id": 951, "benchmark_id": "drop", "model_id": "deepseek-v3", "score": 0.916, "normalized_score": 0.916, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "3-shot F1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.005931+00:00", "updated_at": "2025-07-19T19:56:13.005931+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 1753, "benchmark_id": "frames", "model_id": "deepseek-v3", "score": 0.733, "normalized_score": 0.733, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.958906+00:00", "updated_at": "2025-07-19T19:56:14.958906+00:00", "benchmark_name": "FRAMES" }, { "model_benchmark_id": 312, "benchmark_id": "gpqa", "model_id": 
"deepseek-v3", "score": 0.591, "normalized_score": 0.591, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.695757+00:00", "updated_at": "2025-07-19T19:56:11.695757+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1788, "benchmark_id": "humaneval-mul", "model_id": "deepseek-v3", "score": 0.826, "normalized_score": 0.826, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.035409+00:00", "updated_at": "2025-07-19T19:56:15.035409+00:00", "benchmark_name": "HumanEval-Mul" }, { "model_benchmark_id": 622, "benchmark_id": "ifeval", "model_id": "deepseek-v3", "score": 0.861, "normalized_score": 0.861, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Prompt Strict", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.280659+00:00", "updated_at": "2025-07-19T19:56:12.280659+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1131, "benchmark_id": "livecodebench", "model_id": "deepseek-v3", "score": 0.376, "normalized_score": 0.376, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:13.364940+00:00", "updated_at": "2025-07-19T19:56:13.372242+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 1787, "benchmark_id": "longbench-v2", "model_id": "deepseek-v3", "score": 0.487, "normalized_score": 0.487, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.031520+00:00", "updated_at": "2025-07-19T19:56:15.031520+00:00", "benchmark_name": "LongBench v2" }, { "model_benchmark_id": 500, "benchmark_id": "math-500", "model_id": "deepseek-v3", "score": 0.902, "normalized_score": 0.902, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Exact Match", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.043125+00:00", "updated_at": "2025-07-19T19:56:12.043125+00:00", "benchmark_name": "MATH-500" }, { "model_benchmark_id": 93, "benchmark_id": "mmlu", "model_id": "deepseek-v3", "score": 0.885, "normalized_score": 0.885, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Exact Match", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.275957+00:00", "updated_at": "2025-07-19T19:56:11.275957+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 202, "benchmark_id": "mmlu-pro", "model_id": "deepseek-v3", "score": 0.759, "normalized_score": 0.759, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", 
"verified_by_llmstats": false, "analysis_method": "Exact Match", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.485394+00:00", "updated_at": "2025-07-19T19:56:11.485394+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 737, "benchmark_id": "mmlu-redux", "model_id": "deepseek-v3", "score": 0.891, "normalized_score": 0.891, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Exact Match", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.548864+00:00", "updated_at": "2025-07-19T19:56:12.548864+00:00", "benchmark_name": "MMLU-Redux" }, { "model_benchmark_id": 235, "benchmark_id": "simpleqa", "model_id": "deepseek-v3", "score": 0.249, "normalized_score": 0.249, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Correct", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.549943+00:00", "updated_at": "2025-07-19T19:56:11.549943+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 1344, "benchmark_id": "swe-bench-verified", "model_id": "deepseek-v3", "score": 0.42, "normalized_score": 0.42, "is_self_reported": true, "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3", "verified_by_llmstats": false, "analysis_method": "Resolved", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.828562+00:00", "updated_at": "2025-07-19T19:56:13.828562+00:00", "benchmark_name": "SWE-Bench Verified" } ] 
================================================ FILE: data/organizations/deepseek/models/deepseek-v3/model.json ================================================ { "model_id": "deepseek-v3", "name": "DeepSeek-V3", "organization_id": "deepseek", "fine_tuned_from_model_id": null, "description": "A powerful Mixture-of-Experts (MoE) language model with 671B total parameters (37B activated per token). Features Multi-head Latent Attention (MLA), auxiliary-loss-free load balancing, and multi-token prediction training. Pre-trained on 14.8T tokens with strong performance in reasoning, math, and code tasks.", "release_date": "2024-12-25", "announcement_date": "2024-12-25", "license_id": "mit_+_model_license_(commercial_use_allowed)", "multimodal": false, "knowledge_cutoff": null, "param_count": 671000000000, "training_tokens": 14800000000000, "available_in_zeroeval": true, "source_api_ref": "https://platform.deepseek.com", "source_playground": "https://chat.deepseek.com", "source_paper": "https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf", "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-V3", "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3", "created_at": "2025-07-19T19:49:05.677307+00:00", "updated_at": "2025-07-19T19:49:05.677307+00:00", "model_family_id": null } ================================================ FILE: data/organizations/deepseek/models/deepseek-v3-0324/benchmarks.json ================================================ [ { "model_benchmark_id": 473, "benchmark_id": "aime-2024", "model_id": "deepseek-v3-0324", "score": 0.594, "normalized_score": 0.594, "is_self_reported": true, "self_reported_source_link": "https://api-docs.deepseek.com/news/news250325", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:11.999879+00:00", "updated_at": "2025-07-19T19:56:11.999879+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 318, "benchmark_id": "gpqa", "model_id": "deepseek-v3-0324", "score": 0.684, "normalized_score": 0.684, "is_self_reported": true, "self_reported_source_link": "https://api-docs.deepseek.com/news/news250325", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.705537+00:00", "updated_at": "2025-07-19T19:56:11.705537+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1138, "benchmark_id": "livecodebench", "model_id": "deepseek-v3-0324", "score": 0.492, "normalized_score": 0.492, "is_self_reported": true, "self_reported_source_link": "https://api-docs.deepseek.com/news/news250325", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.392232+00:00", "updated_at": "2025-07-19T19:56:13.392232+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 506, "benchmark_id": "math-500", "model_id": "deepseek-v3-0324", "score": 0.94, "normalized_score": 0.94, "is_self_reported": true, "self_reported_source_link": "https://api-docs.deepseek.com/news/news250325", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.053333+00:00", "updated_at": "2025-07-19T19:56:12.053333+00:00", "benchmark_name": "MATH-500" }, { "model_benchmark_id": 204, "benchmark_id": "mmlu-pro", "model_id": "deepseek-v3-0324", "score": 0.812, "normalized_score": 0.812, "is_self_reported": true, "self_reported_source_link": "https://api-docs.deepseek.com/news/news250325", 
"verified_by_llmstats": false, "analysis_method": "Exact Match", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.488686+00:00", "updated_at": "2025-07-19T19:56:11.488686+00:00", "benchmark_name": "MMLU-Pro" } ] ================================================ FILE: data/organizations/deepseek/models/deepseek-v3-0324/model.json ================================================ { "model_id": "deepseek-v3-0324", "name": "DeepSeek-V3 0324", "organization_id": "deepseek", "fine_tuned_from_model_id": null, "description": "A powerful Mixture-of-Experts (MoE) language model with 671B total parameters (37B activated per token). Features Multi-head Latent Attention (MLA), auxiliary-loss-free load balancing, and multi-token prediction training. Pre-trained on 14.8T tokens with strong performance in reasoning, math, and code tasks.", "release_date": "2025-03-25", "announcement_date": "2025-03-25", "license_id": "mit_+_model_license_(commercial_use_allowed)", "multimodal": false, "knowledge_cutoff": null, "param_count": 671000000000, "training_tokens": 14800000000000, "available_in_zeroeval": true, "source_api_ref": "https://platform.deepseek.com", "source_playground": "https://chat.deepseek.com", "source_paper": "https://arxiv.org/abs/2412.19437", "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-V3", "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324", "created_at": "2025-07-19T19:49:05.693499+00:00", "updated_at": "2025-07-19T19:49:05.693499+00:00", "model_family_id": null } ================================================ FILE: data/organizations/deepseek/models/deepseek-v3.1/benchmarks.json ================================================ [ { "model_benchmark_id": 9501, "benchmark_id": "mmlu-redux", "model_id": "deepseek-v3.1", "score": 0.918, "normalized_score": 0.918, "is_self_reported": true, 
"self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Non-Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Non-thinking: 91.8%, Thinking: 93.7%", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MMLU-Redux" }, { "model_benchmark_id": 9502, "benchmark_id": "mmlu-pro", "model_id": "deepseek-v3.1", "score": 0.837, "normalized_score": 0.837, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Non-Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Non-thinking: 83.7%, Thinking: 84.8%", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 9503, "benchmark_id": "gpqa", "model_id": "deepseek-v3.1", "score": 0.749, "normalized_score": 0.749, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Pass@1, Non-Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Non-thinking: 74.9%, Thinking: 80.1%", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 9504, "benchmark_id": "humanity's-last-exam", "model_id": "deepseek-v3.1", "score": 0.159, "normalized_score": 0.159, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Pass@1, Thinking mode, text-only subset", "verification_provider_id": 
null, "verification_hardware": null, "verification_date": null, "verification_notes": "Thinking mode only, text-only subset", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 9505, "benchmark_id": "browsecomp", "model_id": "deepseek-v3.1", "score": 0.3, "normalized_score": 0.3, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Thinking mode with search agent", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Search agent with commercial API + webpage filter + 128K context", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "BrowseComp" }, { "model_benchmark_id": 9506, "benchmark_id": "browsecomp-zh", "model_id": "deepseek-v3.1", "score": 0.492, "normalized_score": 0.492, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Thinking mode with search agent", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Search agent with commercial API + webpage filter + 128K context", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "BrowseComp-zh" }, { "model_benchmark_id": 9507, "benchmark_id": "simpleqa", "model_id": "deepseek-v3.1", "score": 0.934, "normalized_score": 0.934, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Thinking mode with search agent", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Search 
agent evaluation", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 9508, "benchmark_id": "livecodebench", "model_id": "deepseek-v3.1", "score": 0.564, "normalized_score": 0.564, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Pass@1, 2408-2505, Non-Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Non-thinking: 56.4%, Thinking: 74.8%", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 9509, "benchmark_id": "codeforces", "model_id": "deepseek-v3.1", "score": 0.697, "normalized_score": 0.697, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Div1 Rating, Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Codeforces Div1 rating in thinking mode", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Codeforces" }, { "model_benchmark_id": 9510, "benchmark_id": "aider-polyglot", "model_id": "deepseek-v3.1", "score": 0.684, "normalized_score": 0.684, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Non-Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Non-thinking: 68.4%, Thinking: 76.3%", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Aider-Polyglot" }, { 
"model_benchmark_id": 9511, "benchmark_id": "swe-bench-verified", "model_id": "deepseek-v3.1", "score": 0.66, "normalized_score": 0.66, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Agent mode, Non-Thinking", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Evaluated with internal code agent framework", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 9512, "benchmark_id": "swe-bench-multilingual", "model_id": "deepseek-v3.1", "score": 0.545, "normalized_score": 0.545, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Agent mode, Non-Thinking", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Evaluated with internal code agent framework", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "SWE-Bench Multilingual" }, { "model_benchmark_id": 9513, "benchmark_id": "terminal-bench", "model_id": "deepseek-v3.1", "score": 0.313, "normalized_score": 0.313, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Terminus 1 framework, Non-Thinking", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Terminal-Bench" }, { "model_benchmark_id": 9514, "benchmark_id": "aime-2024", "model_id": "deepseek-v3.1", "score": 0.663, "normalized_score": 0.663, 
"is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Pass@1, Non-Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Non-thinking: 66.3%, Thinking: 93.1%", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 9515, "benchmark_id": "aime-2025", "model_id": "deepseek-v3.1", "score": 0.498, "normalized_score": 0.498, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Pass@1, Non-Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Non-thinking: 49.8%, Thinking: 88.4%", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 9516, "benchmark_id": "hmmt-2025", "model_id": "deepseek-v3.1", "score": 0.335, "normalized_score": 0.335, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "verified_by_llmstats": false, "analysis_method": "Pass@1, Non-Thinking mode", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Non-thinking: 33.5%, Thinking: 84.2%", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "HMMT 2025" } ] ================================================ FILE: data/organizations/deepseek/models/deepseek-v3.1/model.json ================================================ { "model_id": "deepseek-v3.1", "name": "DeepSeek-V3.1", "organization_id": "deepseek", "model_family_id": null, "fine_tuned_from_model_id": 
"deepseek-v3", "description": "DeepSeek-V3.1 is a hybrid model supporting both thinking and non-thinking modes through different chat templates. Built on DeepSeek-V3.1-Base with a two-phase long context extension (32K phase: 630B tokens, 128K phase: 209B tokens), it features 671B total parameters with 37B activated. Key improvements include smarter tool calling through post-training optimization, higher thinking efficiency achieving comparable quality to DeepSeek-R1-0528 while responding more quickly, and UE8M0 FP8 scale data format for model weights and activations. The model excels in both reasoning tasks (thinking mode) and practical applications (non-thinking mode), with particularly strong performance in code agent tasks, math competitions, and search-based problem solving.", "release_date": "2025-01-10", "announcement_date": "2025-01-10", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 671000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://api.deepseek.com/docs", "source_playground": "https://chat.deepseek.com/", "source_paper": "https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek-V3.pdf", "source_scorecard_blog_link": "https://www.deepseek.com/news/deepseek-v3-1", "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-V3", "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00" } ================================================ FILE: data/organizations/deepseek/models/deepseek-v3.2-exp/benchmarks.json ================================================ [ { "model_benchmark_id": 9521, "benchmark_id": "mmlu-pro", "model_id": "deepseek-v3.2-exp", "score": 0.85, "normalized_score": 0.85, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp", "verified_by_llmstats": false, "analysis_method": "Reasoning 
Mode (w/o Tool Use)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 9522, "benchmark_id": "gpqa", "model_id": "deepseek-v3.2-exp", "score": 0.799, "normalized_score": 0.799, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp", "verified_by_llmstats": false, "analysis_method": "Reasoning Mode (w/o Tool Use)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 9523, "benchmark_id": "humanity's-last-exam", "model_id": "deepseek-v3.2-exp", "score": 0.198, "normalized_score": 0.198, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp", "verified_by_llmstats": false, "analysis_method": "Reasoning Mode (w/o Tool Use)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Text-only subset where applicable", "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 9524, "benchmark_id": "livecodebench", "model_id": "deepseek-v3.2-exp", "score": 0.741, "normalized_score": 0.741, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp", "verified_by_llmstats": false, "analysis_method": "Pass@1 (Reasoning Mode w/o Tool Use)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T00:00:00.000000+00:00", 
"updated_at": "2025-09-29T00:00:00.000000+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 9525, "benchmark_id": "aime-2025", "model_id": "deepseek-v3.2-exp", "score": 0.893, "normalized_score": 0.893, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp", "verified_by_llmstats": false, "analysis_method": "Pass@1 (Reasoning Mode w/o Tool Use)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 9526, "benchmark_id": "hmmt-2025", "model_id": "deepseek-v3.2-exp", "score": 0.836, "normalized_score": 0.836, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp", "verified_by_llmstats": false, "analysis_method": "Pass@1 (Reasoning Mode w/o Tool Use)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "benchmark_name": "HMMT 2025" }, { "model_benchmark_id": 9527, "benchmark_id": "codeforces", "model_id": "deepseek-v3.2-exp", "score": 0.707, "normalized_score": 0.707, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp", "verified_by_llmstats": false, "analysis_method": "Div1 rating (Reasoning Mode)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Raw rating ≈ 2121; normalized by 3000 max", "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "benchmark_name": "Codeforces" }, { "model_benchmark_id": 9528, "benchmark_id": "aider-polyglot", "model_id": "deepseek-v3.2-exp", "score": 0.745, 
"normalized_score": 0.745, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp", "verified_by_llmstats": false, "analysis_method": "Reasoning Mode (w/o Tool Use)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 9529, "benchmark_id": "browsecomp", "model_id": "deepseek-v3.2-exp", "score": 0.401, "normalized_score": 0.401, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp", "verified_by_llmstats": false, "analysis_method": "Agentic Tool Use", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "benchmark_name": "BrowseComp" }, { "model_benchmark_id": 9530, "benchmark_id": "browsecomp-zh", "model_id": "deepseek-v3.2-exp", "score": 0.479, "normalized_score": 0.479, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp", "verified_by_llmstats": false, "analysis_method": "Agentic Tool Use", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "benchmark_name": "BrowseComp-zh" }, { "model_benchmark_id": 9531, "benchmark_id": "simpleqa", "model_id": "deepseek-v3.2-exp", "score": 0.971, "normalized_score": 0.971, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp", "verified_by_llmstats": false, "analysis_method": "Agentic Tool Use", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 9532, "benchmark_id": "swe-bench-verified", "model_id": "deepseek-v3.2-exp", "score": 0.678, "normalized_score": 0.678, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp", "verified_by_llmstats": false, "analysis_method": "Agentic Tool Use", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 9533, "benchmark_id": "swe-bench-multilingual", "model_id": "deepseek-v3.2-exp", "score": 0.579, "normalized_score": 0.579, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp", "verified_by_llmstats": false, "analysis_method": "Agentic Tool Use", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "benchmark_name": "SWE-Bench Multilingual" }, { "model_benchmark_id": 9534, "benchmark_id": "terminal-bench", "model_id": "deepseek-v3.2-exp", "score": 0.377, "normalized_score": 0.377, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp", "verified_by_llmstats": false, "analysis_method": "Agentic Tool Use", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "benchmark_name": "Terminal-Bench" } ] 
================================================ FILE: data/organizations/deepseek/models/deepseek-v3.2-exp/model.json ================================================ { "model_id": "deepseek-v3.2-exp", "name": "DeepSeek-V3.2-Exp", "organization_id": "deepseek", "model_family_id": null, "fine_tuned_from_model_id": null, "description": "DeepSeek-V3.2-Exp is an experimental iteration introducing DeepSeek Sparse Attention (DSA) to improve long-context training and inference efficiency while keeping output quality on par with V3.1. It explores fine-grained sparse attention for extended sequence processing.", "release_date": "2025-09-29", "announcement_date": "2025-09-29", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 685000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://api.deepseek.com/docs", "source_playground": "https://chat.deepseek.com/", "source_paper": "https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/DeepSeek_V3_2.pdf", "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-V3.2-Exp", "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp", "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00" } ================================================ FILE: data/organizations/deepseek/models/deepseek-vl2/benchmarks.json ================================================ [ { "model_benchmark_id": 1256, "benchmark_id": "ai2d", "model_id": "deepseek-vl2", "score": 0.814, "normalized_score": 0.814, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.636398+00:00", "updated_at": "2025-07-19T19:56:13.636398+00:00", "benchmark_name": 
"AI2D" }, { "model_benchmark_id": 868, "benchmark_id": "chartqa", "model_id": "deepseek-vl2", "score": 0.86, "normalized_score": 0.86, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.812840+00:00", "updated_at": "2025-07-19T19:56:12.812840+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 890, "benchmark_id": "docvqa", "model_id": "deepseek-vl2", "score": 0.933, "normalized_score": 0.933, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.852402+00:00", "updated_at": "2025-07-19T19:56:12.852402+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 1244, "benchmark_id": "infovqa", "model_id": "deepseek-vl2", "score": 0.781, "normalized_score": 0.781, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.614094+00:00", "updated_at": "2025-07-19T19:56:13.614094+00:00", "benchmark_name": "InfoVQA" }, { "model_benchmark_id": 528, "benchmark_id": "mathvista", "model_id": "deepseek-vl2", "score": 0.628, "normalized_score": 0.628, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "testmini", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, 
"created_at": "2025-07-19T19:56:12.096047+00:00", "updated_at": "2025-07-19T19:56:12.096047+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1513, "benchmark_id": "mmbench", "model_id": "deepseek-vl2", "score": 0.796, "normalized_score": 0.796, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "en test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.245378+00:00", "updated_at": "2025-07-19T19:56:14.247008+00:00", "benchmark_name": "MMBench" }, { "model_benchmark_id": 1727, "benchmark_id": "mmbench-v1.1", "model_id": "deepseek-vl2", "score": 0.792, "normalized_score": 0.792, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "cn test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.873346+00:00", "updated_at": "2025-07-19T19:56:14.873346+00:00", "benchmark_name": "MMBench-V1.1" }, { "model_benchmark_id": 1784, "benchmark_id": "mme", "model_id": "deepseek-vl2", "score": 0.2253, "normalized_score": 0.2253, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.025040+00:00", "updated_at": "2025-07-19T19:56:15.025040+00:00", "benchmark_name": "MME" }, { "model_benchmark_id": 574, "benchmark_id": "mmmu", "model_id": "deepseek-vl2", "score": 0.511, "normalized_score": 0.511, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, 
"analysis_method": "val", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.181251+00:00", "updated_at": "2025-07-19T19:56:12.181251+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1663, "benchmark_id": "mmstar", "model_id": "deepseek-vl2", "score": 0.613, "normalized_score": 0.613, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.669907+00:00", "updated_at": "2025-07-19T19:56:14.669907+00:00", "benchmark_name": "MMStar" }, { "model_benchmark_id": 1667, "benchmark_id": "mmt-bench", "model_id": "deepseek-vl2", "score": 0.636, "normalized_score": 0.636, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.678247+00:00", "updated_at": "2025-07-19T19:56:14.678247+00:00", "benchmark_name": "MMT-Bench" }, { "model_benchmark_id": 1542, "benchmark_id": "ocrbench", "model_id": "deepseek-vl2", "score": 0.811, "normalized_score": 0.811, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.320020+00:00", "updated_at": "2025-07-19T19:56:14.320020+00:00", "benchmark_name": "OCRBench" }, { "model_benchmark_id": 1635, "benchmark_id": "realworldqa", "model_id": "deepseek-vl2", 
"score": 0.684, "normalized_score": 0.684, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.601290+00:00", "updated_at": "2025-07-19T19:56:14.601290+00:00", "benchmark_name": "RealWorldQA" }, { "model_benchmark_id": 912, "benchmark_id": "textvqa", "model_id": "deepseek-vl2", "score": 0.842, "normalized_score": 0.842, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "val", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.902069+00:00", "updated_at": "2025-07-19T19:56:12.902069+00:00", "benchmark_name": "TextVQA" } ] ================================================ FILE: data/organizations/deepseek/models/deepseek-vl2/model.json ================================================ { "model_id": "deepseek-vl2", "name": "DeepSeek VL2", "organization_id": "deepseek", "fine_tuned_from_model_id": null, "description": "An advanced series of large Mixture-of-Experts (MoE) Vision-Language Models that significantly improves upon its predecessor, DeepSeek-VL. 
DeepSeek-VL2 demonstrates superior capabilities across various tasks, including but not limited to visual question answering, optical character recognition, document/table/chart understanding, and visual grounding.", "release_date": "2024-12-13", "announcement_date": "2024-12-13", "license_id": "deepseek", "multimodal": true, "knowledge_cutoff": null, "param_count": 27000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://www.deepseek.com/", "source_playground": "https://huggingface.co/deepseek-ai/deepseek-vl2", "source_paper": "https://arxiv.org/pdf/2412.10302", "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-VL2?tab=readme-ov-file", "source_weights_link": "https://huggingface.co/deepseek-ai/deepseek-vl2", "created_at": "2025-07-19T19:49:05.658016+00:00", "updated_at": "2025-07-19T19:49:05.658016+00:00", "model_family_id": null } ================================================ FILE: data/organizations/deepseek/models/deepseek-vl2-small/benchmarks.json ================================================ [ { "model_benchmark_id": 1258, "benchmark_id": "ai2d", "model_id": "deepseek-vl2-small", "score": 0.8, "normalized_score": 0.8, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.640145+00:00", "updated_at": "2025-07-19T19:56:13.640145+00:00", "benchmark_name": "AI2D" }, { "model_benchmark_id": 870, "benchmark_id": "chartqa", "model_id": "deepseek-vl2-small", "score": 0.845, "normalized_score": 0.845, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "test", "verification_provider_id": null, "verification_hardware": null, "verification_date": 
null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.816278+00:00", "updated_at": "2025-07-19T19:56:12.816278+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 892, "benchmark_id": "docvqa", "model_id": "deepseek-vl2-small", "score": 0.923, "normalized_score": 0.923, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.857733+00:00", "updated_at": "2025-07-19T19:56:12.857733+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 1246, "benchmark_id": "infovqa", "model_id": "deepseek-vl2-small", "score": 0.758, "normalized_score": 0.758, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.617970+00:00", "updated_at": "2025-07-19T19:56:13.617970+00:00", "benchmark_name": "InfoVQA" }, { "model_benchmark_id": 530, "benchmark_id": "mathvista", "model_id": "deepseek-vl2-small", "score": 0.607, "normalized_score": 0.607, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "testmini", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.100314+00:00", "updated_at": "2025-07-19T19:56:12.100314+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1517, "benchmark_id": "mmbench", "model_id": "deepseek-vl2-small", "score": 0.803, "normalized_score": 0.803, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", 
"verified_by_llmstats": false, "analysis_method": "en test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.252930+00:00", "updated_at": "2025-07-19T19:56:14.254459+00:00", "benchmark_name": "MMBench" }, { "model_benchmark_id": 1729, "benchmark_id": "mmbench-v1.1", "model_id": "deepseek-vl2-small", "score": 0.793, "normalized_score": 0.793, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "cn test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.876824+00:00", "updated_at": "2025-07-19T19:56:14.876824+00:00", "benchmark_name": "MMBench-V1.1" }, { "model_benchmark_id": 1786, "benchmark_id": "mme", "model_id": "deepseek-vl2-small", "score": 0.2123, "normalized_score": 0.2123, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.028315+00:00", "updated_at": "2025-07-19T19:56:15.028315+00:00", "benchmark_name": "MME" }, { "model_benchmark_id": 576, "benchmark_id": "mmmu", "model_id": "deepseek-vl2-small", "score": 0.48, "normalized_score": 0.48, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "val", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.184966+00:00", "updated_at": "2025-07-19T19:56:12.184966+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1665, "benchmark_id": "mmstar", "model_id": 
"deepseek-vl2-small", "score": 0.57, "normalized_score": 0.57, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.672978+00:00", "updated_at": "2025-07-19T19:56:14.672978+00:00", "benchmark_name": "MMStar" }, { "model_benchmark_id": 1669, "benchmark_id": "mmt-bench", "model_id": "deepseek-vl2-small", "score": 0.629, "normalized_score": 0.629, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.683443+00:00", "updated_at": "2025-07-19T19:56:14.683443+00:00", "benchmark_name": "MMT-Bench" }, { "model_benchmark_id": 1544, "benchmark_id": "ocrbench", "model_id": "deepseek-vl2-small", "score": 0.834, "normalized_score": 0.834, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.324965+00:00", "updated_at": "2025-07-19T19:56:14.324965+00:00", "benchmark_name": "OCRBench" }, { "model_benchmark_id": 1637, "benchmark_id": "realworldqa", "model_id": "deepseek-vl2-small", "score": 0.654, "normalized_score": 0.654, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": 
null, "created_at": "2025-07-19T19:56:14.604508+00:00", "updated_at": "2025-07-19T19:56:14.604508+00:00", "benchmark_name": "RealWorldQA" }, { "model_benchmark_id": 914, "benchmark_id": "textvqa", "model_id": "deepseek-vl2-small", "score": 0.834, "normalized_score": 0.834, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "val", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.906237+00:00", "updated_at": "2025-07-19T19:56:12.906237+00:00", "benchmark_name": "TextVQA" } ] ================================================ FILE: data/organizations/deepseek/models/deepseek-vl2-small/model.json ================================================ { "model_id": "deepseek-vl2-small", "name": "DeepSeek VL2 Small", "organization_id": "deepseek", "fine_tuned_from_model_id": null, "description": "An advanced series of large Mixture-of-Experts (MoE) Vision-Language Models that significantly improves upon its predecessor, DeepSeek-VL. 
DeepSeek-VL2 demonstrates superior capabilities across various tasks, including but not limited to visual question answering, optical character recognition, document/table/chart understanding, and visual grounding.", "release_date": "2024-12-13", "announcement_date": "2024-12-13", "license_id": "deepseek", "multimodal": true, "knowledge_cutoff": null, "param_count": 16000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://www.deepseek.com/", "source_playground": "https://huggingface.co/deepseek-ai/deepseek-vl2-small", "source_paper": "https://arxiv.org/pdf/2412.10302", "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-VL2", "source_weights_link": "https://huggingface.co/deepseek-ai/deepseek-vl2-small", "created_at": "2025-07-19T19:49:05.666424+00:00", "updated_at": "2025-07-19T19:49:05.666424+00:00", "model_family_id": null } ================================================ FILE: data/organizations/deepseek/models/deepseek-vl2-tiny/benchmarks.json ================================================ [ { "model_benchmark_id": 1257, "benchmark_id": "ai2d", "model_id": "deepseek-vl2-tiny", "score": 0.716, "normalized_score": 0.716, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.638556+00:00", "updated_at": "2025-07-19T19:56:13.638556+00:00", "benchmark_name": "AI2D" }, { "model_benchmark_id": 869, "benchmark_id": "chartqa", "model_id": "deepseek-vl2-tiny", "score": 0.81, "normalized_score": 0.81, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:12.814592+00:00", "updated_at": "2025-07-19T19:56:12.814592+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 891, "benchmark_id": "docvqa", "model_id": "deepseek-vl2-tiny", "score": 0.889, "normalized_score": 0.889, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.854588+00:00", "updated_at": "2025-07-19T19:56:12.854588+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 1245, "benchmark_id": "infovqa", "model_id": "deepseek-vl2-tiny", "score": 0.661, "normalized_score": 0.661, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.616113+00:00", "updated_at": "2025-07-19T19:56:13.616113+00:00", "benchmark_name": "InfoVQA" }, { "model_benchmark_id": 529, "benchmark_id": "mathvista", "model_id": "deepseek-vl2-tiny", "score": 0.536, "normalized_score": 0.536, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "testmini", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.098477+00:00", "updated_at": "2025-07-19T19:56:12.098477+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1515, "benchmark_id": "mmbench", "model_id": "deepseek-vl2-tiny", "score": 0.692, "normalized_score": 0.692, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", 
"verified_by_llmstats": false, "analysis_method": "en test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.249349+00:00", "updated_at": "2025-07-19T19:56:14.251060+00:00", "benchmark_name": "MMBench" }, { "model_benchmark_id": 1728, "benchmark_id": "mmbench-v1.1", "model_id": "deepseek-vl2-tiny", "score": 0.683, "normalized_score": 0.683, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "cn test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.875207+00:00", "updated_at": "2025-07-19T19:56:14.875207+00:00", "benchmark_name": "MMBench-V1.1" }, { "model_benchmark_id": 1785, "benchmark_id": "mme", "model_id": "deepseek-vl2-tiny", "score": 0.1915, "normalized_score": 0.1915, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.026734+00:00", "updated_at": "2025-07-19T19:56:15.026734+00:00", "benchmark_name": "MME" }, { "model_benchmark_id": 575, "benchmark_id": "mmmu", "model_id": "deepseek-vl2-tiny", "score": 0.407, "normalized_score": 0.407, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "val", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.183016+00:00", "updated_at": "2025-07-19T19:56:12.183016+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1664, "benchmark_id": "mmstar", "model_id": 
"deepseek-vl2-tiny", "score": 0.459, "normalized_score": 0.459, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.671412+00:00", "updated_at": "2025-07-19T19:56:14.671412+00:00", "benchmark_name": "MMStar" }, { "model_benchmark_id": 1668, "benchmark_id": "mmt-bench", "model_id": "deepseek-vl2-tiny", "score": 0.532, "normalized_score": 0.532, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.681683+00:00", "updated_at": "2025-07-19T19:56:14.681683+00:00", "benchmark_name": "MMT-Bench" }, { "model_benchmark_id": 1543, "benchmark_id": "ocrbench", "model_id": "deepseek-vl2-tiny", "score": 0.809, "normalized_score": 0.809, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.321888+00:00", "updated_at": "2025-07-19T19:56:14.321888+00:00", "benchmark_name": "OCRBench" }, { "model_benchmark_id": 1636, "benchmark_id": "realworldqa", "model_id": "deepseek-vl2-tiny", "score": 0.642, "normalized_score": 0.642, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": 
null, "created_at": "2025-07-19T19:56:14.602948+00:00", "updated_at": "2025-07-19T19:56:14.602948+00:00", "benchmark_name": "RealWorldQA" }, { "model_benchmark_id": 913, "benchmark_id": "textvqa", "model_id": "deepseek-vl2-tiny", "score": 0.807, "normalized_score": 0.807, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.10302", "verified_by_llmstats": false, "analysis_method": "val", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.904238+00:00", "updated_at": "2025-07-19T19:56:12.904238+00:00", "benchmark_name": "TextVQA" } ] ================================================ FILE: data/organizations/deepseek/models/deepseek-vl2-tiny/model.json ================================================ { "model_id": "deepseek-vl2-tiny", "name": "DeepSeek VL2 Tiny", "organization_id": "deepseek", "fine_tuned_from_model_id": null, "description": "An advanced series of large Mixture-of-Experts (MoE) Vision-Language Models that significantly improves upon its predecessor, DeepSeek-VL. 
DeepSeek-VL2 demonstrates superior capabilities across various tasks, including but not limited to visual question answering, optical character recognition, document/table/chart understanding, and visual grounding.", "release_date": "2024-12-13", "announcement_date": "2024-12-13", "license_id": "deepseek", "multimodal": true, "knowledge_cutoff": null, "param_count": 3000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://www.deepseek.com/", "source_playground": "https://huggingface.co/deepseek-ai/deepseek-vl2-tiny", "source_paper": "https://arxiv.org/pdf/2412.10302", "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-VL2", "source_weights_link": "https://huggingface.co/deepseek-ai/deepseek-vl2-tiny", "created_at": "2025-07-19T19:49:05.662552+00:00", "updated_at": "2025-07-19T19:49:05.662552+00:00", "model_family_id": null } ================================================ FILE: data/organizations/deepseek/organization.json ================================================ { "organization_id": "deepseek", "name": "DeepSeek", "website": "https://deepseek.com", "description": "Chinese AI company developing state-of-the-art large language models including the DeepSeek-V3 series with mixture-of-experts architecture and hybrid thinking/non-thinking capabilities", "country": "CN", "created_at": "2025-07-19T19:49:05.655332+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00" } ================================================ FILE: data/organizations/google/models/gemini-1.0-pro/benchmarks.json ================================================ [ { "model_benchmark_id": 1390, "benchmark_id": "big-bench", "model_id": "gemini-1.0-pro", "score": 0.75, "normalized_score": 0.75, "is_self_reported": false, "self_reported_source_link": "https://example.com/benchmark-image", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": 
null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.928761+00:00", "updated_at": "2025-07-19T19:56:13.928761+00:00", "benchmark_name": "BIG-Bench" }, { "model_benchmark_id": 920, "benchmark_id": "egoschema", "model_id": "gemini-1.0-pro", "score": 0.557, "normalized_score": 0.557, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.922622+00:00", "updated_at": "2025-07-19T19:56:12.922622+00:00", "benchmark_name": "EgoSchema" }, { "model_benchmark_id": 1397, "benchmark_id": "fleurs", "model_id": "gemini-1.0-pro", "score": 0.064, "normalized_score": 0.064, "is_self_reported": false, "self_reported_source_link": "https://example.com/benchmark-image", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.946039+00:00", "updated_at": "2025-07-19T19:56:13.946039+00:00", "benchmark_name": "FLEURS" }, { "model_benchmark_id": 264, "benchmark_id": "gpqa", "model_id": "gemini-1.0-pro", "score": 0.279, "normalized_score": 0.279, "is_self_reported": false, "self_reported_source_link": "https://example.com/benchmark-image", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.607534+00:00", "updated_at": "2025-07-19T19:56:11.607534+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 378, "benchmark_id": "math", "model_id": "gemini-1.0-pro", "score": 0.326, "normalized_score": 0.326, "is_self_reported": false, "self_reported_source_link": 
"https://example.com/benchmark-image", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.817378+00:00", "updated_at": "2025-07-19T19:56:11.817378+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 516, "benchmark_id": "mathvista", "model_id": "gemini-1.0-pro", "score": 0.466, "normalized_score": 0.466, "is_self_reported": false, "self_reported_source_link": "https://example.com/benchmark-image", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.073663+00:00", "updated_at": "2025-07-19T19:56:12.073663+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 64, "benchmark_id": "mmlu", "model_id": "gemini-1.0-pro", "score": 0.718, "normalized_score": 0.718, "is_self_reported": true, "self_reported_source_link": "https://example.com/benchmark-image", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.221259+00:00", "updated_at": "2025-07-19T19:56:11.221259+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 553, "benchmark_id": "mmmu", "model_id": "gemini-1.0-pro", "score": 0.479, "normalized_score": 0.479, "is_self_reported": false, "self_reported_source_link": "https://example.com/benchmark-image", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.139083+00:00", "updated_at": "2025-07-19T19:56:12.139083+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1393, "benchmark_id": 
"wmt23", "model_id": "gemini-1.0-pro", "score": 0.717, "normalized_score": 0.717, "is_self_reported": false, "self_reported_source_link": "https://example.com/benchmark-image", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.937549+00:00", "updated_at": "2025-07-19T19:56:13.937549+00:00", "benchmark_name": "WMT23" } ] ================================================ FILE: data/organizations/google/models/gemini-1.0-pro/model.json ================================================ { "model_id": "gemini-1.0-pro", "name": "Gemini 1.0 Pro", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemini 1.0 Pro is a Natural Language Processing (NLP) model designed for tasks such as multi-turn text and code chat, and code generation. It supports text input and output, making it ideal for natural language tasks. The model is optimized for handling complex conversations and generating code snippets. It offers adjustable safety settings and supports function calling, but does not support JSON mode, JSON schema, or system instructions. 
The latest stable version is gemini-1.0-pro-001, and it was last updated in February 2024.", "release_date": "2024-02-15", "announcement_date": "2024-02-15", "license_id": "proprietary", "multimodal": false, "knowledge_cutoff": "2024-02-01", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.0-pro", "source_playground": "https://gemini.google/advanced/", "source_paper": "https://arxiv.org/pdf/2312.11805", "source_scorecard_blog_link": "https://blog.google/technology/ai/google-gemini-ai/#scalable-efficient", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.461784+00:00", "updated_at": "2025-07-19T19:49:05.461784+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemini-1.5-flash/benchmarks.json ================================================ [ { "model_benchmark_id": 1417, "benchmark_id": "amc-2022-23", "model_id": "gemini-1.5-flash", "score": 0.348, "normalized_score": 0.348, "is_self_reported": true, "self_reported_source_link": "https://www.maa.org/math-competitions/amc-1012", "verified_by_llmstats": false, "analysis_method": "Accuracy (4-shot)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.997413+00:00", "updated_at": "2025-07-19T19:56:13.997413+00:00", "benchmark_name": "AMC_2022_23" }, { "model_benchmark_id": 1072, "benchmark_id": "big-bench-hard", "model_id": "gemini-1.5-flash", "score": 0.855, "normalized_score": 0.855, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2206.04615", "verified_by_llmstats": false, "analysis_method": "Accuracy (3-shot)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:13.235605+00:00", "updated_at": "2025-07-19T19:56:13.235605+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 1399, "benchmark_id": "fleurs", "model_id": "gemini-1.5-flash", "score": 0.096, "normalized_score": 0.096, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Word Error Rate", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.949679+00:00", "updated_at": "2025-07-19T19:56:13.949679+00:00", "benchmark_name": "FLEURS" }, { "model_benchmark_id": 1415, "benchmark_id": "functionalmath", "model_id": "gemini-1.5-flash", "score": 0.536, "normalized_score": 0.536, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2201.04723", "verified_by_llmstats": false, "analysis_method": "Accuracy (0-shot)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.991969+00:00", "updated_at": "2025-07-19T19:56:13.991969+00:00", "benchmark_name": "FunctionalMATH" }, { "model_benchmark_id": 272, "benchmark_id": "gpqa", "model_id": "gemini-1.5-flash", "score": 0.51, "normalized_score": 0.51, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.622361+00:00", "updated_at": "2025-07-19T19:56:11.622361+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 981, "benchmark_id": "gsm8k", "model_id": "gemini-1.5-flash", "score": 0.862, "normalized_score": 0.862, "is_self_reported": true, "self_reported_source_link": 
"https://arxiv.org/abs/2110.14168", "verified_by_llmstats": false, "analysis_method": "Accuracy (11-shot)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.060014+00:00", "updated_at": "2025-07-19T19:56:13.060014+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 40, "benchmark_id": "hellaswag", "model_id": "gemini-1.5-flash", "score": 0.865, "normalized_score": 0.865, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/1905.07830", "verified_by_llmstats": false, "analysis_method": "Accuracy (10-shot)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.168455+00:00", "updated_at": "2025-07-19T19:56:11.168455+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 1158, "benchmark_id": "hiddenmath", "model_id": "gemini-1.5-flash", "score": 0.472, "normalized_score": 0.472, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.436585+00:00", "updated_at": "2025-07-19T19:56:13.436585+00:00", "benchmark_name": "HiddenMath" }, { "model_benchmark_id": 768, "benchmark_id": "humaneval", "model_id": "gemini-1.5-flash", "score": 0.743, "normalized_score": 0.743, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Pass Rate", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.617215+00:00", "updated_at": "2025-07-19T19:56:12.617215+00:00", 
"benchmark_name": "HumanEval" }, { "model_benchmark_id": 383, "benchmark_id": "math", "model_id": "gemini-1.5-flash", "score": 0.779, "normalized_score": 0.779, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.826586+00:00", "updated_at": "2025-07-19T19:56:11.826586+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 518, "benchmark_id": "mathvista", "model_id": "gemini-1.5-flash", "score": 0.658, "normalized_score": 0.658, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.077492+00:00", "updated_at": "2025-07-19T19:56:12.077492+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1276, "benchmark_id": "mgsm", "model_id": "gemini-1.5-flash", "score": 0.826, "normalized_score": 0.826, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2305.08916", "verified_by_llmstats": false, "analysis_method": "Accuracy (8-shot)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.676395+00:00", "updated_at": "2025-07-19T19:56:13.676395+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 69, "benchmark_id": "mmlu", "model_id": "gemini-1.5-flash", "score": 0.789, "normalized_score": 0.789, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2403.05530", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.229674+00:00", "updated_at": "2025-07-19T19:56:11.229674+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 168, "benchmark_id": "mmlu-pro", "model_id": "gemini-1.5-flash", "score": 0.673, "normalized_score": 0.673, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.426986+00:00", "updated_at": "2025-07-19T19:56:11.426986+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 560, "benchmark_id": "mmmu", "model_id": "gemini-1.5-flash", "score": 0.623, "normalized_score": 0.623, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.153019+00:00", "updated_at": "2025-07-19T19:56:12.153019+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1376, "benchmark_id": "mrcr", "model_id": "gemini-1.5-flash", "score": 0.719, "normalized_score": 0.719, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.896456+00:00", "updated_at": "2025-07-19T19:56:13.896456+00:00", "benchmark_name": "MRCR" }, { "model_benchmark_id": 1199, "benchmark_id": "natural2code", "model_id": "gemini-1.5-flash", "score": 0.798, "normalized_score": 
0.798, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.525034+00:00", "updated_at": "2025-07-19T19:56:13.525034+00:00", "benchmark_name": "Natural2Code" }, { "model_benchmark_id": 1413, "benchmark_id": "physicsfinals", "model_id": "gemini-1.5-flash", "score": 0.574, "normalized_score": 0.574, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2303.16416", "verified_by_llmstats": false, "analysis_method": "Accuracy (0-shot)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.986673+00:00", "updated_at": "2025-07-19T19:56:13.986673+00:00", "benchmark_name": "PhysicsFinals" }, { "model_benchmark_id": 1369, "benchmark_id": "vibe-eval", "model_id": "gemini-1.5-flash", "score": 0.489, "normalized_score": 0.489, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.882991+00:00", "updated_at": "2025-07-19T19:56:13.882991+00:00", "benchmark_name": "Vibe-Eval" }, { "model_benchmark_id": 1381, "benchmark_id": "video-mme", "model_id": "gemini-1.5-flash", "score": 0.761, "normalized_score": 0.761, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:13.908485+00:00", "updated_at": "2025-07-19T19:56:13.908485+00:00", "benchmark_name": "Video-MME" }, { "model_benchmark_id": 1395, "benchmark_id": "wmt23", "model_id": "gemini-1.5-flash", "score": 0.741, "normalized_score": 0.741, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.940965+00:00", "updated_at": "2025-07-19T19:56:13.940965+00:00", "benchmark_name": "WMT23" }, { "model_benchmark_id": 1419, "benchmark_id": "xstest", "model_id": "gemini-1.5-flash", "score": 0.97, "normalized_score": 0.97, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.004109+00:00", "updated_at": "2025-07-19T19:56:14.004109+00:00", "benchmark_name": "XSTest" } ] ================================================ FILE: data/organizations/google/models/gemini-1.5-flash/model.json ================================================ { "model_id": "gemini-1.5-flash", "name": "Gemini 1.5 Flash", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. It supports audio, images, video, and text input, and produces text output. 
The model is optimized for generating code, extracting data, editing text, and more, making it ideal for narrow, high-frequency tasks.", "release_date": "2024-05-01", "announcement_date": "2024-05-01", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2023-11-01", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-flash", "source_playground": "https://ai.google.dev/studio", "source_paper": "https://arxiv.org/pdf/2403.05530", "source_scorecard_blog_link": "https://deepmind.google/technologies/gemini/flash/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.514569+00:00", "updated_at": "2025-07-19T19:49:05.514569+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemini-1.5-flash-8b/benchmarks.json ================================================ [ { "model_benchmark_id": 1400, "benchmark_id": "fleurs", "model_id": "gemini-1.5-flash-8b", "score": 0.864, "normalized_score": 0.864, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Speech recognition accuracy (1 - WER)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.951665+00:00", "updated_at": "2025-07-19T19:56:13.951665+00:00", "benchmark_name": "FLEURS" }, { "model_benchmark_id": 277, "benchmark_id": "gpqa", "model_id": "gemini-1.5-flash-8b", "score": 0.384, "normalized_score": 0.384, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy on expert-written science questions", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.635441+00:00", "updated_at": "2025-07-19T19:56:11.635441+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1163, "benchmark_id": "hiddenmath", "model_id": "gemini-1.5-flash-8b", "score": 0.328, "normalized_score": 0.328, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy on competition-level math problems", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.447290+00:00", "updated_at": "2025-07-19T19:56:13.447290+00:00", "benchmark_name": "HiddenMath" }, { "model_benchmark_id": 387, "benchmark_id": "math", "model_id": "gemini-1.5-flash-8b", "score": 0.587, "normalized_score": 0.587, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy on mathematical problem-solving tasks", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.834192+00:00", "updated_at": "2025-07-19T19:56:11.834192+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 519, "benchmark_id": "mathvista", "model_id": "gemini-1.5-flash-8b", "score": 0.547, "normalized_score": 0.547, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Visual mathematical reasoning accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.078820+00:00", "updated_at": "2025-07-19T19:56:12.078820+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 173, 
"benchmark_id": "mmlu-pro", "model_id": "gemini-1.5-flash-8b", "score": 0.587, "normalized_score": 0.587, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Multiple choice accuracy across enhanced MMLU dataset with higher difficulty tasks", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.436045+00:00", "updated_at": "2025-07-19T19:56:11.436045+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 561, "benchmark_id": "mmmu", "model_id": "gemini-1.5-flash-8b", "score": 0.537, "normalized_score": 0.537, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Multimodal understanding accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.154594+00:00", "updated_at": "2025-07-19T19:56:12.154594+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1377, "benchmark_id": "mrcr", "model_id": "gemini-1.5-flash-8b", "score": 0.547, "normalized_score": 0.547, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Long-context comprehension accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.898262+00:00", "updated_at": "2025-07-19T19:56:13.898262+00:00", "benchmark_name": "MRCR" }, { "model_benchmark_id": 1203, "benchmark_id": "natural2code", "model_id": "gemini-1.5-flash-8b", "score": 0.755, "normalized_score": 0.755, "is_self_reported": true, "self_reported_source_link": 
"https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Pass rate on code generation tasks across multiple programming languages", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.531432+00:00", "updated_at": "2025-07-19T19:56:13.531432+00:00", "benchmark_name": "Natural2Code" }, { "model_benchmark_id": 1370, "benchmark_id": "vibe-eval", "model_id": "gemini-1.5-flash-8b", "score": 0.409, "normalized_score": 0.409, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Visual understanding evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.885058+00:00", "updated_at": "2025-07-19T19:56:13.885058+00:00", "benchmark_name": "Vibe-Eval" }, { "model_benchmark_id": 1382, "benchmark_id": "video-mme", "model_id": "gemini-1.5-flash-8b", "score": 0.662, "normalized_score": 0.662, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Video analysis accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.910273+00:00", "updated_at": "2025-07-19T19:56:13.910273+00:00", "benchmark_name": "Video-MME" }, { "model_benchmark_id": 1396, "benchmark_id": "wmt23", "model_id": "gemini-1.5-flash-8b", "score": 0.726, "normalized_score": 0.726, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Translation quality score", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.942779+00:00", "updated_at": "2025-07-19T19:56:13.942779+00:00", "benchmark_name": "WMT23" }, { "model_benchmark_id": 1420, "benchmark_id": "xstest", "model_id": "gemini-1.5-flash-8b", "score": 0.926, "normalized_score": 0.926, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/", "verified_by_llmstats": false, "analysis_method": "Safe request fulfillment rate", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.005888+00:00", "updated_at": "2025-07-19T19:56:14.005888+00:00", "benchmark_name": "XSTest" } ] ================================================ FILE: data/organizations/google/models/gemini-1.5-flash-8b/model.json ================================================ { "model_id": "gemini-1.5-flash-8b", "name": "Gemini 1.5 Flash 8B", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "A multimodal model capable of processing audio, images, video, and text with high efficiency. Features JSON mode, function calling, code execution, and system instructions support. 
Optimized for fast inference with 8B parameters.", "release_date": "2024-03-15", "announcement_date": "2024-03-15", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-10-01", "param_count": 8000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://ai.google.dev/docs/gemini_1.5_flash", "source_playground": "https://ai.google.dev/studio", "source_paper": null, "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/google/generative-ai", "source_weights_link": null, "created_at": "2025-07-19T19:49:05.530672+00:00", "updated_at": "2025-07-19T19:49:05.530672+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemini-1.5-pro/benchmarks.json ================================================ [ { "model_benchmark_id": 1416, "benchmark_id": "amc-2022-23", "model_id": "gemini-1.5-pro", "score": 0.464, "normalized_score": 0.464, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2403.05530", "verified_by_llmstats": false, "analysis_method": "4-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.995700+00:00", "updated_at": "2025-07-19T19:56:13.995700+00:00", "benchmark_name": "AMC_2022_23" }, { "model_benchmark_id": 1070, "benchmark_id": "big-bench-hard", "model_id": "gemini-1.5-pro", "score": 0.892, "normalized_score": 0.892, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2403.05530", "verified_by_llmstats": false, "analysis_method": "3-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.231702+00:00", "updated_at": "2025-07-19T19:56:13.231702+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 945, "benchmark_id": "drop", "model_id": 
"gemini-1.5-pro", "score": 0.749, "normalized_score": 0.749, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2403.05530", "verified_by_llmstats": false, "analysis_method": "Variable shots", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.994980+00:00", "updated_at": "2025-07-19T19:56:12.994980+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 1398, "benchmark_id": "fleurs", "model_id": "gemini-1.5-pro", "score": 0.067, "normalized_score": 0.067, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Word Error Rate", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.947638+00:00", "updated_at": "2025-07-19T19:56:13.947638+00:00", "benchmark_name": "FLEURS" }, { "model_benchmark_id": 1414, "benchmark_id": "functionalmath", "model_id": "gemini-1.5-pro", "score": 0.646, "normalized_score": 0.646, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2403.05530", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.990248+00:00", "updated_at": "2025-07-19T19:56:13.990248+00:00", "benchmark_name": "FunctionalMATH" }, { "model_benchmark_id": 268, "benchmark_id": "gpqa", "model_id": "gemini-1.5-pro", "score": 0.591, "normalized_score": 0.591, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:11.614440+00:00", "updated_at": "2025-07-19T19:56:11.614440+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 979, "benchmark_id": "gsm8k", "model_id": "gemini-1.5-pro", "score": 0.908, "normalized_score": 0.908, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2403.05530", "verified_by_llmstats": false, "analysis_method": "11-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.055992+00:00", "updated_at": "2025-07-19T19:56:13.055992+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 37, "benchmark_id": "hellaswag", "model_id": "gemini-1.5-pro", "score": 0.933, "normalized_score": 0.933, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2403.05530", "verified_by_llmstats": false, "analysis_method": "10-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.158919+00:00", "updated_at": "2025-07-19T19:56:11.158919+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 1157, "benchmark_id": "hiddenmath", "model_id": "gemini-1.5-pro", "score": 0.52, "normalized_score": 0.52, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.434888+00:00", "updated_at": "2025-07-19T19:56:13.434888+00:00", "benchmark_name": "HiddenMath" }, { "model_benchmark_id": 766, "benchmark_id": "humaneval", "model_id": "gemini-1.5-pro", "score": 0.841, "normalized_score": 0.841, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2403.05530", "verified_by_llmstats": false, 
"analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.613548+00:00", "updated_at": "2025-07-19T19:56:12.613548+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 381, "benchmark_id": "math", "model_id": "gemini-1.5-pro", "score": 0.865, "normalized_score": 0.865, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.822515+00:00", "updated_at": "2025-07-19T19:56:11.822515+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 517, "benchmark_id": "mathvista", "model_id": "gemini-1.5-pro", "score": 0.681, "normalized_score": 0.681, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.075702+00:00", "updated_at": "2025-07-19T19:56:12.075702+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1275, "benchmark_id": "mgsm", "model_id": "gemini-1.5-pro", "score": 0.875, "normalized_score": 0.875, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2403.05530", "verified_by_llmstats": false, "analysis_method": "8-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.674684+00:00", "updated_at": "2025-07-19T19:56:13.674684+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 67, "benchmark_id": "mmlu", "model_id": "gemini-1.5-pro", "score": 
0.859, "normalized_score": 0.859, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2403.05530", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.226593+00:00", "updated_at": "2025-07-19T19:56:11.226593+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 167, "benchmark_id": "mmlu-pro", "model_id": "gemini-1.5-pro", "score": 0.758, "normalized_score": 0.758, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.425109+00:00", "updated_at": "2025-07-19T19:56:11.425109+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 556, "benchmark_id": "mmmu", "model_id": "gemini-1.5-pro", "score": 0.659, "normalized_score": 0.659, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.145100+00:00", "updated_at": "2025-07-19T19:56:12.145100+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1373, "benchmark_id": "mrcr", "model_id": "gemini-1.5-pro", "score": 0.826, "normalized_score": 0.826, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.891629+00:00", 
"updated_at": "2025-07-19T19:56:13.891629+00:00", "benchmark_name": "MRCR" }, { "model_benchmark_id": 1198, "benchmark_id": "natural2code", "model_id": "gemini-1.5-pro", "score": 0.854, "normalized_score": 0.854, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.523328+00:00", "updated_at": "2025-07-19T19:56:13.523328+00:00", "benchmark_name": "Natural2Code" }, { "model_benchmark_id": 1412, "benchmark_id": "physicsfinals", "model_id": "gemini-1.5-pro", "score": 0.639, "normalized_score": 0.639, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2403.05530", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.984883+00:00", "updated_at": "2025-07-19T19:56:13.984883+00:00", "benchmark_name": "PhysicsFinals" }, { "model_benchmark_id": 1366, "benchmark_id": "vibe-eval", "model_id": "gemini-1.5-pro", "score": 0.539, "normalized_score": 0.539, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.877591+00:00", "updated_at": "2025-07-19T19:56:13.877591+00:00", "benchmark_name": "Vibe-Eval" }, { "model_benchmark_id": 1380, "benchmark_id": "video-mme", "model_id": "gemini-1.5-pro", "score": 0.786, "normalized_score": 0.786, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/", "verified_by_llmstats": 
false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.906552+00:00", "updated_at": "2025-07-19T19:56:13.906552+00:00", "benchmark_name": "Video-MME" }, { "model_benchmark_id": 1394, "benchmark_id": "wmt23", "model_id": "gemini-1.5-pro", "score": 0.751, "normalized_score": 0.751, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.939104+00:00", "updated_at": "2025-07-19T19:56:13.939104+00:00", "benchmark_name": "WMT23" }, { "model_benchmark_id": 1418, "benchmark_id": "xstest", "model_id": "gemini-1.5-pro", "score": 0.988, "normalized_score": 0.988, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Safety Compliance", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.002222+00:00", "updated_at": "2025-07-19T19:56:14.002222+00:00", "benchmark_name": "XSTest" } ] ================================================ FILE: data/organizations/google/models/gemini-1.5-pro/model.json ================================================ { "model_id": "gemini-1.5-pro", "name": "Gemini 1.5 Pro", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemini 1.5 Pro is a mid-size multimodal model optimized for a wide range of reasoning tasks. 
It can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text.", "release_date": "2024-05-01", "announcement_date": "2024-05-01", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2023-11-01", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-pro", "source_playground": "https://ai.google.dev/studio", "source_paper": "https://arxiv.org/pdf/2403.05530", "source_scorecard_blog_link": "https://deepmind.google/technologies/gemini/pro/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.481673+00:00", "updated_at": "2025-07-19T19:49:05.481673+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemini-2.0-flash/benchmarks.json ================================================ [ { "model_benchmark_id": 1152, "benchmark_id": "bird-sql-(dev)", "model_id": "gemini-2.0-flash", "score": 0.569, "normalized_score": 0.569, "is_self_reported": true, "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/", "verified_by_llmstats": false, "analysis_method": "Natural language to SQL conversion evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.423568+00:00", "updated_at": "2025-07-19T19:56:13.423568+00:00", "benchmark_name": "Bird-SQL (dev)" }, { "model_benchmark_id": 1404, "benchmark_id": "covost2", "model_id": "gemini-2.0-flash", "score": 0.392, "normalized_score": 0.392, "is_self_reported": true, "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/", "verified_by_llmstats": false, "analysis_method": "Automatic speech 
translation (BLEU score) across 21 languages", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.962212+00:00", "updated_at": "2025-07-19T19:56:13.962212+00:00", "benchmark_name": "CoVoST2" }, { "model_benchmark_id": 922, "benchmark_id": "egoschema", "model_id": "gemini-2.0-flash", "score": 0.715, "normalized_score": 0.715, "is_self_reported": true, "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/", "verified_by_llmstats": false, "analysis_method": "Video analysis across multiple domains", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.926117+00:00", "updated_at": "2025-07-19T19:56:12.926117+00:00", "benchmark_name": "EgoSchema" }, { "model_benchmark_id": 1095, "benchmark_id": "facts-grounding", "model_id": "gemini-2.0-flash", "score": 0.836, "normalized_score": 0.836, "is_self_reported": true, "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/", "verified_by_llmstats": false, "analysis_method": "Ability to provide factuality correct responses given documents and diverse user requests", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.278460+00:00", "updated_at": "2025-07-19T19:56:13.278460+00:00", "benchmark_name": "FACTS Grounding" }, { "model_benchmark_id": 279, "benchmark_id": "gpqa", "model_id": "gemini-2.0-flash", "score": 0.621, "normalized_score": 0.621, "is_self_reported": true, "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/", "verified_by_llmstats": false, "analysis_method": "Challenging dataset of questions written by domain experts in biology, 
physics, and chemistry", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.639283+00:00", "updated_at": "2025-07-19T19:56:11.639283+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1164, "benchmark_id": "hiddenmath", "model_id": "gemini-2.0-flash", "score": 0.63, "normalized_score": 0.63, "is_self_reported": true, "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/", "verified_by_llmstats": false, "analysis_method": "Competition-level math problems, Held out dataset AIME/AMC-like", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.449979+00:00", "updated_at": "2025-07-19T19:56:13.449979+00:00", "benchmark_name": "HiddenMath" }, { "model_benchmark_id": 1111, "benchmark_id": "livecodebench", "model_id": "gemini-2.0-flash", "score": 0.351, "normalized_score": 0.351, "is_self_reported": true, "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/", "verified_by_llmstats": false, "analysis_method": "Code generation in Python. 
Code Generation subset covering more recent examples: 06/01/2024 - 10/05/2024", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.317443+00:00", "updated_at": "2025-07-19T19:56:13.317443+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 388, "benchmark_id": "math", "model_id": "gemini-2.0-flash", "score": 0.897, "normalized_score": 0.897, "is_self_reported": true, "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/", "verified_by_llmstats": false, "analysis_method": "Challenging math problems including algebra, geometry, pre-calculus, and others", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.835842+00:00", "updated_at": "2025-07-19T19:56:11.835842+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 174, "benchmark_id": "mmlu-pro", "model_id": "gemini-2.0-flash", "score": 0.764, "normalized_score": 0.764, "is_self_reported": true, "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/", "verified_by_llmstats": false, "analysis_method": "Enhanced version of MMLU dataset evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.437540+00:00", "updated_at": "2025-07-19T19:56:11.437540+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 562, "benchmark_id": "mmmu", "model_id": "gemini-2.0-flash", "score": 0.707, "normalized_score": 0.707, "is_self_reported": true, "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/", "verified_by_llmstats": false, "analysis_method": "Multi-discipline college-level multimodal understanding and 
reasoning problems", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.156776+00:00", "updated_at": "2025-07-19T19:56:12.156776+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1378, "benchmark_id": "mrcr", "model_id": "gemini-2.0-flash", "score": 0.692, "normalized_score": 0.692, "is_self_reported": true, "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/", "verified_by_llmstats": false, "analysis_method": "Novel, diagnostic long-context understanding evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.900780+00:00", "updated_at": "2025-07-19T19:56:13.900780+00:00", "benchmark_name": "MRCR" }, { "model_benchmark_id": 1204, "benchmark_id": "natural2code", "model_id": "gemini-2.0-flash", "score": 0.929, "normalized_score": 0.929, "is_self_reported": true, "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/", "verified_by_llmstats": false, "analysis_method": "Code generation evaluation across multiple languages", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.533525+00:00", "updated_at": "2025-07-19T19:56:13.533525+00:00", "benchmark_name": "Natural2Code" }, { "model_benchmark_id": 1371, "benchmark_id": "vibe-eval", "model_id": "gemini-2.0-flash", "score": 0.563, "normalized_score": 0.563, "is_self_reported": true, "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/", "verified_by_llmstats": false, "analysis_method": "Visual understanding in chat models with challenging everyday examples", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.886575+00:00", "updated_at": "2025-07-19T19:56:13.886575+00:00", "benchmark_name": "Vibe-Eval" } ] ================================================ FILE: data/organizations/google/models/gemini-2.0-flash/model.json ================================================ { "model_id": "gemini-2.0-flash", "name": "Gemini 2.0 Flash", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Next-generation model featuring superior speed, native tool use, multimodal generation, and a 1M token context window. Supports audio, images, video, and text input with capabilities for structured outputs, function calling, code execution, search, and multimodal operations.", "release_date": "2024-12-01", "announcement_date": "2024-12-01", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-08-01", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.0-flash", "source_playground": "https://ai.google.dev/studio", "source_paper": null, "source_scorecard_blog_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.538624+00:00", "updated_at": "2025-07-19T19:49:05.538624+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemini-2.0-flash-lite/benchmarks.json ================================================ [ { "model_benchmark_id": 1148, "benchmark_id": "bird-sql-(dev)", "model_id": "gemini-2.0-flash-lite", "score": 0.574, "normalized_score": 0.574, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models", "verified_by_llmstats": false, "analysis_method": "- evaluation", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.415349+00:00", "updated_at": "2025-07-19T19:56:13.415349+00:00", "benchmark_name": "Bird-SQL (dev)" }, { "model_benchmark_id": 1403, "benchmark_id": "covost2", "model_id": "gemini-2.0-flash-lite", "score": 0.384, "normalized_score": 0.384, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models", "verified_by_llmstats": false, "analysis_method": "Automatic speech translation (BLEU score) across 21 languages", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.960537+00:00", "updated_at": "2025-07-19T19:56:13.960537+00:00", "benchmark_name": "CoVoST2" }, { "model_benchmark_id": 921, "benchmark_id": "egoschema", "model_id": "gemini-2.0-flash-lite", "score": 0.672, "normalized_score": 0.672, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models", "verified_by_llmstats": false, "analysis_method": "Video analysis across multiple domains", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.924659+00:00", "updated_at": "2025-07-19T19:56:12.924659+00:00", "benchmark_name": "EgoSchema" }, { "model_benchmark_id": 1088, "benchmark_id": "facts-grounding", "model_id": "gemini-2.0-flash-lite", "score": 0.836, "normalized_score": 0.836, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models", "verified_by_llmstats": false, "analysis_method": "- evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.264333+00:00", "updated_at": "2025-07-19T19:56:13.264333+00:00", 
"benchmark_name": "FACTS Grounding" }, { "model_benchmark_id": 1209, "benchmark_id": "global-mmlu-lite", "model_id": "gemini-2.0-flash-lite", "score": 0.782, "normalized_score": 0.782, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.543616+00:00", "updated_at": "2025-07-19T19:56:13.543616+00:00", "benchmark_name": "Global-MMLU-Lite" }, { "model_benchmark_id": 266, "benchmark_id": "gpqa", "model_id": "gemini-2.0-flash-lite", "score": 0.515, "normalized_score": 0.515, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models", "verified_by_llmstats": false, "analysis_method": "Diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.611234+00:00", "updated_at": "2025-07-19T19:56:11.611234+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1156, "benchmark_id": "hiddenmath", "model_id": "gemini-2.0-flash-lite", "score": 0.553, "normalized_score": 0.553, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.433332+00:00", "updated_at": "2025-07-19T19:56:13.433332+00:00", "benchmark_name": "HiddenMath" }, { "model_benchmark_id": 1320, "benchmark_id": "livecodebench-v5", "model_id": "gemini-2.0-flash-lite", "score": 0.289, "normalized_score": 0.289, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models", 
"verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.771288+00:00", "updated_at": "2025-07-19T19:56:13.771288+00:00", "benchmark_name": "LiveCodeBench v5" }, { "model_benchmark_id": 379, "benchmark_id": "math", "model_id": "gemini-2.0-flash-lite", "score": 0.868, "normalized_score": 0.868, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.819524+00:00", "updated_at": "2025-07-19T19:56:11.819524+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 166, "benchmark_id": "mmlu-pro", "model_id": "gemini-2.0-flash-lite", "score": 0.716, "normalized_score": 0.716, "is_self_reported": true, "self_reported_source_link": "https://developers.googleblog.com/en/gemini-2-family-expands/", "verified_by_llmstats": false, "analysis_method": "Chain-of-Thought accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.423223+00:00", "updated_at": "2025-07-19T19:56:11.423223+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 554, "benchmark_id": "mmmu", "model_id": "gemini-2.0-flash-lite", "score": 0.68, "normalized_score": 0.68, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models", "verified_by_llmstats": false, "analysis_method": "Multi-discipline college-level multimodal understanding and reasoning problems", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.141505+00:00", 
"updated_at": "2025-07-19T19:56:12.141505+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1402, "benchmark_id": "mrcr-1m", "model_id": "gemini-2.0-flash-lite", "score": 0.58, "normalized_score": 0.58, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models", "verified_by_llmstats": false, "analysis_method": "Long-context comprehension accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.956748+00:00", "updated_at": "2025-07-19T19:56:13.956748+00:00", "benchmark_name": "MRCR 1M" }, { "model_benchmark_id": 226, "benchmark_id": "simpleqa", "model_id": "gemini-2.0-flash-lite", "score": 0.217, "normalized_score": 0.217, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models", "verified_by_llmstats": false, "analysis_method": "Factuality", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.535234+00:00", "updated_at": "2025-07-19T19:56:11.535234+00:00", "benchmark_name": "SimpleQA" } ] ================================================ FILE: data/organizations/google/models/gemini-2.0-flash-lite/model.json ================================================ { "model_id": "gemini-2.0-flash-lite", "name": "Gemini 2.0 Flash-Lite", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "A Gemini 2.0 Flash model optimized for cost efficiency and low latency", "release_date": "2025-02-05", "announcement_date": "2025-02-05", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-06-01", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash-lite", "source_playground": 
"https://aistudio.google.com/prompts/new_chat?model=gemini-2.0-flash-lite", "source_paper": null, "source_scorecard_blog_link": "https://developers.googleblog.com/en/gemini-2-family-expands", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.469548+00:00", "updated_at": "2025-07-19T19:49:05.469548+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemini-2.0-flash-thinking/benchmarks.json ================================================ [ { "model_benchmark_id": 448, "benchmark_id": "aime-2024", "model_id": "gemini-2.0-flash-thinking", "score": 0.733, "normalized_score": 0.733, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models/gemini#evaluation", "verified_by_llmstats": false, "analysis_method": "Enhanced reasoning on competition-level math prompts", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.952263+00:00", "updated_at": "2025-07-19T19:56:11.952263+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 271, "benchmark_id": "gpqa", "model_id": "gemini-2.0-flash-thinking", "score": 0.742, "normalized_score": 0.742, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models/gemini#evaluation", "verified_by_llmstats": false, "analysis_method": "Challenging science questions requiring chain-of-thought reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.620752+00:00", "updated_at": "2025-07-19T19:56:11.620752+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 559, "benchmark_id": "mmmu", "model_id": "gemini-2.0-flash-thinking", "score": 0.754, "normalized_score": 0.754, "is_self_reported": true, "self_reported_source_link": 
"https://ai.google.dev/gemini-api/docs/models/gemini#evaluation", "verified_by_llmstats": false, "analysis_method": "Image-text QA across various domains", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.151038+00:00", "updated_at": "2025-07-19T19:56:12.151038+00:00", "benchmark_name": "MMMU" } ] ================================================ FILE: data/organizations/google/models/gemini-2.0-flash-thinking/model.json ================================================ { "model_id": "gemini-2.0-flash-thinking", "name": "Gemini 2.0 Flash Thinking", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemini 2.0 Flash Thinking is a enhanced reasoning model, capable of showing its thoughts to improve performance and explainability. Combining speed and performance, Gemini 2.0 Flash Thinking also excels in science and math, showing its thinking to solve complex problems.", "release_date": "2025-01-21", "announcement_date": "2025-01-21", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-08-01", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.0-flash-thinking-experimental", "source_playground": "https://ai.google.dev/studio", "source_paper": null, "source_scorecard_blog_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.504495+00:00", "updated_at": "2025-07-19T19:49:05.504495+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemini-2.5-flash/benchmarks.json ================================================ [ { "model_benchmark_id": 661, "benchmark_id": "aider-polyglot", "model_id": "gemini-2.5-flash", "score": 
0.619, "normalized_score": 0.619, "is_self_reported": true, "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/", "verified_by_llmstats": false, "analysis_method": "whole", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.370513+00:00", "updated_at": "2025-07-19T19:56:12.370513+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 1329, "benchmark_id": "aider-polyglot-edit", "model_id": "gemini-2.5-flash", "score": 0.567, "normalized_score": 0.567, "is_self_reported": true, "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-updates-io-2025", "verified_by_llmstats": false, "analysis_method": "Diff-Fenced", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.795058+00:00", "updated_at": "2025-07-19T19:56:13.795058+00:00", "benchmark_name": "Aider-Polyglot Edit" }, { "model_benchmark_id": 447, "benchmark_id": "aime-2024", "model_id": "gemini-2.5-flash", "score": 0.88, "normalized_score": 0.88, "is_self_reported": true, "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.950448+00:00", "updated_at": "2025-07-19T19:56:11.950448+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 683, "benchmark_id": "aime-2025", "model_id": "gemini-2.5-flash", "score": 0.72, "normalized_score": 0.72, "is_self_reported": true, "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/", "verified_by_llmstats": false, "analysis_method": "Pass@1", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.428509+00:00", "updated_at": "2025-07-19T19:56:12.428509+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1091, "benchmark_id": "facts-grounding", "model_id": "gemini-2.5-flash", "score": 0.853, "normalized_score": 0.853, "is_self_reported": true, "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.271323+00:00", "updated_at": "2025-07-19T19:56:13.271323+00:00", "benchmark_name": "FACTS Grounding" }, { "model_benchmark_id": 1212, "benchmark_id": "global-mmlu-lite", "model_id": "gemini-2.5-flash", "score": 0.884, "normalized_score": 0.884, "is_self_reported": true, "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.550549+00:00", "updated_at": "2025-07-19T19:56:13.550549+00:00", "benchmark_name": "Global-MMLU-Lite" }, { "model_benchmark_id": 270, "benchmark_id": "gpqa", "model_id": "gemini-2.5-flash", "score": 0.828, "normalized_score": 0.828, "is_self_reported": true, "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.619078+00:00", "updated_at": "2025-07-19T19:56:11.619078+00:00", "benchmark_name": 
"GPQA" }, { "model_benchmark_id": 720, "benchmark_id": "humanity's-last-exam", "model_id": "gemini-2.5-flash", "score": 0.11, "normalized_score": 0.11, "is_self_reported": true, "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.518055+00:00", "updated_at": "2025-07-19T19:56:12.518055+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 1321, "benchmark_id": "livecodebench-v5", "model_id": "gemini-2.5-flash", "score": 0.639, "normalized_score": 0.639, "is_self_reported": true, "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.773194+00:00", "updated_at": "2025-07-19T19:56:13.773194+00:00", "benchmark_name": "LiveCodeBench v5" }, { "model_benchmark_id": 558, "benchmark_id": "mmmu", "model_id": "gemini-2.5-flash", "score": 0.797, "normalized_score": 0.797, "is_self_reported": true, "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.148985+00:00", "updated_at": "2025-07-19T19:56:12.148985+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1374, "benchmark_id": "mrcr", "model_id": "gemini-2.5-flash", "score": 0.32, "normalized_score": 0.32, "is_self_reported": true, "self_reported_source_link": 
"https://blog.google/technology/google-deepmind/google-gemini-updates-io-2025", "verified_by_llmstats": false, "analysis_method": "1M-pointwise", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.893404+00:00", "updated_at": "2025-07-19T19:56:13.895016+00:00", "benchmark_name": "MRCR" }, { "model_benchmark_id": 229, "benchmark_id": "simpleqa", "model_id": "gemini-2.5-flash", "score": 0.269, "normalized_score": 0.269, "is_self_reported": true, "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.540281+00:00", "updated_at": "2025-07-19T19:56:11.540281+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 1341, "benchmark_id": "swe-bench-verified", "model_id": "gemini-2.5-flash", "score": 0.604, "normalized_score": 0.604, "is_self_reported": true, "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.822771+00:00", "updated_at": "2025-07-19T19:56:13.822771+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1368, "benchmark_id": "vibe-eval", "model_id": "gemini-2.5-flash", "score": 0.654, "normalized_score": 0.654, "is_self_reported": true, "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:13.880772+00:00", "updated_at": "2025-07-19T19:56:13.880772+00:00", "benchmark_name": "Vibe-Eval" } ] ================================================ FILE: data/organizations/google/models/gemini-2.5-flash/model.json ================================================ { "model_id": "gemini-2.5-flash", "name": "Gemini 2.5 Flash", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "A thinking model designed for a balance between price and performance. It builds upon Gemini 2.0 Flash with upgraded reasoning, hybrid thinking control, multimodal capabilities (text, image, video, audio input), and a 1M token input context window.", "release_date": "2025-05-20", "announcement_date": "2025-05-20", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2025-01-31", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://ai.google.dev/gemini-api/docs/models?hl=en#gemini-2.5-flash-preview-04-17", "source_playground": "https://aistudio.google.com/?model=gemini-2.5-flash-preview-04-17", "source_paper": null, "source_scorecard_blog_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.500918+00:00", "updated_at": "2025-07-19T19:49:05.500918+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemini-2.5-flash-lite/benchmarks.json ================================================ [ { "model_benchmark_id": 659, "benchmark_id": "aider-polyglot", "model_id": "gemini-2.5-flash-lite", "score": 0.267, "normalized_score": 0.267, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/", "verified_by_llmstats": false, "analysis_method": "Code editing", "verification_provider_id": null, "verification_hardware": 
null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.366506+00:00", "updated_at": "2025-07-19T19:56:12.366506+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 681, "benchmark_id": "aime-2025", "model_id": "gemini-2.5-flash-lite", "score": 0.498, "normalized_score": 0.498, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/", "verified_by_llmstats": false, "analysis_method": "Mathematics", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.422347+00:00", "updated_at": "2025-07-19T19:56:12.422347+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1406, "benchmark_id": "arc", "model_id": "gemini-2.5-flash-lite", "score": 0.025, "normalized_score": 0.025, "is_self_reported": true, "self_reported_source_link": "https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash-lite", "verified_by_llmstats": false, "analysis_method": "Default", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.969921+00:00", "updated_at": "2025-07-19T19:56:13.969921+00:00", "benchmark_name": "Arc" }, { "model_benchmark_id": 1089, "benchmark_id": "facts-grounding", "model_id": "gemini-2.5-flash-lite", "score": 0.841, "normalized_score": 0.841, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/", "verified_by_llmstats": false, "analysis_method": "Factuality", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.267251+00:00", "updated_at": "2025-07-19T19:56:13.267251+00:00", "benchmark_name": "FACTS Grounding" }, { "model_benchmark_id": 1210, "benchmark_id": "global-mmlu-lite", "model_id": 
"gemini-2.5-flash-lite", "score": 0.811, "normalized_score": 0.811, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/", "verified_by_llmstats": false, "analysis_method": "Multilingual performance", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.546251+00:00", "updated_at": "2025-07-19T19:56:13.546251+00:00", "benchmark_name": "Global-MMLU-Lite" }, { "model_benchmark_id": 267, "benchmark_id": "gpqa", "model_id": "gemini-2.5-flash-lite", "score": 0.646, "normalized_score": 0.646, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/", "verified_by_llmstats": false, "analysis_method": "Diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.612808+00:00", "updated_at": "2025-07-19T19:56:11.612808+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 718, "benchmark_id": "humanity's-last-exam", "model_id": "gemini-2.5-flash-lite", "score": 0.051, "normalized_score": 0.051, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/", "verified_by_llmstats": false, "analysis_method": "No tools", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.514286+00:00", "updated_at": "2025-07-19T19:56:12.514286+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 1104, "benchmark_id": "livecodebench", "model_id": "gemini-2.5-flash-lite", "score": 0.337, "normalized_score": 0.337, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/", "verified_by_llmstats": false, "analysis_method": "Code generation", "verification_provider_id": 
null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.300809+00:00", "updated_at": "2025-07-19T19:56:13.300809+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 555, "benchmark_id": "mmmu", "model_id": "gemini-2.5-flash-lite", "score": 0.729, "normalized_score": 0.729, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/", "verified_by_llmstats": false, "analysis_method": "Visual reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.143254+00:00", "updated_at": "2025-07-19T19:56:12.143254+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1405, "benchmark_id": "mrcr-v2", "model_id": "gemini-2.5-flash-lite", "score": 0.166, "normalized_score": 0.166, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/", "verified_by_llmstats": false, "analysis_method": "Long context 128k average. 
8 needle.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.966057+00:00", "updated_at": "2025-07-19T19:56:13.966057+00:00", "benchmark_name": "MRCR v2" }, { "model_benchmark_id": 227, "benchmark_id": "simpleqa", "model_id": "gemini-2.5-flash-lite", "score": 0.107, "normalized_score": 0.107, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/", "verified_by_llmstats": false, "analysis_method": "Factuality", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.536893+00:00", "updated_at": "2025-07-19T19:56:11.536893+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 1339, "benchmark_id": "swe-bench-verified", "model_id": "gemini-2.5-flash-lite", "score": 0.316, "normalized_score": 0.316, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/", "verified_by_llmstats": false, "analysis_method": "Agentic coding single attempt", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.819222+00:00", "updated_at": "2025-07-19T19:56:13.819222+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1365, "benchmark_id": "vibe-eval", "model_id": "gemini-2.5-flash-lite", "score": 0.513, "normalized_score": 0.513, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/", "verified_by_llmstats": false, "analysis_method": "Reka", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.875989+00:00", "updated_at": "2025-07-19T19:56:13.875989+00:00", "benchmark_name": "Vibe-Eval" } ] 
================================================ FILE: data/organizations/google/models/gemini-2.5-flash-lite/model.json ================================================ { "model_id": "gemini-2.5-flash-lite", "name": "Gemini 2.5 Flash-Lite", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemini 2.5 Flash-Lite is a model developed by Google DeepMind, designed to handle various tasks including reasoning, science, mathematics, code generation, and more. It features advanced capabilities in multilingual performance and long context understanding. It is optimized for low latency use cases, supporting multimodal input with a 1 million-token context length.", "release_date": "2025-06-17", "announcement_date": "2025-06-17", "license_id": "creative_commons_attribution_4_0_license", "multimodal": true, "knowledge_cutoff": "2025-01-01", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash-lite", "source_playground": "https://ai.google.com/studio", "source_paper": "https://arxiv.org/abs/2503.16534", "source_scorecard_blog_link": "https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash-lite", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.473471+00:00", "updated_at": "2025-07-19T19:49:05.473471+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemini-2.5-pro/benchmarks.json ================================================ [ { "model_benchmark_id": 658, "benchmark_id": "aider-polyglot", "model_id": "gemini-2.5-pro", "score": 0.765, "normalized_score": 0.765, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.364634+00:00", "updated_at": "2025-07-19T19:56:12.364634+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 1328, "benchmark_id": "aider-polyglot-edit", "model_id": "gemini-2.5-pro", "score": 0.727, "normalized_score": 0.727, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Diff", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.793176+00:00", "updated_at": "2025-07-19T19:56:13.793176+00:00", "benchmark_name": "Aider-Polyglot Edit" }, { "model_benchmark_id": 446, "benchmark_id": "aime-2024", "model_id": "gemini-2.5-pro", "score": 0.92, "normalized_score": 0.92, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.948567+00:00", "updated_at": "2025-07-19T19:56:11.948567+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 679, "benchmark_id": "aime-2025", "model_id": "gemini-2.5-pro", "score": 0.83, "normalized_score": 0.83, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.417055+00:00", "updated_at": "2025-07-19T19:56:12.417055+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1385, "benchmark_id": "arc-agi-v2", "model_id": "gemini-2.5-pro", "score": 0.049, "normalized_score": 0.049, "is_self_reported": 
false, "self_reported_source_link": "https://x.com/xai/status/1943158495588815072", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.918991+00:00", "updated_at": "2025-07-19T19:56:13.918991+00:00", "benchmark_name": "ARC-AGI v2" }, { "model_benchmark_id": 1207, "benchmark_id": "global-mmlu-lite", "model_id": "gemini-2.5-pro", "score": 0.886, "normalized_score": 0.886, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.540318+00:00", "updated_at": "2025-07-19T19:56:13.540318+00:00", "benchmark_name": "Global-MMLU-Lite" }, { "model_benchmark_id": 263, "benchmark_id": "gpqa", "model_id": "gemini-2.5-pro", "score": 0.83, "normalized_score": 0.83, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.605360+00:00", "updated_at": "2025-07-19T19:56:11.605360+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 717, "benchmark_id": "humanity's-last-exam", "model_id": "gemini-2.5-pro", "score": 0.178, "normalized_score": 0.178, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.511856+00:00", "updated_at": 
"2025-07-19T19:56:12.511856+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 1318, "benchmark_id": "livecodebench-v5", "model_id": "gemini-2.5-pro", "score": 0.756, "normalized_score": 0.756, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.763325+00:00", "updated_at": "2025-07-19T19:56:13.763325+00:00", "benchmark_name": "LiveCodeBench v5" }, { "model_benchmark_id": 552, "benchmark_id": "mmmu", "model_id": "gemini-2.5-pro", "score": 0.796, "normalized_score": 0.796, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.137517+00:00", "updated_at": "2025-07-19T19:56:12.137517+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1372, "benchmark_id": "mrcr", "model_id": "gemini-2.5-pro", "score": 0.93, "normalized_score": 0.93, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "128k-average", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.889867+00:00", "updated_at": "2025-07-19T19:56:13.889867+00:00", "benchmark_name": "MRCR" }, { "model_benchmark_id": 1384, "benchmark_id": "mrcr-1m-(pointwise)", "model_id": "gemini-2.5-pro", "score": 0.829, "normalized_score": 0.829, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/pro/", "verified_by_llmstats": false, 
"analysis_method": "Pointwise", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.915166+00:00", "updated_at": "2025-07-19T19:56:13.915166+00:00", "benchmark_name": "MRCR 1M (pointwise)" }, { "model_benchmark_id": 225, "benchmark_id": "simpleqa", "model_id": "gemini-2.5-pro", "score": 0.508, "normalized_score": 0.508, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.532774+00:00", "updated_at": "2025-07-19T19:56:11.532774+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 1338, "benchmark_id": "swe-bench-verified", "model_id": "gemini-2.5-pro", "score": 0.632, "normalized_score": 0.632, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.816932+00:00", "updated_at": "2025-07-19T19:56:13.816932+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1364, "benchmark_id": "vibe-eval", "model_id": "gemini-2.5-pro", "score": 0.656, "normalized_score": 0.656, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.874453+00:00", "updated_at": "2025-07-19T19:56:13.874453+00:00", "benchmark_name": "Vibe-Eval" }, { "model_benchmark_id": 1379, "benchmark_id": 
"video-mme", "model_id": "gemini-2.5-pro", "score": 0.848, "normalized_score": 0.848, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini/pro/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.904547+00:00", "updated_at": "2025-07-19T19:56:13.904547+00:00", "benchmark_name": "Video-MME" } ] ================================================ FILE: data/organizations/google/models/gemini-2.5-pro/model.json ================================================ { "model_id": "gemini-2.5-pro", "name": "Gemini 2.5 Pro", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Our most intelligent AI model, built for the agentic era. Gemini 2.5 Pro leads on common benchmarks with enhanced reasoning, multimodal capabilities (text, image, video, audio input), and a 1M token context window.", "release_date": "2025-05-20", "announcement_date": "2025-05-20", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2025-01-31", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://ai.google.dev/gemini-api/docs/models?hl=en#gemini-2.5-pro-preview-03-25", "source_playground": "https://aistudio.google.com/?model=gemini-2.5-pro-preview-03-25", "source_paper": "https://storage.googleapis.com/model-cards/documents/gemini-2.5-pro-preview.pdf", "source_scorecard_blog_link": "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.458697+00:00", "updated_at": "2025-07-19T19:49:05.458697+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemini-2.5-pro-preview-06-05/benchmarks.json 
================================================ [ { "model_benchmark_id": 660, "benchmark_id": "aider-polyglot", "model_id": "gemini-2.5-pro-preview-06-05", "score": 0.822, "normalized_score": 0.822, "is_self_reported": true, "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/", "verified_by_llmstats": false, "analysis_method": "Diff-fenced", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.368655+00:00", "updated_at": "2025-07-19T19:56:12.368655+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 682, "benchmark_id": "aime-2025", "model_id": "gemini-2.5-pro-preview-06-05", "score": 0.88, "normalized_score": 0.88, "is_self_reported": true, "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/", "verified_by_llmstats": false, "analysis_method": "Single attempt", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.425843+00:00", "updated_at": "2025-07-19T19:56:12.425843+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1090, "benchmark_id": "facts-grounding", "model_id": "gemini-2.5-pro-preview-06-05", "score": 0.878, "normalized_score": 0.878, "is_self_reported": true, "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/", "verified_by_llmstats": false, "analysis_method": "Factuality", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.269434+00:00", "updated_at": "2025-07-19T19:56:13.269434+00:00", "benchmark_name": "FACTS Grounding" }, { "model_benchmark_id": 1211, "benchmark_id": "global-mmlu-lite", "model_id": "gemini-2.5-pro-preview-06-05", "score": 0.892, "normalized_score": 0.892, 
"is_self_reported": true, "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/", "verified_by_llmstats": false, "analysis_method": "Multilingual performance", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.548453+00:00", "updated_at": "2025-07-19T19:56:13.548453+00:00", "benchmark_name": "Global-MMLU-Lite" }, { "model_benchmark_id": 269, "benchmark_id": "gpqa", "model_id": "gemini-2.5-pro-preview-06-05", "score": 0.864, "normalized_score": 0.864, "is_self_reported": true, "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/", "verified_by_llmstats": false, "analysis_method": "Single attempt Diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.617404+00:00", "updated_at": "2025-07-19T19:56:11.617404+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 719, "benchmark_id": "humanity's-last-exam", "model_id": "gemini-2.5-pro-preview-06-05", "score": 0.216, "normalized_score": 0.216, "is_self_reported": true, "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/", "verified_by_llmstats": false, "analysis_method": "No tools", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.516239+00:00", "updated_at": "2025-07-19T19:56:12.516239+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 1105, "benchmark_id": "livecodebench", "model_id": "gemini-2.5-pro-preview-06-05", "score": 0.69, "normalized_score": 0.69, "is_self_reported": true, "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/", "verified_by_llmstats": false, "analysis_method": "Single attempt 
(1/1/2025-5/1/2025)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.303010+00:00", "updated_at": "2025-07-19T19:56:13.303010+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 557, "benchmark_id": "mmmu", "model_id": "gemini-2.5-pro-preview-06-05", "score": 0.82, "normalized_score": 0.82, "is_self_reported": true, "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/", "verified_by_llmstats": false, "analysis_method": "Single attempt", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.146880+00:00", "updated_at": "2025-07-19T19:56:12.146880+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1422, "benchmark_id": "mrcr-v2-(8-needle)", "model_id": "gemini-2.5-pro-preview-06-05", "score": 0.164, "normalized_score": 0.164, "is_self_reported": true, "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/", "verified_by_llmstats": false, "analysis_method": "1M pointwise", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.013534+00:00", "updated_at": "2025-07-19T19:56:14.016258+00:00", "benchmark_name": "MRCR v2 (8-needle)" }, { "model_benchmark_id": 228, "benchmark_id": "simpleqa", "model_id": "gemini-2.5-pro-preview-06-05", "score": 0.54, "normalized_score": 0.54, "is_self_reported": true, "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/", "verified_by_llmstats": false, "analysis_method": "Factuality", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.538432+00:00", "updated_at": 
"2025-07-19T19:56:11.538432+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 1340, "benchmark_id": "swe-bench-verified", "model_id": "gemini-2.5-pro-preview-06-05", "score": 0.672, "normalized_score": 0.672, "is_self_reported": true, "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/", "verified_by_llmstats": false, "analysis_method": "Multiple attempts", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.820885+00:00", "updated_at": "2025-07-19T19:56:13.820885+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1367, "benchmark_id": "vibe-eval", "model_id": "gemini-2.5-pro-preview-06-05", "score": 0.672, "normalized_score": 0.672, "is_self_reported": true, "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/", "verified_by_llmstats": false, "analysis_method": "Image understanding", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.879257+00:00", "updated_at": "2025-07-19T19:56:13.879257+00:00", "benchmark_name": "Vibe-Eval" }, { "model_benchmark_id": 1421, "benchmark_id": "videommmu", "model_id": "gemini-2.5-pro-preview-06-05", "score": 0.836, "normalized_score": 0.836, "is_self_reported": true, "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/", "verified_by_llmstats": false, "analysis_method": "Video understanding", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.009959+00:00", "updated_at": "2025-07-19T19:56:14.009959+00:00", "benchmark_name": "VideoMMMU" } ] ================================================ FILE: 
data/organizations/google/models/gemini-2.5-pro-preview-06-05/model.json ================================================ { "model_id": "gemini-2.5-pro-preview-06-05", "name": "Gemini 2.5 Pro Preview 06-05", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "The latest preview version of Google's most advanced reasoning Gemini model, capable of solving complex problems. Built for the agentic era with enhanced reasoning capabilities, multimodal understanding (text, image, video, audio), and a 1M token context window. Features thinking preview, code execution, grounding with Google Search, system instructions, function calling, and controlled generation. Supports up to 3,000 images per prompt, 45-60 minutes of video, and 8.4 hours of audio.", "release_date": "2025-06-05", "announcement_date": "2025-06-05", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2025-01-31", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-pro", "source_playground": "https://aistudio.google.com", "source_paper": null, "source_scorecard_blog_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.493595+00:00", "updated_at": "2025-07-19T19:49:05.493595+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemini-diffusion/benchmarks.json ================================================ [ { "model_benchmark_id": 685, "benchmark_id": "aime-2025", "model_id": "gemini-diffusion", "score": 0.233, "normalized_score": 0.233, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/", "verified_by_llmstats": false, "analysis_method": "pass @1", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.434861+00:00", "updated_at": "2025-07-19T19:56:12.434861+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1100, "benchmark_id": "big-bench-extra-hard", "model_id": "gemini-diffusion", "score": 0.15, "normalized_score": 0.15, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/", "verified_by_llmstats": false, "analysis_method": "pass @1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.291288+00:00", "updated_at": "2025-07-19T19:56:13.291288+00:00", "benchmark_name": "BIG-Bench Extra Hard" }, { "model_benchmark_id": 1433, "benchmark_id": "bigcodebench", "model_id": "gemini-diffusion", "score": 0.454, "normalized_score": 0.454, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/", "verified_by_llmstats": false, "analysis_method": "pass @1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.050987+00:00", "updated_at": "2025-07-19T19:56:14.050987+00:00", "benchmark_name": "BigCodeBench" }, { "model_benchmark_id": 1217, "benchmark_id": "global-mmlu-lite", "model_id": "gemini-diffusion", "score": 0.691, "normalized_score": 0.691, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/", "verified_by_llmstats": false, "analysis_method": "pass @1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.559014+00:00", "updated_at": "2025-07-19T19:56:13.559014+00:00", "benchmark_name": "Global-MMLU-Lite" }, { "model_benchmark_id": 278, "benchmark_id": "gpqa", "model_id": "gemini-diffusion", "score": 0.404, 
"normalized_score": 0.404, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/", "verified_by_llmstats": false, "analysis_method": "pass @1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.637311+00:00", "updated_at": "2025-07-19T19:56:11.637311+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 773, "benchmark_id": "humaneval", "model_id": "gemini-diffusion", "score": 0.896, "normalized_score": 0.896, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/", "verified_by_llmstats": false, "analysis_method": "pass @1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.625233+00:00", "updated_at": "2025-07-19T19:56:12.625233+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 1435, "benchmark_id": "lbpp-(v2)", "model_id": "gemini-diffusion", "score": 0.568, "normalized_score": 0.568, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/", "verified_by_llmstats": false, "analysis_method": "pass @1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.056060+00:00", "updated_at": "2025-07-19T19:56:14.056060+00:00", "benchmark_name": "LBPP (v2)" }, { "model_benchmark_id": 1110, "benchmark_id": "livecodebench", "model_id": "gemini-diffusion", "score": 0.309, "normalized_score": 0.309, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/", "verified_by_llmstats": false, "analysis_method": "pass @1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:13.314684+00:00", "updated_at": "2025-07-19T19:56:13.314684+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 1175, "benchmark_id": "mbpp", "model_id": "gemini-diffusion", "score": 0.76, "normalized_score": 0.76, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/", "verified_by_llmstats": false, "analysis_method": "pass @1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.475906+00:00", "updated_at": "2025-07-19T19:56:13.475906+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1342, "benchmark_id": "swe-bench-verified", "model_id": "gemini-diffusion", "score": 0.229, "normalized_score": 0.229, "is_self_reported": true, "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/", "verified_by_llmstats": false, "analysis_method": "pass @1, Non-agentic evaluation (single turn edit only), max prompt length of 32K", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.824708+00:00", "updated_at": "2025-07-19T19:56:13.824708+00:00", "benchmark_name": "SWE-Bench Verified" } ] ================================================ FILE: data/organizations/google/models/gemini-diffusion/model.json ================================================ { "model_id": "gemini-diffusion", "name": "Gemini Diffusion", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemini Diffusion is a state-of-the-art, experimental text diffusion model from Google DeepMind. It explores a new kind of language model designed to provide users with greater control, creativity, and speed in text generation. 
Instead of predicting text token-by-token, it learns to generate outputs by refining noise step-by-step, allowing for rapid iteration and error correction during generation. Key capabilities include rapid response times (reportedly 1479 tokens/sec excluding overhead), generation of more coherent text by outputting entire blocks of tokens at once, and iterative refinement for consistent outputs. It excels at tasks like editing, including in math and code contexts.", "release_date": "2025-05-20", "announcement_date": "2025-05-20", "license_id": "proprietary", "multimodal": false, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://deepmind.google/models/gemini-diffusion/", "source_repo_link": "https://github.com/google", "source_weights_link": null, "created_at": "2025-07-19T19:49:05.534835+00:00", "updated_at": "2025-07-19T19:49:05.534835+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemma-2-27b-it/benchmarks.json ================================================ [ { "model_benchmark_id": 1408, "benchmark_id": "agieval", "model_id": "gemma-2-27b-it", "score": 0.551, "normalized_score": 0.551, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "3-5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.975397+00:00", "updated_at": "2025-07-19T19:56:13.975397+00:00", "benchmark_name": "AGIEval" }, { "model_benchmark_id": 9, "benchmark_id": "arc-c", "model_id": "gemma-2-27b-it", "score": 0.714, "normalized_score": 0.714, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": 
false, "analysis_method": "25-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.099650+00:00", "updated_at": "2025-07-19T19:56:11.099650+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1055, "benchmark_id": "arc-e", "model_id": "gemma-2-27b-it", "score": 0.886, "normalized_score": 0.886, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.203403+00:00", "updated_at": "2025-07-19T19:56:13.203403+00:00", "benchmark_name": "ARC-E" }, { "model_benchmark_id": 1392, "benchmark_id": "big-bench", "model_id": "gemma-2-27b-it", "score": 0.749, "normalized_score": 0.749, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "3-shot, CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.932992+00:00", "updated_at": "2025-07-19T19:56:13.932992+00:00", "benchmark_name": "BIG-Bench" }, { "model_benchmark_id": 1021, "benchmark_id": "boolq", "model_id": "gemma-2-27b-it", "score": 0.848, "normalized_score": 0.848, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.126514+00:00", "updated_at": "2025-07-19T19:56:13.126514+00:00", "benchmark_name": "BoolQ" }, { "model_benchmark_id": 980, "benchmark_id": "gsm8k", "model_id": "gemma-2-27b-it", "score": 0.74, 
"normalized_score": 0.74, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "5-shot, maj@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.058102+00:00", "updated_at": "2025-07-19T19:56:13.058102+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 38, "benchmark_id": "hellaswag", "model_id": "gemma-2-27b-it", "score": 0.864, "normalized_score": 0.864, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "10-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.164247+00:00", "updated_at": "2025-07-19T19:56:11.164247+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 767, "benchmark_id": "humaneval", "model_id": "gemma-2-27b-it", "score": 0.518, "normalized_score": 0.518, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.615384+00:00", "updated_at": "2025-07-19T19:56:12.615384+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 382, "benchmark_id": "math", "model_id": "gemma-2-27b-it", "score": 0.423, "normalized_score": 0.423, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "4-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.824501+00:00", "updated_at": 
"2025-07-19T19:56:11.824501+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1170, "benchmark_id": "mbpp", "model_id": "gemma-2-27b-it", "score": 0.626, "normalized_score": 0.626, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "3-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.464425+00:00", "updated_at": "2025-07-19T19:56:13.464425+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 68, "benchmark_id": "mmlu", "model_id": "gemma-2-27b-it", "score": 0.752, "normalized_score": 0.752, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "5-shot, top-1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.228104+00:00", "updated_at": "2025-07-19T19:56:11.228104+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1048, "benchmark_id": "natural-questions", "model_id": "gemma-2-27b-it", "score": 0.345, "normalized_score": 0.345, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.188220+00:00", "updated_at": "2025-07-19T19:56:13.188220+00:00", "benchmark_name": "Natural Questions" }, { "model_benchmark_id": 1030, "benchmark_id": "piqa", "model_id": "gemma-2-27b-it", "score": 0.832, "normalized_score": 0.832, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.145819+00:00", "updated_at": "2025-07-19T19:56:13.145819+00:00", "benchmark_name": "PIQA" }, { "model_benchmark_id": 1039, "benchmark_id": "social-iqa", "model_id": "gemma-2-27b-it", "score": 0.537, "normalized_score": 0.537, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.168648+00:00", "updated_at": "2025-07-19T19:56:13.168648+00:00", "benchmark_name": "Social IQa" }, { "model_benchmark_id": 248, "benchmark_id": "triviaqa", "model_id": "gemma-2-27b-it", "score": 0.837, "normalized_score": 0.837, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.574247+00:00", "updated_at": "2025-07-19T19:56:11.574247+00:00", "benchmark_name": "TriviaQA" }, { "model_benchmark_id": 1060, "benchmark_id": "winogrande", "model_id": "gemma-2-27b-it", "score": 0.837, "normalized_score": 0.837, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.212219+00:00", "updated_at": "2025-07-19T19:56:13.212219+00:00", "benchmark_name": "Winogrande" } ] ================================================ FILE: data/organizations/google/models/gemma-2-27b-it/model.json ================================================ { 
"model_id": "gemma-2-27b-it", "name": "Gemma 2 27B", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemma 2 27B IT is an instruction-tuned version of Google's state-of-the-art open language model. Built from the same research and technology as Gemini, it's optimized for dialogue applications through supervised fine-tuning, distillation from larger models, and RLHF. The model excels at text generation tasks including question answering, summarization, and reasoning.", "release_date": "2024-06-27", "announcement_date": "2024-06-27", "license_id": "gemma", "multimodal": false, "knowledge_cutoff": null, "param_count": 27200000000, "training_tokens": 13000000000000, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/google/gemma-2-27b-it", "source_playground": "https://huggingface.co/chat/models/google/gemma-2-27b-it", "source_paper": "https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf", "source_scorecard_blog_link": "https://huggingface.co/blog/gemma2", "source_repo_link": null, "source_weights_link": "https://huggingface.co/google/gemma-2-27b-it", "created_at": "2025-07-19T19:49:05.485572+00:00", "updated_at": "2025-07-19T19:49:05.485572+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemma-2-9b-it/benchmarks.json ================================================ [ { "model_benchmark_id": 1407, "benchmark_id": "agieval", "model_id": "gemma-2-9b-it", "score": 0.528, "normalized_score": 0.528, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "3-5-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.973652+00:00", "updated_at": "2025-07-19T19:56:13.973652+00:00", "benchmark_name": "AGIEval" }, { "model_benchmark_id": 
8, "benchmark_id": "arc-c", "model_id": "gemma-2-9b-it", "score": 0.684, "normalized_score": 0.684, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "25-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.097779+00:00", "updated_at": "2025-07-19T19:56:11.097779+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1054, "benchmark_id": "arc-e", "model_id": "gemma-2-9b-it", "score": 0.88, "normalized_score": 0.88, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.201834+00:00", "updated_at": "2025-07-19T19:56:13.201834+00:00", "benchmark_name": "ARC-E" }, { "model_benchmark_id": 1391, "benchmark_id": "big-bench", "model_id": "gemma-2-9b-it", "score": 0.682, "normalized_score": 0.682, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "3-shot Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.930966+00:00", "updated_at": "2025-07-19T19:56:13.930966+00:00", "benchmark_name": "BIG-Bench" }, { "model_benchmark_id": 1020, "benchmark_id": "boolq", "model_id": "gemma-2-9b-it", "score": 0.842, "normalized_score": 0.842, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:13.124981+00:00", "updated_at": "2025-07-19T19:56:13.124981+00:00", "benchmark_name": "BoolQ" }, { "model_benchmark_id": 978, "benchmark_id": "gsm8k", "model_id": "gemma-2-9b-it", "score": 0.686, "normalized_score": 0.686, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "5-shot majority@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.053844+00:00", "updated_at": "2025-07-19T19:56:13.053844+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 36, "benchmark_id": "hellaswag", "model_id": "gemma-2-9b-it", "score": 0.819, "normalized_score": 0.819, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "10-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.157090+00:00", "updated_at": "2025-07-19T19:56:11.157090+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 765, "benchmark_id": "humaneval", "model_id": "gemma-2-9b-it", "score": 0.402, "normalized_score": 0.402, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.611318+00:00", "updated_at": "2025-07-19T19:56:12.611318+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 380, "benchmark_id": "math", "model_id": "gemma-2-9b-it", "score": 0.366, "normalized_score": 0.366, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", 
"verified_by_llmstats": false, "analysis_method": "4-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.821125+00:00", "updated_at": "2025-07-19T19:56:11.821125+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1169, "benchmark_id": "mbpp", "model_id": "gemma-2-9b-it", "score": 0.524, "normalized_score": 0.524, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "3-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.462564+00:00", "updated_at": "2025-07-19T19:56:13.462564+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 66, "benchmark_id": "mmlu", "model_id": "gemma-2-9b-it", "score": 0.713, "normalized_score": 0.713, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "5-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.224994+00:00", "updated_at": "2025-07-19T19:56:11.224994+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1047, "benchmark_id": "natural-questions", "model_id": "gemma-2-9b-it", "score": 0.292, "normalized_score": 0.292, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "5-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.186631+00:00", "updated_at": "2025-07-19T19:56:13.186631+00:00", "benchmark_name": "Natural Questions" }, { "model_benchmark_id": 1029, 
"benchmark_id": "piqa", "model_id": "gemma-2-9b-it", "score": 0.817, "normalized_score": 0.817, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.144012+00:00", "updated_at": "2025-07-19T19:56:13.144012+00:00", "benchmark_name": "PIQA" }, { "model_benchmark_id": 1038, "benchmark_id": "social-iqa", "model_id": "gemma-2-9b-it", "score": 0.534, "normalized_score": 0.534, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.166311+00:00", "updated_at": "2025-07-19T19:56:13.166311+00:00", "benchmark_name": "Social IQa" }, { "model_benchmark_id": 247, "benchmark_id": "triviaqa", "model_id": "gemma-2-9b-it", "score": 0.766, "normalized_score": 0.766, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "5-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.572657+00:00", "updated_at": "2025-07-19T19:56:11.572657+00:00", "benchmark_name": "TriviaQA" }, { "model_benchmark_id": 148, "benchmark_id": "winogrande", "model_id": "gemma-2-9b-it", "score": 0.806, "normalized_score": 0.806, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/blog/gemma2", "verified_by_llmstats": false, "analysis_method": "partial score evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:11.380497+00:00", "updated_at": "2025-07-19T19:56:11.380497+00:00", "benchmark_name": "Winogrande" } ] ================================================ FILE: data/organizations/google/models/gemma-2-9b-it/model.json ================================================ { "model_id": "gemma-2-9b-it", "name": "Gemma 2 9B", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemma 2 9B IT is an instruction-tuned version of Google's Gemma 2 9B base model. It was trained on 8 trillion tokens of web data, code, and math content. The model features sliding window attention, logit soft-capping, and knowledge distillation techniques. It's optimized for dialogue applications through supervised fine-tuning, distillation, RLHF, and model merging using WARP.", "release_date": "2024-06-27", "announcement_date": "2024-06-27", "license_id": "gemma", "multimodal": false, "knowledge_cutoff": null, "param_count": 9240000000, "training_tokens": 8000000000000, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/google/gemma-2-9b-it", "source_playground": "https://huggingface.co/chat/models/google/gemma-2-9b-it", "source_paper": "https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf", "source_scorecard_blog_link": "https://huggingface.co/blog/gemma2", "source_repo_link": null, "source_weights_link": "https://huggingface.co/google/gemma-2-9b-it", "created_at": "2025-07-19T19:49:05.477806+00:00", "updated_at": "2025-07-19T19:49:05.477806+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemma-3-12b-it/benchmarks.json ================================================ [ { "model_benchmark_id": 1247, "benchmark_id": "ai2d", "model_id": "gemma-3-12b-it", "score": 0.842, "normalized_score": 0.842, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", 
"verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.621225+00:00", "updated_at": "2025-07-19T19:56:13.621225+00:00", "benchmark_name": "AI2D" }, { "model_benchmark_id": 1096, "benchmark_id": "big-bench-extra-hard", "model_id": "gemma-3-12b-it", "score": 0.163, "normalized_score": 0.163, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.282747+00:00", "updated_at": "2025-07-19T19:56:13.282747+00:00", "benchmark_name": "BIG-Bench Extra Hard" }, { "model_benchmark_id": 1067, "benchmark_id": "big-bench-hard", "model_id": "gemma-3-12b-it", "score": 0.857, "normalized_score": 0.857, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.226924+00:00", "updated_at": "2025-07-19T19:56:13.226924+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 1147, "benchmark_id": "bird-sql-(dev)", "model_id": "gemma-3-12b-it", "score": 0.479, "normalized_score": 0.479, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "- evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.413629+00:00", "updated_at": 
"2025-07-19T19:56:13.413629+00:00", "benchmark_name": "Bird-SQL (dev)" }, { "model_benchmark_id": 855, "benchmark_id": "chartqa", "model_id": "gemma-3-12b-it", "score": 0.757, "normalized_score": 0.757, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.789962+00:00", "updated_at": "2025-07-19T19:56:12.789962+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 878, "benchmark_id": "docvqa", "model_id": "gemma-3-12b-it", "score": 0.871, "normalized_score": 0.871, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.830839+00:00", "updated_at": "2025-07-19T19:56:12.830839+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 1219, "benchmark_id": "eclektic", "model_id": "gemma-3-12b-it", "score": 0.103, "normalized_score": 0.103, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.563615+00:00", "updated_at": "2025-07-19T19:56:13.563615+00:00", "benchmark_name": "ECLeKTic" }, { "model_benchmark_id": 1087, "benchmark_id": "facts-grounding", "model_id": "gemma-3-12b-it", "score": 0.758, "normalized_score": 0.758, "is_self_reported": true, "self_reported_source_link": 
"https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "- evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.262640+00:00", "updated_at": "2025-07-19T19:56:13.262640+00:00", "benchmark_name": "FACTS Grounding" }, { "model_benchmark_id": 1205, "benchmark_id": "global-mmlu-lite", "model_id": "gemma-3-12b-it", "score": 0.695, "normalized_score": 0.695, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.537058+00:00", "updated_at": "2025-07-19T19:56:13.537058+00:00", "benchmark_name": "Global-MMLU-Lite" }, { "model_benchmark_id": 261, "benchmark_id": "gpqa", "model_id": "gemma-3-12b-it", "score": 0.409, "normalized_score": 0.409, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.600334+00:00", "updated_at": "2025-07-19T19:56:11.600334+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 977, "benchmark_id": "gsm8k", "model_id": "gemma-3-12b-it", "score": 0.944, "normalized_score": 0.944, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.052379+00:00", 
"updated_at": "2025-07-19T19:56:13.052379+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 1153, "benchmark_id": "hiddenmath", "model_id": "gemma-3-12b-it", "score": 0.545, "normalized_score": 0.545, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.427708+00:00", "updated_at": "2025-07-19T19:56:13.427708+00:00", "benchmark_name": "HiddenMath" }, { "model_benchmark_id": 762, "benchmark_id": "humaneval", "model_id": "gemma-3-12b-it", "score": 0.854, "normalized_score": 0.854, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.606840+00:00", "updated_at": "2025-07-19T19:56:12.606840+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 607, "benchmark_id": "ifeval", "model_id": "gemma-3-12b-it", "score": 0.889, "normalized_score": 0.889, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.254325+00:00", "updated_at": "2025-07-19T19:56:12.254325+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1238, "benchmark_id": "infovqa", "model_id": "gemma-3-12b-it", "score": 0.649, "normalized_score": 0.649, "is_self_reported": true, "self_reported_source_link": 
"https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.604072+00:00", "updated_at": "2025-07-19T19:56:13.604072+00:00", "benchmark_name": "InfoVQA" }, { "model_benchmark_id": 1101, "benchmark_id": "livecodebench", "model_id": "gemma-3-12b-it", "score": 0.246, "normalized_score": 0.246, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.294686+00:00", "updated_at": "2025-07-19T19:56:13.294686+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 377, "benchmark_id": "math", "model_id": "gemma-3-12b-it", "score": 0.838, "normalized_score": 0.838, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.815597+00:00", "updated_at": "2025-07-19T19:56:11.815597+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1266, "benchmark_id": "mathvista-mini", "model_id": "gemma-3-12b-it", "score": 0.629, "normalized_score": 0.629, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.657019+00:00", 
"updated_at": "2025-07-19T19:56:13.657019+00:00", "benchmark_name": "MathVista-Mini" }, { "model_benchmark_id": 1166, "benchmark_id": "mbpp", "model_id": "gemma-3-12b-it", "score": 0.73, "normalized_score": 0.73, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "3-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.456223+00:00", "updated_at": "2025-07-19T19:56:13.456223+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 163, "benchmark_id": "mmlu-pro", "model_id": "gemma-3-12b-it", "score": 0.606, "normalized_score": 0.606, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.415028+00:00", "updated_at": "2025-07-19T19:56:11.415028+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1235, "benchmark_id": "mmmu-(val)", "model_id": "gemma-3-12b-it", "score": 0.596, "normalized_score": 0.596, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.595790+00:00", "updated_at": "2025-07-19T19:56:13.595790+00:00", "benchmark_name": "MMMU (val)" }, { "model_benchmark_id": 1197, "benchmark_id": "natural2code", "model_id": "gemma-3-12b-it", "score": 0.807, "normalized_score": 0.807, "is_self_reported": true, "self_reported_source_link": 
"https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.521277+00:00", "updated_at": "2025-07-19T19:56:13.521277+00:00", "benchmark_name": "Natural2Code" }, { "model_benchmark_id": 224, "benchmark_id": "simpleqa", "model_id": "gemma-3-12b-it", "score": 0.063, "normalized_score": 0.063, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.528858+00:00", "updated_at": "2025-07-19T19:56:11.528858+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 903, "benchmark_id": "textvqa", "model_id": "gemma-3-12b-it", "score": 0.677, "normalized_score": 0.677, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.882990+00:00", "updated_at": "2025-07-19T19:56:12.882990+00:00", "benchmark_name": "TextVQA" }, { "model_benchmark_id": 1263, "benchmark_id": "vqav2-(val)", "model_id": "gemma-3-12b-it", "score": 0.716, "normalized_score": 0.716, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.650557+00:00", 
"updated_at": "2025-07-19T19:56:13.650557+00:00", "benchmark_name": "VQAv2 (val)" }, { "model_benchmark_id": 1227, "benchmark_id": "wmt24++", "model_id": "gemma-3-12b-it", "score": 0.516, "normalized_score": 0.516, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.578915+00:00", "updated_at": "2025-07-19T19:56:13.578915+00:00", "benchmark_name": "WMT24++" } ] ================================================ FILE: data/organizations/google/models/gemma-3-12b-it/model.json ================================================ { "model_id": "gemma-3-12b-it", "name": "Gemma 3 12B", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemma 3 12B is a 12-billion-parameter vision-language model from Google, handling text and image input and generating text output. It features a 128K context window, multilingual support, and open weights. 
Suitable for question answering, summarization, reasoning, and image understanding tasks.", "release_date": "2025-03-12", "announcement_date": "2025-03-12", "license_id": "gemma", "multimodal": true, "knowledge_cutoff": null, "param_count": 12000000000, "training_tokens": 12000000000000, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": null, "source_paper": "https://storage.googleapis.com/deepmind-media/gemma/Gemma3Report.pdf", "source_scorecard_blog_link": "https://huggingface.co/blog/gemma3", "source_repo_link": null, "source_weights_link": "https://huggingface.co/google/gemma-3-12b-it", "created_at": "2025-07-19T19:49:05.444134+00:00", "updated_at": "2025-07-19T19:49:05.444134+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemma-3-1b-it/benchmarks.json ================================================ [ { "model_benchmark_id": 1099, "benchmark_id": "big-bench-extra-hard", "model_id": "gemma-3-1b-it", "score": 0.072, "normalized_score": 0.072, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.289054+00:00", "updated_at": "2025-07-19T19:56:13.289054+00:00", "benchmark_name": "BIG-Bench Extra Hard" }, { "model_benchmark_id": 1075, "benchmark_id": "big-bench-hard", "model_id": "gemma-3-1b-it", "score": 0.391, "normalized_score": 0.391, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.240587+00:00", 
"updated_at": "2025-07-19T19:56:13.240587+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 1151, "benchmark_id": "bird-sql-(dev)", "model_id": "gemma-3-1b-it", "score": 0.064, "normalized_score": 0.064, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "- evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.421336+00:00", "updated_at": "2025-07-19T19:56:13.421336+00:00", "benchmark_name": "Bird-SQL (dev)" }, { "model_benchmark_id": 1225, "benchmark_id": "eclektic", "model_id": "gemma-3-1b-it", "score": 0.014, "normalized_score": 0.014, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.574307+00:00", "updated_at": "2025-07-19T19:56:13.574307+00:00", "benchmark_name": "ECLeKTic" }, { "model_benchmark_id": 1094, "benchmark_id": "facts-grounding", "model_id": "gemma-3-1b-it", "score": 0.364, "normalized_score": 0.364, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "- evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.276605+00:00", "updated_at": "2025-07-19T19:56:13.276605+00:00", "benchmark_name": "FACTS Grounding" }, { "model_benchmark_id": 1216, "benchmark_id": "global-mmlu-lite", "model_id": "gemma-3-1b-it", "score": 0.342, "normalized_score": 0.342, "is_self_reported": true, "self_reported_source_link": 
"https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.557306+00:00", "updated_at": "2025-07-19T19:56:13.557306+00:00", "benchmark_name": "Global-MMLU-Lite" }, { "model_benchmark_id": 276, "benchmark_id": "gpqa", "model_id": "gemma-3-1b-it", "score": 0.192, "normalized_score": 0.192, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.633668+00:00", "updated_at": "2025-07-19T19:56:11.633668+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 984, "benchmark_id": "gsm8k", "model_id": "gemma-3-1b-it", "score": 0.628, "normalized_score": 0.628, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.064705+00:00", "updated_at": "2025-07-19T19:56:13.064705+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 1162, "benchmark_id": "hiddenmath", "model_id": "gemma-3-1b-it", "score": 0.158, "normalized_score": 0.158, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.445125+00:00", "updated_at": 
"2025-07-19T19:56:13.445125+00:00", "benchmark_name": "HiddenMath" }, { "model_benchmark_id": 772, "benchmark_id": "humaneval", "model_id": "gemma-3-1b-it", "score": 0.415, "normalized_score": 0.415, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.623656+00:00", "updated_at": "2025-07-19T19:56:12.623656+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 610, "benchmark_id": "ifeval", "model_id": "gemma-3-1b-it", "score": 0.802, "normalized_score": 0.802, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.260062+00:00", "updated_at": "2025-07-19T19:56:12.260062+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1109, "benchmark_id": "livecodebench", "model_id": "gemma-3-1b-it", "score": 0.019, "normalized_score": 0.019, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.311408+00:00", "updated_at": "2025-07-19T19:56:13.311408+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 386, "benchmark_id": "math", "model_id": "gemma-3-1b-it", "score": 0.48, "normalized_score": 0.48, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", 
"verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.832121+00:00", "updated_at": "2025-07-19T19:56:11.832121+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1174, "benchmark_id": "mbpp", "model_id": "gemma-3-1b-it", "score": 0.352, "normalized_score": 0.352, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "3-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.474036+00:00", "updated_at": "2025-07-19T19:56:13.474036+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 172, "benchmark_id": "mmlu-pro", "model_id": "gemma-3-1b-it", "score": 0.147, "normalized_score": 0.147, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.434242+00:00", "updated_at": "2025-07-19T19:56:11.434242+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1202, "benchmark_id": "natural2code", "model_id": "gemma-3-1b-it", "score": 0.56, "normalized_score": 0.56, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.529701+00:00", "updated_at": "2025-07-19T19:56:13.529701+00:00", "benchmark_name": "Natural2Code" 
}, { "model_benchmark_id": 232, "benchmark_id": "simpleqa", "model_id": "gemma-3-1b-it", "score": 0.022, "normalized_score": 0.022, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.544931+00:00", "updated_at": "2025-07-19T19:56:11.544931+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 1233, "benchmark_id": "wmt24++", "model_id": "gemma-3-1b-it", "score": 0.359, "normalized_score": 0.359, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.590063+00:00", "updated_at": "2025-07-19T19:56:13.590063+00:00", "benchmark_name": "WMT24++" } ] ================================================ FILE: data/organizations/google/models/gemma-3-1b-it/model.json ================================================ { "model_id": "gemma-3-1b-it", "name": "Gemma 3 1B", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "The Gemma 3 1B model is a lightweight, 1-billion-parameter language model by Google, optimized for efficiency on resource-limited devices. At 529MB, it processes text at 2,585 tokens/second with a context window of 128,000 tokens. It supports 35+ languages but handles text-only input, unlike larger multimodal Gemma models. 
This balance of speed and efficiency makes it ideal for fast text processing on mobile and low-power devices.", "release_date": "2025-03-12", "announcement_date": "2025-03-12", "license_id": "gemma", "multimodal": false, "knowledge_cutoff": null, "param_count": 1000000000, "training_tokens": 2000000000000, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/google/gemma-3-1b-it", "source_playground": "https://huggingface.co/chat/models/google/gemma-3-1b-it", "source_paper": "https://storage.googleapis.com/deepmind-media/gemma/Gemma3Report.pdf", "source_scorecard_blog_link": "https://huggingface.co/blog/gemma3", "source_repo_link": null, "source_weights_link": "https://huggingface.co/google/gemma-3-1b-it", "created_at": "2025-07-19T19:49:05.527185+00:00", "updated_at": "2025-07-19T19:49:05.527185+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemma-3-27b-it/benchmarks.json ================================================ [ { "model_benchmark_id": 1249, "benchmark_id": "ai2d", "model_id": "gemma-3-27b-it", "score": 0.845, "normalized_score": 0.845, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.624921+00:00", "updated_at": "2025-07-19T19:56:13.624921+00:00", "benchmark_name": "AI2D" }, { "model_benchmark_id": 1098, "benchmark_id": "big-bench-extra-hard", "model_id": "gemma-3-27b-it", "score": 0.193, "normalized_score": 0.193, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": 
null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.286991+00:00", "updated_at": "2025-07-19T19:56:13.286991+00:00", "benchmark_name": "BIG-Bench Extra Hard" }, { "model_benchmark_id": 1074, "benchmark_id": "big-bench-hard", "model_id": "gemma-3-27b-it", "score": 0.876, "normalized_score": 0.876, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.238868+00:00", "updated_at": "2025-07-19T19:56:13.238868+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 1150, "benchmark_id": "bird-sql-(dev)", "model_id": "gemma-3-27b-it", "score": 0.544, "normalized_score": 0.544, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "- evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.418526+00:00", "updated_at": "2025-07-19T19:56:13.418526+00:00", "benchmark_name": "Bird-SQL (dev)" }, { "model_benchmark_id": 857, "benchmark_id": "chartqa", "model_id": "gemma-3-27b-it", "score": 0.78, "normalized_score": 0.78, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.793657+00:00", "updated_at": "2025-07-19T19:56:12.793657+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 880, "benchmark_id": "docvqa", "model_id": "gemma-3-27b-it", "score": 0.866, 
"normalized_score": 0.866, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.834284+00:00", "updated_at": "2025-07-19T19:56:12.834284+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 1224, "benchmark_id": "eclektic", "model_id": "gemma-3-27b-it", "score": 0.167, "normalized_score": 0.167, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.572334+00:00", "updated_at": "2025-07-19T19:56:13.572334+00:00", "benchmark_name": "ECLeKTic" }, { "model_benchmark_id": 1093, "benchmark_id": "facts-grounding", "model_id": "gemma-3-27b-it", "score": 0.749, "normalized_score": 0.749, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "- evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.275050+00:00", "updated_at": "2025-07-19T19:56:13.275050+00:00", "benchmark_name": "FACTS Grounding" }, { "model_benchmark_id": 1215, "benchmark_id": "global-mmlu-lite", "model_id": "gemma-3-27b-it", "score": 0.751, "normalized_score": 0.751, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.555532+00:00", "updated_at": "2025-07-19T19:56:13.555532+00:00", "benchmark_name": "Global-MMLU-Lite" }, { "model_benchmark_id": 275, "benchmark_id": "gpqa", "model_id": "gemma-3-27b-it", "score": 0.424, "normalized_score": 0.424, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.628803+00:00", "updated_at": "2025-07-19T19:56:11.628803+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 983, "benchmark_id": "gsm8k", "model_id": "gemma-3-27b-it", "score": 0.959, "normalized_score": 0.959, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.063038+00:00", "updated_at": "2025-07-19T19:56:13.063038+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 1161, "benchmark_id": "hiddenmath", "model_id": "gemma-3-27b-it", "score": 0.603, "normalized_score": 0.603, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.443231+00:00", "updated_at": "2025-07-19T19:56:13.443231+00:00", "benchmark_name": "HiddenMath" }, { "model_benchmark_id": 771, "benchmark_id": "humaneval", "model_id": "gemma-3-27b-it", "score": 0.878, 
"normalized_score": 0.878, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.621954+00:00", "updated_at": "2025-07-19T19:56:12.621954+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 609, "benchmark_id": "ifeval", "model_id": "gemma-3-27b-it", "score": 0.904, "normalized_score": 0.904, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.258406+00:00", "updated_at": "2025-07-19T19:56:12.258406+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1240, "benchmark_id": "infovqa", "model_id": "gemma-3-27b-it", "score": 0.706, "normalized_score": 0.706, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.607541+00:00", "updated_at": "2025-07-19T19:56:13.607541+00:00", "benchmark_name": "InfoVQA" }, { "model_benchmark_id": 1108, "benchmark_id": "livecodebench", "model_id": "gemma-3-27b-it", "score": 0.297, "normalized_score": 0.297, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:13.308517+00:00", "updated_at": "2025-07-19T19:56:13.308517+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 385, "benchmark_id": "math", "model_id": "gemma-3-27b-it", "score": 0.89, "normalized_score": 0.89, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.830123+00:00", "updated_at": "2025-07-19T19:56:11.830123+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1268, "benchmark_id": "mathvista-mini", "model_id": "gemma-3-27b-it", "score": 0.676, "normalized_score": 0.676, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.660624+00:00", "updated_at": "2025-07-19T19:56:13.660624+00:00", "benchmark_name": "MathVista-Mini" }, { "model_benchmark_id": 1173, "benchmark_id": "mbpp", "model_id": "gemma-3-27b-it", "score": 0.744, "normalized_score": 0.744, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "3-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.472259+00:00", "updated_at": "2025-07-19T19:56:13.472259+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 171, "benchmark_id": "mmlu-pro", "model_id": "gemma-3-27b-it", "score": 0.675, "normalized_score": 0.675, 
"is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.432013+00:00", "updated_at": "2025-07-19T19:56:11.432013+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1237, "benchmark_id": "mmmu-(val)", "model_id": "gemma-3-27b-it", "score": 0.649, "normalized_score": 0.649, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.599826+00:00", "updated_at": "2025-07-19T19:56:13.599826+00:00", "benchmark_name": "MMMU (val)" }, { "model_benchmark_id": 1201, "benchmark_id": "natural2code", "model_id": "gemma-3-27b-it", "score": 0.845, "normalized_score": 0.845, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.528235+00:00", "updated_at": "2025-07-19T19:56:13.528235+00:00", "benchmark_name": "Natural2Code" }, { "model_benchmark_id": 231, "benchmark_id": "simpleqa", "model_id": "gemma-3-27b-it", "score": 0.1, "normalized_score": 0.1, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, 
"created_at": "2025-07-19T19:56:11.543428+00:00", "updated_at": "2025-07-19T19:56:11.543428+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 905, "benchmark_id": "textvqa", "model_id": "gemma-3-27b-it", "score": 0.651, "normalized_score": 0.651, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.886992+00:00", "updated_at": "2025-07-19T19:56:12.886992+00:00", "benchmark_name": "TextVQA" }, { "model_benchmark_id": 1265, "benchmark_id": "vqav2-(val)", "model_id": "gemma-3-27b-it", "score": 0.71, "normalized_score": 0.71, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.653584+00:00", "updated_at": "2025-07-19T19:56:13.653584+00:00", "benchmark_name": "VQAv2 (val)" }, { "model_benchmark_id": 1232, "benchmark_id": "wmt24++", "model_id": "gemma-3-27b-it", "score": 0.534, "normalized_score": 0.534, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.587542+00:00", "updated_at": "2025-07-19T19:56:13.587542+00:00", "benchmark_name": "WMT24++" } ] ================================================ FILE: data/organizations/google/models/gemma-3-27b-it/model.json ================================================ { 
"model_id": "gemma-3-27b-it", "name": "Gemma 3 27B", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemma 3 27B is a 27-billion-parameter vision-language model from Google, handling text and image input and generating text output. It features a 128K context window, multilingual support, and open weights. Suitable for complex question answering, summarization, reasoning, and image understanding tasks.", "release_date": "2025-03-12", "announcement_date": "2025-03-12", "license_id": "gemma", "multimodal": true, "knowledge_cutoff": null, "param_count": 27000000000, "training_tokens": 14000000000000, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": null, "source_paper": "https://storage.googleapis.com/deepmind-media/gemma/Gemma3Report.pdf", "source_scorecard_blog_link": "https://huggingface.co/blog/gemma3", "source_repo_link": null, "source_weights_link": "https://huggingface.co/google/gemma-3-27b-it", "created_at": "2025-07-19T19:49:05.523800+00:00", "updated_at": "2025-07-19T19:49:05.523800+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemma-3-4b-it/benchmarks.json ================================================ [ { "model_benchmark_id": 1248, "benchmark_id": "ai2d", "model_id": "gemma-3-4b-it", "score": 0.748, "normalized_score": 0.748, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.622871+00:00", "updated_at": "2025-07-19T19:56:13.622871+00:00", "benchmark_name": "AI2D" }, { "model_benchmark_id": 1097, "benchmark_id": "big-bench-extra-hard", "model_id": "gemma-3-4b-it", "score": 0.11, "normalized_score": 0.11, "is_self_reported": true, 
"self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.285056+00:00", "updated_at": "2025-07-19T19:56:13.285056+00:00", "benchmark_name": "BIG-Bench Extra Hard" }, { "model_benchmark_id": 1073, "benchmark_id": "big-bench-hard", "model_id": "gemma-3-4b-it", "score": 0.722, "normalized_score": 0.722, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.237255+00:00", "updated_at": "2025-07-19T19:56:13.237255+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 1149, "benchmark_id": "bird-sql-(dev)", "model_id": "gemma-3-4b-it", "score": 0.363, "normalized_score": 0.363, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "- evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.417046+00:00", "updated_at": "2025-07-19T19:56:13.417046+00:00", "benchmark_name": "Bird-SQL (dev)" }, { "model_benchmark_id": 856, "benchmark_id": "chartqa", "model_id": "gemma-3-4b-it", "score": 0.688, "normalized_score": 0.688, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, 
"created_at": "2025-07-19T19:56:12.791952+00:00", "updated_at": "2025-07-19T19:56:12.791952+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 879, "benchmark_id": "docvqa", "model_id": "gemma-3-4b-it", "score": 0.758, "normalized_score": 0.758, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.832468+00:00", "updated_at": "2025-07-19T19:56:12.832468+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 1223, "benchmark_id": "eclektic", "model_id": "gemma-3-4b-it", "score": 0.046, "normalized_score": 0.046, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.570776+00:00", "updated_at": "2025-07-19T19:56:13.570776+00:00", "benchmark_name": "ECLeKTic" }, { "model_benchmark_id": 1092, "benchmark_id": "facts-grounding", "model_id": "gemma-3-4b-it", "score": 0.701, "normalized_score": 0.701, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "- evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.273464+00:00", "updated_at": "2025-07-19T19:56:13.273464+00:00", "benchmark_name": "FACTS Grounding" }, { "model_benchmark_id": 1214, "benchmark_id": "global-mmlu-lite", "model_id": "gemma-3-4b-it", "score": 0.545, "normalized_score": 0.545, "is_self_reported": true, 
"self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.553690+00:00", "updated_at": "2025-07-19T19:56:13.553690+00:00", "benchmark_name": "Global-MMLU-Lite" }, { "model_benchmark_id": 274, "benchmark_id": "gpqa", "model_id": "gemma-3-4b-it", "score": 0.308, "normalized_score": 0.308, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.625675+00:00", "updated_at": "2025-07-19T19:56:11.625675+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 982, "benchmark_id": "gsm8k", "model_id": "gemma-3-4b-it", "score": 0.892, "normalized_score": 0.892, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.061601+00:00", "updated_at": "2025-07-19T19:56:13.061601+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 1160, "benchmark_id": "hiddenmath", "model_id": "gemma-3-4b-it", "score": 0.43, "normalized_score": 0.43, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:13.440350+00:00", "updated_at": "2025-07-19T19:56:13.440350+00:00", "benchmark_name": "HiddenMath" }, { "model_benchmark_id": 770, "benchmark_id": "humaneval", "model_id": "gemma-3-4b-it", "score": 0.713, "normalized_score": 0.713, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.620468+00:00", "updated_at": "2025-07-19T19:56:12.620468+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 608, "benchmark_id": "ifeval", "model_id": "gemma-3-4b-it", "score": 0.902, "normalized_score": 0.902, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.256346+00:00", "updated_at": "2025-07-19T19:56:12.256346+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1239, "benchmark_id": "infovqa", "model_id": "gemma-3-4b-it", "score": 0.5, "normalized_score": 0.5, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.605648+00:00", "updated_at": "2025-07-19T19:56:13.605648+00:00", "benchmark_name": "InfoVQA" }, { "model_benchmark_id": 1107, "benchmark_id": "livecodebench", "model_id": "gemma-3-4b-it", "score": 0.126, "normalized_score": 0.126, "is_self_reported": true, "self_reported_source_link": 
"https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.306674+00:00", "updated_at": "2025-07-19T19:56:13.306674+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 384, "benchmark_id": "math", "model_id": "gemma-3-4b-it", "score": 0.756, "normalized_score": 0.756, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.828322+00:00", "updated_at": "2025-07-19T19:56:11.828322+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1267, "benchmark_id": "mathvista-mini", "model_id": "gemma-3-4b-it", "score": 0.5, "normalized_score": 0.5, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.659077+00:00", "updated_at": "2025-07-19T19:56:13.659077+00:00", "benchmark_name": "MathVista-Mini" }, { "model_benchmark_id": 1172, "benchmark_id": "mbpp", "model_id": "gemma-3-4b-it", "score": 0.632, "normalized_score": 0.632, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "3-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.469983+00:00", "updated_at": 
"2025-07-19T19:56:13.469983+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 170, "benchmark_id": "mmlu-pro", "model_id": "gemma-3-4b-it", "score": 0.436, "normalized_score": 0.436, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.430343+00:00", "updated_at": "2025-07-19T19:56:11.430343+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1236, "benchmark_id": "mmmu-(val)", "model_id": "gemma-3-4b-it", "score": 0.488, "normalized_score": 0.488, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.597769+00:00", "updated_at": "2025-07-19T19:56:13.597769+00:00", "benchmark_name": "MMMU (val)" }, { "model_benchmark_id": 1200, "benchmark_id": "natural2code", "model_id": "gemma-3-4b-it", "score": 0.703, "normalized_score": 0.703, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.526663+00:00", "updated_at": "2025-07-19T19:56:13.526663+00:00", "benchmark_name": "Natural2Code" }, { "model_benchmark_id": 230, "benchmark_id": "simpleqa", "model_id": "gemma-3-4b-it", "score": 0.04, "normalized_score": 0.04, "is_self_reported": true, "self_reported_source_link": 
"https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.542000+00:00", "updated_at": "2025-07-19T19:56:11.542000+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 904, "benchmark_id": "textvqa", "model_id": "gemma-3-4b-it", "score": 0.578, "normalized_score": 0.578, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.885190+00:00", "updated_at": "2025-07-19T19:56:12.885190+00:00", "benchmark_name": "TextVQA" }, { "model_benchmark_id": 1264, "benchmark_id": "vqav2-(val)", "model_id": "gemma-3-4b-it", "score": 0.624, "normalized_score": 0.624, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "multimodal evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.652122+00:00", "updated_at": "2025-07-19T19:56:13.652122+00:00", "benchmark_name": "VQAv2 (val)" }, { "model_benchmark_id": 1231, "benchmark_id": "wmt24++", "model_id": "gemma-3-4b-it", "score": 0.468, "normalized_score": 0.468, "is_self_reported": true, "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.586157+00:00", 
"updated_at": "2025-07-19T19:56:13.586157+00:00", "benchmark_name": "WMT24++" } ] ================================================ FILE: data/organizations/google/models/gemma-3-4b-it/model.json ================================================ { "model_id": "gemma-3-4b-it", "name": "Gemma 3 4B", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemma 3 4B is a 4-billion-parameter vision-language model from Google, handling text and image input and generating text output. It features a 128K context window, multilingual support, and open weights. Suitable for question answering, summarization, reasoning, and image understanding tasks.", "release_date": "2025-03-12", "announcement_date": "2025-03-12", "license_id": "gemma", "multimodal": true, "knowledge_cutoff": "2024-08-01", "param_count": 4000000000, "training_tokens": 4000000000000, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": null, "source_paper": "https://storage.googleapis.com/deepmind-media/gemma/Gemma3Report.pdf", "source_scorecard_blog_link": "https://huggingface.co/blog/gemma3", "source_repo_link": null, "source_weights_link": "https://huggingface.co/google/gemma-3-4b-it", "created_at": "2025-07-19T19:49:05.520515+00:00", "updated_at": "2025-07-19T19:49:05.520515+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemma-3n-e2b/benchmarks.json ================================================ [ { "model_benchmark_id": 10, "benchmark_id": "arc-c", "model_id": "gemma-3n-e2b", "score": 0.517, "normalized_score": 0.517, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B", "verified_by_llmstats": false, "analysis_method": "25-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.102376+00:00", "updated_at": 
"2025-07-19T19:56:11.102376+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1056, "benchmark_id": "arc-e", "model_id": "gemma-3n-e2b", "score": 0.758, "normalized_score": 0.758, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.204955+00:00", "updated_at": "2025-07-19T19:56:13.204955+00:00", "benchmark_name": "ARC-E" }, { "model_benchmark_id": 1071, "benchmark_id": "big-bench-hard", "model_id": "gemma-3n-e2b", "score": 0.443, "normalized_score": 0.443, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B", "verified_by_llmstats": false, "analysis_method": "few-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.233872+00:00", "updated_at": "2025-07-19T19:56:13.233872+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 1022, "benchmark_id": "boolq", "model_id": "gemma-3n-e2b", "score": 0.764, "normalized_score": 0.764, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.127882+00:00", "updated_at": "2025-07-19T19:56:13.127882+00:00", "benchmark_name": "BoolQ" }, { "model_benchmark_id": 946, "benchmark_id": "drop", "model_id": "gemma-3n-e2b", "score": 0.539, "normalized_score": 0.539, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B", "verified_by_llmstats": false, "analysis_method": "Token F1 score. 
1-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.996776+00:00", "updated_at": "2025-07-19T19:56:12.996776+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 39, "benchmark_id": "hellaswag", "model_id": "gemma-3n-e2b", "score": 0.722, "normalized_score": 0.722, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B", "verified_by_llmstats": false, "analysis_method": "10-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.166470+00:00", "updated_at": "2025-07-19T19:56:11.166470+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 1049, "benchmark_id": "natural-questions", "model_id": "gemma-3n-e2b", "score": 0.155, "normalized_score": 0.155, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.190039+00:00", "updated_at": "2025-07-19T19:56:13.190039+00:00", "benchmark_name": "Natural Questions" }, { "model_benchmark_id": 1031, "benchmark_id": "piqa", "model_id": "gemma-3n-e2b", "score": 0.789, "normalized_score": 0.789, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.147878+00:00", "updated_at": "2025-07-19T19:56:13.147878+00:00", "benchmark_name": "PIQA" }, { "model_benchmark_id": 1040, "benchmark_id": "social-iqa", "model_id": "gemma-3n-e2b", "score": 0.488, 
"normalized_score": 0.488, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.170669+00:00", "updated_at": "2025-07-19T19:56:13.170669+00:00", "benchmark_name": "Social IQa" }, { "model_benchmark_id": 249, "benchmark_id": "triviaqa", "model_id": "gemma-3n-e2b", "score": 0.608, "normalized_score": 0.608, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.576196+00:00", "updated_at": "2025-07-19T19:56:11.576196+00:00", "benchmark_name": "TriviaQA" }, { "model_benchmark_id": 1061, "benchmark_id": "winogrande", "model_id": "gemma-3n-e2b", "score": 0.668, "normalized_score": 0.668, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.213740+00:00", "updated_at": "2025-07-19T19:56:13.213740+00:00", "benchmark_name": "Winogrande" } ] ================================================ FILE: data/organizations/google/models/gemma-3n-e2b/model.json ================================================ { "model_id": "gemma-3n-e2b", "name": "Gemma 3n E2B", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemma 3n is a multimodal model designed to run locally on hardware, supporting image, text, audio, and video inputs. 
It features a language decoder, audio encoder, and vision encoder, and is available in two sizes: E2B and E4B. The model is optimized for memory efficiency, allowing it to run on devices with limited GPU RAM. Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. Gemma models are well-suited for a variety of content understanding tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as laptops, desktops or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone. Gemma 3n models are designed for efficient execution on low-resource devices. They are capable of multimodal input, handling text, image, video, and audio input, and generating text outputs, with open weights for instruction-tuned variants. These models were trained with data in over 140 spoken languages.", "release_date": "2025-06-26", "announcement_date": "2025-06-26", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-06-01", "param_count": 8000000000, "training_tokens": 11000000000000, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/blog/gemma3n", "source_playground": "https://aistudio.google.com/", "source_paper": null, "source_scorecard_blog_link": "https://ai.google.dev/gemma/docs/gemma-3n", "source_repo_link": null, "source_weights_link": "https://huggingface.co/google/gemma-3n-E2B", "created_at": "2025-07-19T19:49:05.508070+00:00", "updated_at": "2025-07-19T19:49:05.508070+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemma-3n-e2b-it/benchmarks.json ================================================ [ { "model_benchmark_id": 686, "benchmark_id": "aime-2025", "model_id": "gemma-3n-e2b-it", "score": 
0.067, "normalized_score": 0.067, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "Accuracy. 0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.437675+00:00", "updated_at": "2025-07-19T19:56:12.437675+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1327, "benchmark_id": "codegolf-v2.2", "model_id": "gemma-3n-e2b-it", "score": 0.11, "normalized_score": 0.11, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "pass@1. 0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.787794+00:00", "updated_at": "2025-07-19T19:56:13.787794+00:00", "benchmark_name": "Codegolf v2.2" }, { "model_benchmark_id": 1226, "benchmark_id": "eclektic", "model_id": "gemma-3n-e2b-it", "score": 0.025, "normalized_score": 0.025, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.575847+00:00", "updated_at": "2025-07-19T19:56:13.575847+00:00", "benchmark_name": "ECLeKTic" }, { "model_benchmark_id": 1316, "benchmark_id": "global-mmlu", "model_id": "gemma-3n-e2b-it", "score": 0.551, "normalized_score": 0.551, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, 
"created_at": "2025-07-19T19:56:13.758455+00:00", "updated_at": "2025-07-19T19:56:13.758455+00:00", "benchmark_name": "Global-MMLU" }, { "model_benchmark_id": 1218, "benchmark_id": "global-mmlu-lite", "model_id": "gemma-3n-e2b-it", "score": 0.59, "normalized_score": 0.59, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "Accuracy. 0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.560513+00:00", "updated_at": "2025-07-19T19:56:13.560513+00:00", "benchmark_name": "Global-MMLU-Lite" }, { "model_benchmark_id": 280, "benchmark_id": "gpqa", "model_id": "gemma-3n-e2b-it", "score": 0.248, "normalized_score": 0.248, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "Diamond. 0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.641018+00:00", "updated_at": "2025-07-19T19:56:11.641018+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1165, "benchmark_id": "hiddenmath", "model_id": "gemma-3n-e2b-it", "score": 0.277, "normalized_score": 0.277, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "Accuracy. 
0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.451948+00:00", "updated_at": "2025-07-19T19:56:13.451948+00:00", "benchmark_name": "HiddenMath" }, { "model_benchmark_id": 774, "benchmark_id": "humaneval", "model_id": "gemma-3n-e2b-it", "score": 0.665, "normalized_score": 0.665, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "pass@1. 0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.626596+00:00", "updated_at": "2025-07-19T19:56:12.626596+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 1307, "benchmark_id": "include", "model_id": "gemma-3n-e2b-it", "score": 0.386, "normalized_score": 0.386, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.735634+00:00", "updated_at": "2025-07-19T19:56:13.735634+00:00", "benchmark_name": "Include" }, { "model_benchmark_id": 1112, "benchmark_id": "livecodebench", "model_id": "gemma-3n-e2b-it", "score": 0.132, "normalized_score": 0.132, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "pass@1. 
0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.320311+00:00", "updated_at": "2025-07-19T19:56:13.320311+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 1323, "benchmark_id": "livecodebench-v5", "model_id": "gemma-3n-e2b-it", "score": 0.186, "normalized_score": 0.186, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "pass@1. 0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.777049+00:00", "updated_at": "2025-07-19T19:56:13.777049+00:00", "benchmark_name": "LiveCodeBench v5" }, { "model_benchmark_id": 1176, "benchmark_id": "mbpp", "model_id": "gemma-3n-e2b-it", "score": 0.566, "normalized_score": 0.566, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "pass@1. 
3-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.477545+00:00", "updated_at": "2025-07-19T19:56:13.477545+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1278, "benchmark_id": "mgsm", "model_id": "gemma-3n-e2b-it", "score": 0.531, "normalized_score": 0.531, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.679623+00:00", "updated_at": "2025-07-19T19:56:13.679623+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 71, "benchmark_id": "mmlu", "model_id": "gemma-3n-e2b-it", "score": 0.601, "normalized_score": 0.601, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "Accuracy. 0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.234595+00:00", "updated_at": "2025-07-19T19:56:11.234595+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 175, "benchmark_id": "mmlu-pro", "model_id": "gemma-3n-e2b-it", "score": 0.405, "normalized_score": 0.405, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "Accuracy. 
0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.439365+00:00", "updated_at": "2025-07-19T19:56:11.439365+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1312, "benchmark_id": "mmlu-prox", "model_id": "gemma-3n-e2b-it", "score": 0.081, "normalized_score": 0.081, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.746554+00:00", "updated_at": "2025-07-19T19:56:13.746554+00:00", "benchmark_name": "MMLU-ProX" }, { "model_benchmark_id": 1432, "benchmark_id": "openai-mmlu", "model_id": "gemma-3n-e2b-it", "score": 0.223, "normalized_score": 0.223, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.047435+00:00", "updated_at": "2025-07-19T19:56:14.047435+00:00", "benchmark_name": "OpenAI MMLU" }, { "model_benchmark_id": 1234, "benchmark_id": "wmt24++", "model_id": "gemma-3n-e2b-it", "score": 0.427, "normalized_score": 0.427, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it", "verified_by_llmstats": false, "analysis_method": "Character-level F-score. 
0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.592107+00:00", "updated_at": "2025-07-19T19:56:13.592107+00:00", "benchmark_name": "WMT24++" } ] ================================================ FILE: data/organizations/google/models/gemma-3n-e2b-it/model.json ================================================ { "model_id": "gemma-3n-e2b-it", "name": "Gemma 3n E2B Instructed", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemma 3n is a multimodal model designed to run locally on hardware, supporting image, text, audio, and video inputs. It features a language decoder, audio encoder, and vision encoder, and is available in two sizes: E2B and E4B. The model is optimized for memory efficiency, allowing it to run on devices with limited GPU RAM. Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. Gemma models are well-suited for a variety of content understanding tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as laptops, desktops or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone. Gemma 3n models are designed for efficient execution on low-resource devices. They are capable of multimodal input, handling text, image, video, and audio input, and generating text outputs, with open weights for instruction-tuned variants. 
These models were trained with data in over 140 spoken languages.", "release_date": "2025-06-26", "announcement_date": "2025-06-26", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-06-01", "param_count": 8000000000, "training_tokens": 11000000000000, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/blog/gemma3n", "source_playground": "https://aistudio.google.com/", "source_paper": null, "source_scorecard_blog_link": "https://ai.google.dev/gemma/docs/gemma-3n", "source_repo_link": null, "source_weights_link": "https://huggingface.co/google/gemma-3n-E2B-it", "created_at": "2025-07-19T19:49:05.541972+00:00", "updated_at": "2025-07-19T19:49:05.541972+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemma-3n-e2b-it-litert-preview/benchmarks.json ================================================ [ { "model_benchmark_id": 680, "benchmark_id": "aime-2025", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.067, "normalized_score": 0.067, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.419451+00:00", "updated_at": "2025-07-19T19:56:12.419451+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 7, "benchmark_id": "arc-c", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.517, "normalized_score": 0.517, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "25-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:11.095909+00:00", "updated_at": "2025-07-19T19:56:11.095909+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1053, "benchmark_id": "arc-e", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.758, "normalized_score": 0.758, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.199540+00:00", "updated_at": "2025-07-19T19:56:13.199540+00:00", "benchmark_name": "ARC-E" }, { "model_benchmark_id": 1069, "benchmark_id": "big-bench-hard", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.443, "normalized_score": 0.443, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "few-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.229977+00:00", "updated_at": "2025-07-19T19:56:13.229977+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 1019, "benchmark_id": "boolq", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.764, "normalized_score": 0.764, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.123278+00:00", "updated_at": "2025-07-19T19:56:13.123278+00:00", "benchmark_name": "BoolQ" }, { "model_benchmark_id": 1325, "benchmark_id": "codegolf-v2.2", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 
0.11, "normalized_score": 0.11, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.783685+00:00", "updated_at": "2025-07-19T19:56:13.783685+00:00", "benchmark_name": "Codegolf v2.2" }, { "model_benchmark_id": 944, "benchmark_id": "drop", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.539, "normalized_score": 0.539, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "1-shot Token F1 score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.993202+00:00", "updated_at": "2025-07-19T19:56:12.993202+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 1221, "benchmark_id": "eclektic", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.025, "normalized_score": 0.025, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot ECLeKTic score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.567241+00:00", "updated_at": "2025-07-19T19:56:13.567241+00:00", "benchmark_name": "ECLeKTic" }, { "model_benchmark_id": 1314, "benchmark_id": "global-mmlu", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.551, "normalized_score": 0.551, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.754602+00:00", "updated_at": "2025-07-19T19:56:13.754602+00:00", "benchmark_name": "Global-MMLU" }, { "model_benchmark_id": 1208, "benchmark_id": "global-mmlu-lite", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.59, "normalized_score": 0.59, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.542151+00:00", "updated_at": "2025-07-19T19:56:13.542151+00:00", "benchmark_name": "Global-MMLU-Lite" }, { "model_benchmark_id": 265, "benchmark_id": "gpqa", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.248, "normalized_score": 0.248, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "Diamond, 0-shot RelaxedAccuracy/accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.609514+00:00", "updated_at": "2025-07-19T19:56:11.609514+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 35, "benchmark_id": "hellaswag", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.722, "normalized_score": 0.722, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "10-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.154889+00:00", "updated_at": 
"2025-07-19T19:56:11.154889+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 1155, "benchmark_id": "hiddenmath", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.277, "normalized_score": 0.277, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.431354+00:00", "updated_at": "2025-07-19T19:56:13.431354+00:00", "benchmark_name": "HiddenMath" }, { "model_benchmark_id": 764, "benchmark_id": "humaneval", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.665, "normalized_score": 0.665, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.609959+00:00", "updated_at": "2025-07-19T19:56:12.609959+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 1305, "benchmark_id": "include", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.386, "normalized_score": 0.386, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.731041+00:00", "updated_at": "2025-07-19T19:56:13.731041+00:00", "benchmark_name": "Include" }, { "model_benchmark_id": 1103, "benchmark_id": "livecodebench", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.132, "normalized_score": 0.132, 
"is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.298197+00:00", "updated_at": "2025-07-19T19:56:13.298197+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 1319, "benchmark_id": "livecodebench-v5", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.186, "normalized_score": 0.186, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.768006+00:00", "updated_at": "2025-07-19T19:56:13.768006+00:00", "benchmark_name": "LiveCodeBench v5" }, { "model_benchmark_id": 1168, "benchmark_id": "mbpp", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.566, "normalized_score": 0.566, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "3-shot pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.460487+00:00", "updated_at": "2025-07-19T19:56:13.460487+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1274, "benchmark_id": "mgsm", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.531, "normalized_score": 0.531, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.672774+00:00", "updated_at": "2025-07-19T19:56:13.672774+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 65, "benchmark_id": "mmlu", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.601, "normalized_score": 0.601, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.222830+00:00", "updated_at": "2025-07-19T19:56:11.222830+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 165, "benchmark_id": "mmlu-pro", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.405, "normalized_score": 0.405, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.421645+00:00", "updated_at": "2025-07-19T19:56:11.421645+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1310, "benchmark_id": "mmlu-prox", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.081, "normalized_score": 0.081, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.743201+00:00", "updated_at": "2025-07-19T19:56:13.743201+00:00", "benchmark_name": "MMLU-ProX" }, { "model_benchmark_id": 1046, 
"benchmark_id": "natural-questions", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.155, "normalized_score": 0.155, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "5-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.184897+00:00", "updated_at": "2025-07-19T19:56:13.184897+00:00", "benchmark_name": "Natural Questions" }, { "model_benchmark_id": 1028, "benchmark_id": "piqa", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.789, "normalized_score": 0.789, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.142086+00:00", "updated_at": "2025-07-19T19:56:13.142086+00:00", "benchmark_name": "PIQA" }, { "model_benchmark_id": 1037, "benchmark_id": "social-iqa", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.488, "normalized_score": 0.488, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.164056+00:00", "updated_at": "2025-07-19T19:56:13.164056+00:00", "benchmark_name": "Social IQa" }, { "model_benchmark_id": 246, "benchmark_id": "triviaqa", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.608, "normalized_score": 0.608, "is_self_reported": true, "self_reported_source_link": 
"https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "5-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.571204+00:00", "updated_at": "2025-07-19T19:56:11.571204+00:00", "benchmark_name": "TriviaQA" }, { "model_benchmark_id": 1059, "benchmark_id": "winogrande", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.668, "normalized_score": 0.668, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "5-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.210650+00:00", "updated_at": "2025-07-19T19:56:13.210650+00:00", "benchmark_name": "Winogrande" }, { "model_benchmark_id": 1229, "benchmark_id": "wmt24++", "model_id": "gemma-3n-e2b-it-litert-preview", "score": 0.427, "normalized_score": 0.427, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "ChrF, 0-shot Character-level F-score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.582347+00:00", "updated_at": "2025-07-19T19:56:13.582347+00:00", "benchmark_name": "WMT24++" } ] ================================================ FILE: data/organizations/google/models/gemma-3n-e2b-it-litert-preview/model.json ================================================ { "model_id": "gemma-3n-e2b-it-litert-preview", "name": "Gemma 3n E2B Instructed LiteRT (Preview)", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemma 3n is a generative AI model optimized for use in 
everyday devices, such as phones, laptops, and tablets. It features innovations like Per-Layer Embedding (PLE) parameter caching and a MatFormer model architecture for reduced compute and memory. These models handle audio, text, and visual data, though this E4B preview currently supports text and vision input. Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models, and is licensed for responsible commercial use.", "release_date": "2025-05-20", "announcement_date": "2025-05-20", "license_id": "gemma", "multimodal": true, "knowledge_cutoff": "2024-06-01", "param_count": 1910000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://aistudio.google.com/", "source_paper": null, "source_scorecard_blog_link": "https://ai.google.dev/gemma/docs/gemma-3n", "source_repo_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "source_weights_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview", "created_at": "2025-07-19T19:49:05.466473+00:00", "updated_at": "2025-07-19T19:49:05.466473+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemma-3n-e4b/benchmarks.json ================================================ [ { "model_benchmark_id": 5, "benchmark_id": "arc-c", "model_id": "gemma-3n-e4b", "score": 0.616, "normalized_score": 0.616, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B", "verified_by_llmstats": false, "analysis_method": "25-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.091862+00:00", "updated_at": "2025-07-19T19:56:11.091862+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1051, "benchmark_id": "arc-e", "model_id": "gemma-3n-e4b", 
"score": 0.816, "normalized_score": 0.816, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.195091+00:00", "updated_at": "2025-07-19T19:56:13.195091+00:00", "benchmark_name": "ARC-E" }, { "model_benchmark_id": 1066, "benchmark_id": "big-bench-hard", "model_id": "gemma-3n-e4b", "score": 0.529, "normalized_score": 0.529, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B", "verified_by_llmstats": false, "analysis_method": "few-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.225269+00:00", "updated_at": "2025-07-19T19:56:13.225269+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 1017, "benchmark_id": "boolq", "model_id": "gemma-3n-e4b", "score": 0.816, "normalized_score": 0.816, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.120054+00:00", "updated_at": "2025-07-19T19:56:13.120054+00:00", "benchmark_name": "BoolQ" }, { "model_benchmark_id": 942, "benchmark_id": "drop", "model_id": "gemma-3n-e4b", "score": 0.608, "normalized_score": 0.608, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B", "verified_by_llmstats": false, "analysis_method": "Token F1 score. 
1-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.989555+00:00", "updated_at": "2025-07-19T19:56:12.989555+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 33, "benchmark_id": "hellaswag", "model_id": "gemma-3n-e4b", "score": 0.786, "normalized_score": 0.786, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B", "verified_by_llmstats": false, "analysis_method": "10-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.150880+00:00", "updated_at": "2025-07-19T19:56:11.150880+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 1044, "benchmark_id": "natural-questions", "model_id": "gemma-3n-e4b", "score": 0.209, "normalized_score": 0.209, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.181324+00:00", "updated_at": "2025-07-19T19:56:13.181324+00:00", "benchmark_name": "Natural Questions" }, { "model_benchmark_id": 1026, "benchmark_id": "piqa", "model_id": "gemma-3n-e4b", "score": 0.81, "normalized_score": 0.81, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.136080+00:00", "updated_at": "2025-07-19T19:56:13.136080+00:00", "benchmark_name": "PIQA" }, { "model_benchmark_id": 1035, "benchmark_id": "social-iqa", "model_id": "gemma-3n-e4b", "score": 0.5, 
"normalized_score": 0.5, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.159816+00:00", "updated_at": "2025-07-19T19:56:13.159816+00:00", "benchmark_name": "Social IQa" }, { "model_benchmark_id": 244, "benchmark_id": "triviaqa", "model_id": "gemma-3n-e4b", "score": 0.702, "normalized_score": 0.702, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.567693+00:00", "updated_at": "2025-07-19T19:56:11.567693+00:00", "benchmark_name": "TriviaQA" }, { "model_benchmark_id": 1057, "benchmark_id": "winogrande", "model_id": "gemma-3n-e4b", "score": 0.717, "normalized_score": 0.717, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.207598+00:00", "updated_at": "2025-07-19T19:56:13.207598+00:00", "benchmark_name": "Winogrande" } ] ================================================ FILE: data/organizations/google/models/gemma-3n-e4b/model.json ================================================ { "model_id": "gemma-3n-e4b", "name": "Gemma 3n E4B", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemma 3n is a multimodal model designed to run locally on hardware, supporting image, text, audio, and video inputs. 
It features a language decoder, audio encoder, and vision encoder, and is available in two sizes: E2B and E4B. The model is optimized for memory efficiency, allowing it to run on devices with limited GPU RAM. Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. Gemma models are well-suited for a variety of content understanding tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as laptops, desktops or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone. Gemma 3n models are designed for efficient execution on low-resource devices. They are capable of multimodal input, handling text, image, video, and audio input, and generating text outputs, with open weights for instruction-tuned variants. These models were trained with data in over 140 spoken languages.", "release_date": "2025-06-26", "announcement_date": "2025-06-26", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-06-01", "param_count": 8000000000, "training_tokens": 11000000000000, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/blog/gemma3n", "source_playground": "https://aistudio.google.com/", "source_paper": null, "source_scorecard_blog_link": "https://ai.google.dev/gemma/docs/gemma-3n", "source_repo_link": null, "source_weights_link": "https://huggingface.co/google/gemma-3n-E4B", "created_at": "2025-07-19T19:49:05.440084+00:00", "updated_at": "2025-07-19T19:49:05.440084+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemma-3n-e4b-it/benchmarks.json ================================================ [ { "model_benchmark_id": 684, "benchmark_id": "aime-2025", "model_id": "gemma-3n-e4b-it", "score": 
0.116, "normalized_score": 0.116, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "Accuracy. 0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.431148+00:00", "updated_at": "2025-07-19T19:56:12.431148+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1326, "benchmark_id": "codegolf-v2.2", "model_id": "gemma-3n-e4b-it", "score": 0.168, "normalized_score": 0.168, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "pass@1. 0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.785856+00:00", "updated_at": "2025-07-19T19:56:13.785856+00:00", "benchmark_name": "Codegolf v2.2" }, { "model_benchmark_id": 1222, "benchmark_id": "eclektic", "model_id": "gemma-3n-e4b-it", "score": 0.19, "normalized_score": 0.19, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.569227+00:00", "updated_at": "2025-07-19T19:56:13.569227+00:00", "benchmark_name": "ECLeKTic" }, { "model_benchmark_id": 1315, "benchmark_id": "global-mmlu", "model_id": "gemma-3n-e4b-it", "score": 0.603, "normalized_score": 0.603, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, 
"created_at": "2025-07-19T19:56:13.756363+00:00", "updated_at": "2025-07-19T19:56:13.756363+00:00", "benchmark_name": "Global-MMLU" }, { "model_benchmark_id": 1213, "benchmark_id": "global-mmlu-lite", "model_id": "gemma-3n-e4b-it", "score": 0.645, "normalized_score": 0.645, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "Accuracy. 0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.552233+00:00", "updated_at": "2025-07-19T19:56:13.552233+00:00", "benchmark_name": "Global-MMLU-Lite" }, { "model_benchmark_id": 273, "benchmark_id": "gpqa", "model_id": "gemma-3n-e4b-it", "score": 0.237, "normalized_score": 0.237, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "Diamond. 0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.624084+00:00", "updated_at": "2025-07-19T19:56:11.624084+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1159, "benchmark_id": "hiddenmath", "model_id": "gemma-3n-e4b-it", "score": 0.377, "normalized_score": 0.377, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "Accuracy. 
0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.438271+00:00", "updated_at": "2025-07-19T19:56:13.438271+00:00", "benchmark_name": "HiddenMath" }, { "model_benchmark_id": 769, "benchmark_id": "humaneval", "model_id": "gemma-3n-e4b-it", "score": 0.75, "normalized_score": 0.75, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "pass@1. 0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.618954+00:00", "updated_at": "2025-07-19T19:56:12.618954+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 1306, "benchmark_id": "include", "model_id": "gemma-3n-e4b-it", "score": 0.572, "normalized_score": 0.572, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.733461+00:00", "updated_at": "2025-07-19T19:56:13.733461+00:00", "benchmark_name": "Include" }, { "model_benchmark_id": 1106, "benchmark_id": "livecodebench", "model_id": "gemma-3n-e4b-it", "score": 0.132, "normalized_score": 0.132, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "pass@1. 
0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.304919+00:00", "updated_at": "2025-07-19T19:56:13.304919+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 1322, "benchmark_id": "livecodebench-v5", "model_id": "gemma-3n-e4b-it", "score": 0.257, "normalized_score": 0.257, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "pass@1. 0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.775429+00:00", "updated_at": "2025-07-19T19:56:13.775429+00:00", "benchmark_name": "LiveCodeBench v5" }, { "model_benchmark_id": 1171, "benchmark_id": "mbpp", "model_id": "gemma-3n-e4b-it", "score": 0.636, "normalized_score": 0.636, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "pass@1. 
3-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.466832+00:00", "updated_at": "2025-07-19T19:56:13.466832+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1277, "benchmark_id": "mgsm", "model_id": "gemma-3n-e4b-it", "score": 0.67, "normalized_score": 0.67, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.678210+00:00", "updated_at": "2025-07-19T19:56:13.678210+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 70, "benchmark_id": "mmlu", "model_id": "gemma-3n-e4b-it", "score": 0.649, "normalized_score": 0.649, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "Accuracy. 0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.232243+00:00", "updated_at": "2025-07-19T19:56:11.232243+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 169, "benchmark_id": "mmlu-pro", "model_id": "gemma-3n-e4b-it", "score": 0.506, "normalized_score": 0.506, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "Accuracy. 
0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.428457+00:00", "updated_at": "2025-07-19T19:56:11.428457+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1311, "benchmark_id": "mmlu-prox", "model_id": "gemma-3n-e4b-it", "score": 0.199, "normalized_score": 0.199, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.744918+00:00", "updated_at": "2025-07-19T19:56:13.744918+00:00", "benchmark_name": "MMLU-ProX" }, { "model_benchmark_id": 1431, "benchmark_id": "openai-mmlu", "model_id": "gemma-3n-e4b-it", "score": 0.356, "normalized_score": 0.356, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.045887+00:00", "updated_at": "2025-07-19T19:56:14.045887+00:00", "benchmark_name": "OpenAI MMLU" }, { "model_benchmark_id": 1230, "benchmark_id": "wmt24++", "model_id": "gemma-3n-e4b-it", "score": 0.501, "normalized_score": 0.501, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it", "verified_by_llmstats": false, "analysis_method": "Character-level F-score. 
0-shot.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.584588+00:00", "updated_at": "2025-07-19T19:56:13.584588+00:00", "benchmark_name": "WMT24++" } ] ================================================ FILE: data/organizations/google/models/gemma-3n-e4b-it/model.json ================================================ { "model_id": "gemma-3n-e4b-it", "name": "Gemma 3n E4B Instructed", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemma 3n is a multimodal model designed to run locally on hardware, supporting image, text, audio, and video inputs. It features a language decoder, audio encoder, and vision encoder, and is available in two sizes: E2B and E4B. The model is optimized for memory efficiency, allowing it to run on devices with limited GPU RAM. Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. Gemma models are well-suited for a variety of content understanding tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as laptops, desktops or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone. Gemma 3n models are designed for efficient execution on low-resource devices. They are capable of multimodal input, handling text, image, video, and audio input, and generating text outputs, with open weights for instruction-tuned variants. 
These models were trained with data in over 140 spoken languages.", "release_date": "2025-06-26", "announcement_date": "2025-06-26", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-06-01", "param_count": 8000000000, "training_tokens": 11000000000000, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/blog/gemma3n", "source_playground": "https://aistudio.google.com/", "source_paper": null, "source_scorecard_blog_link": "https://ai.google.dev/gemma/docs/gemma-3n", "source_repo_link": null, "source_weights_link": "https://huggingface.co/google/gemma-3n-E4B-it", "created_at": "2025-07-19T19:49:05.517334+00:00", "updated_at": "2025-07-19T19:49:05.517334+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/gemma-3n-e4b-it-litert-preview/benchmarks.json ================================================ [ { "model_benchmark_id": 678, "benchmark_id": "aime-2025", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.116, "normalized_score": 0.116, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.414248+00:00", "updated_at": "2025-07-19T19:56:12.414248+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 6, "benchmark_id": "arc-c", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.616, "normalized_score": 0.616, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "25-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:11.093723+00:00", "updated_at": "2025-07-19T19:56:11.093723+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1052, "benchmark_id": "arc-e", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.816, "normalized_score": 0.816, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.196728+00:00", "updated_at": "2025-07-19T19:56:13.196728+00:00", "benchmark_name": "ARC-E" }, { "model_benchmark_id": 1068, "benchmark_id": "big-bench-hard", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.529, "normalized_score": 0.529, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "few-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.228349+00:00", "updated_at": "2025-07-19T19:56:13.228349+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 1018, "benchmark_id": "boolq", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.816, "normalized_score": 0.816, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.121696+00:00", "updated_at": "2025-07-19T19:56:13.121696+00:00", "benchmark_name": "BoolQ" }, { "model_benchmark_id": 1324, "benchmark_id": "codegolf-v2.2", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 
0.168, "normalized_score": 0.168, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.781222+00:00", "updated_at": "2025-07-19T19:56:13.781222+00:00", "benchmark_name": "Codegolf v2.2" }, { "model_benchmark_id": 943, "benchmark_id": "drop", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.608, "normalized_score": 0.608, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "1-shot Token F1 score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.991359+00:00", "updated_at": "2025-07-19T19:56:12.991359+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 1220, "benchmark_id": "eclektic", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.019, "normalized_score": 0.019, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot ECLeKTic score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.565422+00:00", "updated_at": "2025-07-19T19:56:13.565422+00:00", "benchmark_name": "ECLeKTic" }, { "model_benchmark_id": 1313, "benchmark_id": "global-mmlu", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.603, "normalized_score": 0.603, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.752749+00:00", "updated_at": "2025-07-19T19:56:13.752749+00:00", "benchmark_name": "Global-MMLU" }, { "model_benchmark_id": 1206, "benchmark_id": "global-mmlu-lite", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.645, "normalized_score": 0.645, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.538643+00:00", "updated_at": "2025-07-19T19:56:13.538643+00:00", "benchmark_name": "Global-MMLU-Lite" }, { "model_benchmark_id": 262, "benchmark_id": "gpqa", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.237, "normalized_score": 0.237, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "Diamond, 0-shot RelaxedAccuracy/accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.602493+00:00", "updated_at": "2025-07-19T19:56:11.602493+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 34, "benchmark_id": "hellaswag", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.786, "normalized_score": 0.786, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "10-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.152761+00:00", "updated_at": 
"2025-07-19T19:56:11.152761+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 1154, "benchmark_id": "hiddenmath", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.377, "normalized_score": 0.377, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.429415+00:00", "updated_at": "2025-07-19T19:56:13.429415+00:00", "benchmark_name": "HiddenMath" }, { "model_benchmark_id": 763, "benchmark_id": "humaneval", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.75, "normalized_score": 0.75, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.608423+00:00", "updated_at": "2025-07-19T19:56:12.608423+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 1304, "benchmark_id": "include", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.572, "normalized_score": 0.572, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.729199+00:00", "updated_at": "2025-07-19T19:56:13.729199+00:00", "benchmark_name": "Include" }, { "model_benchmark_id": 1102, "benchmark_id": "livecodebench", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.132, "normalized_score": 0.132, 
"is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.296281+00:00", "updated_at": "2025-07-19T19:56:13.296281+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 1317, "benchmark_id": "livecodebench-v5", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.257, "normalized_score": 0.257, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.761673+00:00", "updated_at": "2025-07-19T19:56:13.761673+00:00", "benchmark_name": "LiveCodeBench v5" }, { "model_benchmark_id": 1167, "benchmark_id": "mbpp", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.636, "normalized_score": 0.636, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "3-shot pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.458570+00:00", "updated_at": "2025-07-19T19:56:13.458570+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1273, "benchmark_id": "mgsm", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.607, "normalized_score": 0.607, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.671283+00:00", "updated_at": "2025-07-19T19:56:13.671283+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 63, "benchmark_id": "mmlu", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.649, "normalized_score": 0.649, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.219372+00:00", "updated_at": "2025-07-19T19:56:11.219372+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 164, "benchmark_id": "mmlu-pro", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.506, "normalized_score": 0.506, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.420000+00:00", "updated_at": "2025-07-19T19:56:11.420000+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1309, "benchmark_id": "mmlu-prox", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.199, "normalized_score": 0.199, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.741460+00:00", "updated_at": "2025-07-19T19:56:13.741460+00:00", "benchmark_name": "MMLU-ProX" }, { "model_benchmark_id": 1045, 
"benchmark_id": "natural-questions", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.209, "normalized_score": 0.209, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "5-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.183031+00:00", "updated_at": "2025-07-19T19:56:13.183031+00:00", "benchmark_name": "Natural Questions" }, { "model_benchmark_id": 1027, "benchmark_id": "piqa", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.81, "normalized_score": 0.81, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.137952+00:00", "updated_at": "2025-07-19T19:56:13.137952+00:00", "benchmark_name": "PIQA" }, { "model_benchmark_id": 1036, "benchmark_id": "social-iqa", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.5, "normalized_score": 0.5, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "0-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.161822+00:00", "updated_at": "2025-07-19T19:56:13.161822+00:00", "benchmark_name": "Social IQa" }, { "model_benchmark_id": 245, "benchmark_id": "triviaqa", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.702, "normalized_score": 0.702, "is_self_reported": true, "self_reported_source_link": 
"https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "5-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.569334+00:00", "updated_at": "2025-07-19T19:56:11.569334+00:00", "benchmark_name": "TriviaQA" }, { "model_benchmark_id": 1058, "benchmark_id": "winogrande", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.717, "normalized_score": 0.717, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "5-shot Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.209229+00:00", "updated_at": "2025-07-19T19:56:13.209229+00:00", "benchmark_name": "Winogrande" }, { "model_benchmark_id": 1228, "benchmark_id": "wmt24++", "model_id": "gemma-3n-e4b-it-litert-preview", "score": 0.501, "normalized_score": 0.501, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "verified_by_llmstats": false, "analysis_method": "ChrF, 0-shot Character-level F-score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.580409+00:00", "updated_at": "2025-07-19T19:56:13.580409+00:00", "benchmark_name": "WMT24++" } ] ================================================ FILE: data/organizations/google/models/gemma-3n-e4b-it-litert-preview/model.json ================================================ { "model_id": "gemma-3n-e4b-it-litert-preview", "name": "Gemma 3n E4B Instructed LiteRT Preview", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "Gemma 3n is a generative AI model optimized for use in 
everyday devices, such as phones, laptops, and tablets. It features innovations like Per-Layer Embedding (PLE) parameter caching and a MatFormer model architecture for reduced compute and memory. These models handle audio, text, and visual data, though this E4B preview currently supports text and vision input. Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models, and is licensed for responsible commercial use.", "release_date": "2025-05-20", "announcement_date": "2025-05-20", "license_id": "gemma", "multimodal": true, "knowledge_cutoff": "2024-06-01", "param_count": 1910000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://aistudio.google.com/", "source_paper": null, "source_scorecard_blog_link": "https://ai.google.dev/gemma/docs/gemma-3n", "source_repo_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "source_weights_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview", "created_at": "2025-07-19T19:49:05.451978+00:00", "updated_at": "2025-07-19T19:49:05.451978+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/models/medgemma-4b-it/benchmarks.json ================================================ [ { "model_benchmark_id": 1425, "benchmark_id": "chexpert-cxr", "model_id": "medgemma-4b-it", "score": 0.481, "normalized_score": 0.481, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/medgemma-4b-it", "verified_by_llmstats": false, "analysis_method": "Average F1 for top 5 conditions", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.023334+00:00", "updated_at": "2025-07-19T19:56:14.023334+00:00", "benchmark_name": "CheXpert CXR" }, { "model_benchmark_id": 1426, 
"benchmark_id": "dermmcqa", "model_id": "medgemma-4b-it", "score": 0.718, "normalized_score": 0.718, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/medgemma-4b-it", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.026812+00:00", "updated_at": "2025-07-19T19:56:14.026812+00:00", "benchmark_name": "DermMCQA" }, { "model_benchmark_id": 1430, "benchmark_id": "medxpertqa", "model_id": "medgemma-4b-it", "score": 0.188, "normalized_score": 0.188, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/medgemma-4b-it", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.042823+00:00", "updated_at": "2025-07-19T19:56:14.042823+00:00", "benchmark_name": "MedXpertQA" }, { "model_benchmark_id": 1424, "benchmark_id": "mimic-cxr", "model_id": "medgemma-4b-it", "score": 0.889, "normalized_score": 0.889, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/medgemma-4b-it", "verified_by_llmstats": false, "analysis_method": "Average F1 for top 5 conditions", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.019964+00:00", "updated_at": "2025-07-19T19:56:14.019964+00:00", "benchmark_name": "MIMIC CXR" }, { "model_benchmark_id": 1429, "benchmark_id": "pathmcqa", "model_id": "medgemma-4b-it", "score": 0.698, "normalized_score": 0.698, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/medgemma-4b-it", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": 
null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.039089+00:00", "updated_at": "2025-07-19T19:56:14.039089+00:00", "benchmark_name": "PathMCQA" }, { "model_benchmark_id": 1427, "benchmark_id": "slakevqa", "model_id": "medgemma-4b-it", "score": 0.623, "normalized_score": 0.623, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/medgemma-4b-it", "verified_by_llmstats": false, "analysis_method": "Tokenized F1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.029835+00:00", "updated_at": "2025-07-19T19:56:14.029835+00:00", "benchmark_name": "SlakeVQA" }, { "model_benchmark_id": 1428, "benchmark_id": "vqa-rad", "model_id": "medgemma-4b-it", "score": 0.499, "normalized_score": 0.499, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/google/medgemma-4b-it", "verified_by_llmstats": false, "analysis_method": "Tokenized F1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.035504+00:00", "updated_at": "2025-07-19T19:56:14.035504+00:00", "benchmark_name": "VQA-Rad" } ] ================================================ FILE: data/organizations/google/models/medgemma-4b-it/model.json ================================================ { "model_id": "medgemma-4b-it", "name": "MedGemma 4B IT", "organization_id": "google", "fine_tuned_from_model_id": null, "description": "MedGemma is a collection of Gemma 3 variants that are trained for performance on medical text and image comprehension. MedGemma 4B utilizes a SigLIP image encoder that has been specifically pre-trained on a variety of de-identified medical data, including chest X-rays, dermatology images, ophthalmology images, and histopathology slides. 
Its LLM component is trained on a diverse set of medical data, including radiology images, histopathology patches, ophthalmology images, and dermatology images. MedGemma is a multimodal model primarily evaluated on single-image tasks. It has not been evaluated for multi-turn applications and may be more sensitive to specific prompts than its predecessor, Gemma 3. Developers should consider bias in validation data and data contamination concerns when using MedGemma.", "release_date": "2025-05-20", "announcement_date": "2025-05-20", "license_id": "health_ai_developer_foundations_terms_of_use", "multimodal": true, "knowledge_cutoff": null, "param_count": 4300000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://developers.google.com/health-ai-developer-foundations/medgemma/get-started", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://developers.google.com/health-ai-developer-foundations/medgemma/model-card", "source_repo_link": null, "source_weights_link": "https://huggingface.co/google/medgemma-4b-it", "created_at": "2025-07-19T19:49:05.511963+00:00", "updated_at": "2025-07-19T19:49:05.511963+00:00", "model_family_id": null } ================================================ FILE: data/organizations/google/organization.json ================================================ { "organization_id": "google", "name": "Google", "website": "https://google.com", "description": "Technology giant with AI research", "country": "US", "created_at": "2025-07-19T19:49:05.437977+00:00", "updated_at": "2025-07-19T19:49:05.437977+00:00" } ================================================ FILE: data/organizations/ibm/models/granite-3.3-8b-base/benchmarks.json ================================================ [ { "model_benchmark_id": 1409, "benchmark_id": "agieval", "model_id": "granite-3.3-8b-base", "score": 0.493, "normalized_score": 0.493, "is_self_reported": true, "self_reported_source_link": 
"https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.976963+00:00", "updated_at": "2025-07-19T19:56:13.976963+00:00", "benchmark_name": "AGIEval" }, { "model_benchmark_id": 477, "benchmark_id": "aime-2024", "model_id": "granite-3.3-8b-base", "score": 0.812, "normalized_score": 0.812, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "Not specified", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.006332+00:00", "updated_at": "2025-07-19T19:56:12.006332+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 1794, "benchmark_id": "alpacaeval-2.0", "model_id": "granite-3.3-8b-base", "score": 0.6268, "normalized_score": 0.6268, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.048676+00:00", "updated_at": "2025-07-19T19:56:15.048676+00:00", "benchmark_name": "AlpacaEval 2.0" }, { "model_benchmark_id": 23, "benchmark_id": "arc-c", "model_id": "granite-3.3-8b-base", "score": 0.5084, "normalized_score": 0.5084, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.131347+00:00", "updated_at": 
"2025-07-19T19:56:11.131347+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1460, "benchmark_id": "arena-hard", "model_id": "granite-3.3-8b-base", "score": 0.5756, "normalized_score": 0.5756, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "Arena Hard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.111734+00:00", "updated_at": "2025-07-19T19:56:14.111734+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 1807, "benchmark_id": "attaq", "model_id": "granite-3.3-8b-base", "score": 0.885, "normalized_score": 0.885, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "Not specified (OLMES)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.087212+00:00", "updated_at": "2025-07-19T19:56:15.087212+00:00", "benchmark_name": "AttaQ" }, { "model_benchmark_id": 1081, "benchmark_id": "big-bench-hard", "model_id": "granite-3.3-8b-base", "score": 0.6913, "normalized_score": 0.6913, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "OLMES (Added regex for more efficient answer extraction)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.251020+00:00", "updated_at": "2025-07-19T19:56:13.251020+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 955, "benchmark_id": "drop", "model_id": "granite-3.3-8b-base", "score": 0.3614, "normalized_score": 0.3614, "is_self_reported": true, 
"self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.012196+00:00", "updated_at": "2025-07-19T19:56:13.012196+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 1004, "benchmark_id": "gsm8k", "model_id": "granite-3.3-8b-base", "score": 0.59, "normalized_score": 0.59, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.098078+00:00", "updated_at": "2025-07-19T19:56:13.098078+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 49, "benchmark_id": "hellaswag", "model_id": "granite-3.3-8b-base", "score": 0.801, "normalized_score": 0.801, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.186799+00:00", "updated_at": "2025-07-19T19:56:11.186799+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 798, "benchmark_id": "humaneval", "model_id": "granite-3.3-8b-base", "score": 0.8973, "normalized_score": 0.8973, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "OLMES", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.666882+00:00", "updated_at": 
"2025-07-19T19:56:12.666882+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 1444, "benchmark_id": "humaneval+", "model_id": "granite-3.3-8b-base", "score": 0.8609, "normalized_score": 0.8609, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "OLMES", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.078662+00:00", "updated_at": "2025-07-19T19:56:14.078662+00:00", "benchmark_name": "HumanEval+" }, { "model_benchmark_id": 626, "benchmark_id": "ifeval", "model_id": "granite-3.3-8b-base", "score": 0.7482, "normalized_score": 0.7482, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "OLMES", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.288064+00:00", "updated_at": "2025-07-19T19:56:12.288064+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 508, "benchmark_id": "math-500", "model_id": "granite-3.3-8b-base", "score": 0.6902, "normalized_score": 0.6902, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "Not specified", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.056690+00:00", "updated_at": "2025-07-19T19:56:12.056690+00:00", "benchmark_name": "MATH-500" }, { "model_benchmark_id": 101, "benchmark_id": "mmlu", "model_id": "granite-3.3-8b-base", "score": 0.6389, "normalized_score": 0.6389, "is_self_reported": true, "self_reported_source_link": 
"https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.290899+00:00", "updated_at": "2025-07-19T19:56:11.290899+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1808, "benchmark_id": "nq", "model_id": "granite-3.3-8b-base", "score": 0.365, "normalized_score": 0.365, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.090844+00:00", "updated_at": "2025-07-19T19:56:15.090844+00:00", "benchmark_name": "NQ" }, { "model_benchmark_id": 1804, "benchmark_id": "popqa", "model_id": "granite-3.3-8b-base", "score": 0.2617, "normalized_score": 0.2617, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.078883+00:00", "updated_at": "2025-07-19T19:56:15.078883+00:00", "benchmark_name": "PopQA" }, { "model_benchmark_id": 250, "benchmark_id": "triviaqa", "model_id": "granite-3.3-8b-base", "score": 0.7818, "normalized_score": 0.7818, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.577753+00:00", "updated_at": "2025-07-19T19:56:11.577753+00:00", 
"benchmark_name": "TriviaQA" }, { "model_benchmark_id": 142, "benchmark_id": "truthfulqa", "model_id": "granite-3.3-8b-base", "score": 0.5215, "normalized_score": 0.5215, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.362380+00:00", "updated_at": "2025-07-19T19:56:11.362380+00:00", "benchmark_name": "TruthfulQA" }, { "model_benchmark_id": 152, "benchmark_id": "winogrande", "model_id": "granite-3.3-8b-base", "score": 0.744, "normalized_score": 0.744, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.387990+00:00", "updated_at": "2025-07-19T19:56:11.387990+00:00", "benchmark_name": "Winogrande" } ] ================================================ FILE: data/organizations/ibm/models/granite-3.3-8b-base/model.json ================================================ { "model_id": "granite-3.3-8b-base", "name": "Granite 3.3 8B Base", "organization_id": "ibm", "fine_tuned_from_model_id": null, "description": "Granite-3.3-8B-Base is a decoder-only language model with a 128K token context window. It improves upon Granite-3.1-8B-Base by adding support for Fill-in-the-Middle (FIM) using specialized tokens, enabling the model to generate content conditioned on both prefix and suffix. 
This makes it well-suited for code completion tasks", "release_date": "2025-04-16", "announcement_date": "2025-04-16", "license_id": "apache_2_0", "multimodal": true, "knowledge_cutoff": "2024-04-01", "param_count": 8170000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://www.ibm.com/granite/docs/", "source_playground": "https://www.ibm.com/granite/playground/", "source_paper": null, "source_scorecard_blog_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "source_repo_link": "https://github.com/ibm-granite/granite-3.3-language-models", "source_weights_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base", "created_at": "2025-07-19T19:49:05.727013+00:00", "updated_at": "2025-07-19T19:49:05.727013+00:00", "model_family_id": null } ================================================ FILE: data/organizations/ibm/models/granite-3.3-8b-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 476, "benchmark_id": "aime-2024", "model_id": "granite-3.3-8b-instruct", "score": 0.812, "normalized_score": 0.812, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct", "verified_by_llmstats": false, "analysis_method": "Not specified", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.004852+00:00", "updated_at": "2025-07-19T19:56:12.004852+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 1793, "benchmark_id": "alpacaeval-2.0", "model_id": "granite-3.3-8b-instruct", "score": 0.6268, "normalized_score": 0.6268, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, 
"created_at": "2025-07-19T19:56:15.046908+00:00", "updated_at": "2025-07-19T19:56:15.046908+00:00", "benchmark_name": "AlpacaEval 2.0" }, { "model_benchmark_id": 1459, "benchmark_id": "arena-hard", "model_id": "granite-3.3-8b-instruct", "score": 0.5756, "normalized_score": 0.5756, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct", "verified_by_llmstats": false, "analysis_method": "Arena Hard benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.110277+00:00", "updated_at": "2025-07-19T19:56:14.110277+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 1806, "benchmark_id": "attaq", "model_id": "granite-3.3-8b-instruct", "score": 0.885, "normalized_score": 0.885, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct", "verified_by_llmstats": false, "analysis_method": "Not specified (OLMES)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.085492+00:00", "updated_at": "2025-07-19T19:56:15.085492+00:00", "benchmark_name": "AttaQ" }, { "model_benchmark_id": 1080, "benchmark_id": "big-bench-hard", "model_id": "granite-3.3-8b-instruct", "score": 0.6913, "normalized_score": 0.6913, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct", "verified_by_llmstats": false, "analysis_method": "OLMES (Added regex for more efficient answer extraction)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.249459+00:00", "updated_at": "2025-07-19T19:56:13.249459+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 954, "benchmark_id": 
"drop", "model_id": "granite-3.3-8b-instruct", "score": 0.5936, "normalized_score": 0.5936, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct", "verified_by_llmstats": false, "analysis_method": "OLMES (Modified implementation)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.010691+00:00", "updated_at": "2025-07-19T19:56:13.010691+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 1003, "benchmark_id": "gsm8k", "model_id": "granite-3.3-8b-instruct", "score": 0.8089, "normalized_score": 0.8089, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct", "verified_by_llmstats": false, "analysis_method": "OLMES", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.095998+00:00", "updated_at": "2025-07-19T19:56:13.095998+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 797, "benchmark_id": "humaneval", "model_id": "granite-3.3-8b-instruct", "score": 0.8973, "normalized_score": 0.8973, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct", "verified_by_llmstats": false, "analysis_method": "OLMES", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.665403+00:00", "updated_at": "2025-07-19T19:56:12.665403+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 1443, "benchmark_id": "humaneval+", "model_id": "granite-3.3-8b-instruct", "score": 0.8609, "normalized_score": 0.8609, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct", "verified_by_llmstats": false, "analysis_method": "OLMES", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.076877+00:00", "updated_at": "2025-07-19T19:56:14.076877+00:00", "benchmark_name": "HumanEval+" }, { "model_benchmark_id": 625, "benchmark_id": "ifeval", "model_id": "granite-3.3-8b-instruct", "score": 0.7482, "normalized_score": 0.7482, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct", "verified_by_llmstats": false, "analysis_method": "OLMES", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.286600+00:00", "updated_at": "2025-07-19T19:56:12.286600+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 507, "benchmark_id": "math-500", "model_id": "granite-3.3-8b-instruct", "score": 0.6902, "normalized_score": 0.6902, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct", "verified_by_llmstats": false, "analysis_method": "Not specified", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.054762+00:00", "updated_at": "2025-07-19T19:56:12.054762+00:00", "benchmark_name": "MATH-500" }, { "model_benchmark_id": 100, "benchmark_id": "mmlu", "model_id": "granite-3.3-8b-instruct", "score": 0.6554, "normalized_score": 0.6554, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.288937+00:00", "updated_at": "2025-07-19T19:56:11.288937+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1803, 
"benchmark_id": "popqa", "model_id": "granite-3.3-8b-instruct", "score": 0.2617, "normalized_score": 0.2617, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.077308+00:00", "updated_at": "2025-07-19T19:56:15.077308+00:00", "benchmark_name": "PopQA" }, { "model_benchmark_id": 141, "benchmark_id": "truthfulqa", "model_id": "granite-3.3-8b-instruct", "score": 0.6686, "normalized_score": 0.6686, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.360858+00:00", "updated_at": "2025-07-19T19:56:11.360858+00:00", "benchmark_name": "TruthfulQA" } ] ================================================ FILE: data/organizations/ibm/models/granite-3.3-8b-instruct/model.json ================================================ { "model_id": "granite-3.3-8b-instruct", "name": "Granite 3.3 8B Instruct", "organization_id": "ibm", "fine_tuned_from_model_id": null, "description": "Granite 3.3 models feature enhanced reasoning capabilities and support for Fill-in-the-Middle (FIM) code completion. They are built on a foundation of open-source instruction datasets with permissive licenses, alongside internally curated synthetic datasets tailored for long-context problem-solving. These models preserve the key strengths of previous Granite versions, including support for a 128K context length, strong performance in retrieval-augmented generation (RAG) and function calling, and controls for response length and originality. 
Granite 3.3 also delivers competitive results across general, enterprise, and safety benchmarks. Released as open source, the models are available under the Apache 2.0 license.", "release_date": "2025-04-16", "announcement_date": "2025-04-16", "license_id": "apache_2_0", "multimodal": true, "knowledge_cutoff": "2024-04-01", "param_count": 8000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://www.ibm.com/granite/docs/", "source_playground": "https://www.ibm.com/granite/playground/", "source_paper": null, "source_scorecard_blog_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct", "source_repo_link": "https://github.com/ibm-granite/granite-3.3-language-models", "source_weights_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct", "created_at": "2025-07-19T19:49:05.723958+00:00", "updated_at": "2025-07-19T19:49:05.723958+00:00", "model_family_id": null } ================================================ FILE: data/organizations/ibm/models/granite-4.0-tiny-preview/benchmarks.json ================================================ [ { "model_benchmark_id": 1792, "benchmark_id": "alpacaeval-2.0", "model_id": "granite-4.0-tiny-preview", "score": 0.3516, "normalized_score": 0.3516, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-4.0-tiny-preview", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.045290+00:00", "updated_at": "2025-07-19T19:56:15.045290+00:00", "benchmark_name": "AlpacaEval 2.0" }, { "model_benchmark_id": 1458, "benchmark_id": "arena-hard", "model_id": "granite-4.0-tiny-preview", "score": 0.267, "normalized_score": 0.267, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-4.0-tiny-preview", "verified_by_llmstats": false, 
"analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.108397+00:00", "updated_at": "2025-07-19T19:56:14.108397+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 1805, "benchmark_id": "attaq", "model_id": "granite-4.0-tiny-preview", "score": 0.861, "normalized_score": 0.861, "is_self_reported": true, "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.083480+00:00", "updated_at": "2025-07-19T19:56:15.083480+00:00", "benchmark_name": "AttaQ" }, { "model_benchmark_id": 1079, "benchmark_id": "big-bench-hard", "model_id": "granite-4.0-tiny-preview", "score": 0.557, "normalized_score": 0.557, "is_self_reported": true, "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.247228+00:00", "updated_at": "2025-07-19T19:56:13.247228+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 953, "benchmark_id": "drop", "model_id": "granite-4.0-tiny-preview", "score": 0.462, "normalized_score": 0.462, "is_self_reported": true, "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.009229+00:00", "updated_at": 
"2025-07-19T19:56:13.009229+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 1002, "benchmark_id": "gsm8k", "model_id": "granite-4.0-tiny-preview", "score": 0.701, "normalized_score": 0.701, "is_self_reported": true, "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.094422+00:00", "updated_at": "2025-07-19T19:56:13.094422+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 796, "benchmark_id": "humaneval", "model_id": "granite-4.0-tiny-preview", "score": 0.824, "normalized_score": 0.824, "is_self_reported": true, "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.663900+00:00", "updated_at": "2025-07-19T19:56:12.663900+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 1442, "benchmark_id": "humaneval+", "model_id": "granite-4.0-tiny-preview", "score": 0.783, "normalized_score": 0.783, "is_self_reported": true, "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.074105+00:00", "updated_at": "2025-07-19T19:56:14.074105+00:00", "benchmark_name": "HumanEval+" }, { "model_benchmark_id": 624, "benchmark_id": "ifeval", "model_id": "granite-4.0-tiny-preview", "score": 0.63, "normalized_score": 0.63, "is_self_reported": true, 
"self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.285068+00:00", "updated_at": "2025-07-19T19:56:12.285068+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 99, "benchmark_id": "mmlu", "model_id": "granite-4.0-tiny-preview", "score": 0.604, "normalized_score": 0.604, "is_self_reported": true, "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.287184+00:00", "updated_at": "2025-07-19T19:56:11.287184+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1802, "benchmark_id": "popqa", "model_id": "granite-4.0-tiny-preview", "score": 0.229, "normalized_score": 0.229, "is_self_reported": true, "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.075622+00:00", "updated_at": "2025-07-19T19:56:15.075622+00:00", "benchmark_name": "PopQA" }, { "model_benchmark_id": 140, "benchmark_id": "truthfulqa", "model_id": "granite-4.0-tiny-preview", "score": 0.581, "normalized_score": 0.581, "is_self_reported": true, "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.358910+00:00", "updated_at": "2025-07-19T19:56:11.358910+00:00", "benchmark_name": "TruthfulQA" } ] ================================================ FILE: data/organizations/ibm/models/granite-4.0-tiny-preview/model.json ================================================ { "model_id": "granite-4.0-tiny-preview", "name": "IBM Granite 4.0 Tiny Preview", "organization_id": "ibm", "fine_tuned_from_model_id": null, "description": "A preliminary version of the smallest model in the upcoming Granite 4.0 family, released May 2025. It utilizes a novel hybrid Mamba-2/Transformer, fine-grained mixture of experts (MoE) architecture (7B total parameters, 1B active at inference). This preview version is partially trained (2.5T tokens) but demonstrates significant memory efficiency and performance potential, validated for at least 128K context length without positional encoding.", "release_date": "2025-05-02", "announcement_date": "2025-05-02", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 7000000000, "training_tokens": 2500000000000, "available_in_zeroeval": true, "source_api_ref": "https://www.ibm.com/granite/docs/", "source_playground": "https://www.ibm.com/granite/playground/", "source_paper": null, "source_scorecard_blog_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek", "source_repo_link": null, "source_weights_link": "https://huggingface.co/ibm-granite/granite-4.0-tiny-preview", "created_at": "2025-07-19T19:49:05.720766+00:00", "updated_at": "2025-07-19T19:49:05.720766+00:00", "model_family_id": null } ================================================ FILE: data/organizations/ibm/organization.json ================================================ { "organization_id": "ibm", "name": "IBM", "website": "https://ibm.com", "description": "Technology and consulting company", "country": null, "created_at": 
"2025-07-19T19:49:05.719047+00:00", "updated_at": "2025-07-19T19:49:05.719047+00:00" } ================================================ FILE: data/organizations/meta/models/llama-3.1-405b-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 1562, "benchmark_id": "api-bank", "model_id": "llama-3.1-405b-instruct", "score": 0.92, "normalized_score": 0.92, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.382379+00:00", "updated_at": "2025-07-19T19:56:14.382379+00:00", "benchmark_name": "API-Bank" }, { "model_benchmark_id": 16, "benchmark_id": "arc-c", "model_id": "llama-3.1-405b-instruct", "score": 0.969, "normalized_score": 0.969, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.118562+00:00", "updated_at": "2025-07-19T19:56:11.118562+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 848, "benchmark_id": "bfcl", "model_id": "llama-3.1-405b-instruct", "score": 0.885, "normalized_score": 0.885, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.775431+00:00", "updated_at": "2025-07-19T19:56:12.775431+00:00", "benchmark_name": "BFCL" }, { "model_benchmark_id": 950, 
"benchmark_id": "drop", "model_id": "llama-3.1-405b-instruct", "score": 0.848, "normalized_score": 0.848, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2407.21783", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.004517+00:00", "updated_at": "2025-07-19T19:56:13.004517+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 1565, "benchmark_id": "gorilla-benchmark-api-bench", "model_id": "llama-3.1-405b-instruct", "score": 0.353, "normalized_score": 0.353, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.390263+00:00", "updated_at": "2025-07-19T19:56:14.390263+00:00", "benchmark_name": "Gorilla Benchmark API Bench" }, { "model_benchmark_id": 291, "benchmark_id": "gpqa", "model_id": "llama-3.1-405b-instruct", "score": 0.507, "normalized_score": 0.507, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.662460+00:00", "updated_at": "2025-07-19T19:56:11.662460+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 988, "benchmark_id": "gsm8k", "model_id": "llama-3.1-405b-instruct", "score": 0.968, "normalized_score": 0.968, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "8-shot, 
CoT, em_maj1@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.071677+00:00", "updated_at": "2025-07-19T19:56:13.071677+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 780, "benchmark_id": "humaneval", "model_id": "llama-3.1-405b-instruct", "score": 0.89, "normalized_score": 0.89, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.636480+00:00", "updated_at": "2025-07-19T19:56:12.636480+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 616, "benchmark_id": "ifeval", "model_id": "llama-3.1-405b-instruct", "score": 0.886, "normalized_score": 0.886, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "Standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.270752+00:00", "updated_at": "2025-07-19T19:56:12.270752+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 394, "benchmark_id": "math", "model_id": "llama-3.1-405b-instruct", "score": 0.738, "normalized_score": 0.738, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, CoT, final_em", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.846056+00:00", "updated_at": "2025-07-19T19:56:11.846056+00:00", "benchmark_name": "MATH" }, { 
"model_benchmark_id": 1578, "benchmark_id": "mbpp-evalplus", "model_id": "llama-3.1-405b-instruct", "score": 0.886, "normalized_score": 0.886, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, base, pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.428183+00:00", "updated_at": "2025-07-19T19:56:14.428183+00:00", "benchmark_name": "MBPP EvalPlus" }, { "model_benchmark_id": 79, "benchmark_id": "mmlu", "model_id": "llama-3.1-405b-instruct", "score": 0.873, "normalized_score": 0.873, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "5-shot, macro_avg/acc", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.249582+00:00", "updated_at": "2025-07-19T19:56:11.249582+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1548, "benchmark_id": "mmlu-(cot)", "model_id": "llama-3.1-405b-instruct", "score": 0.886, "normalized_score": 0.886, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, macro_avg/acc", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.339647+00:00", "updated_at": "2025-07-19T19:56:14.339647+00:00", "benchmark_name": "MMLU (CoT)" }, { "model_benchmark_id": 186, "benchmark_id": "mmlu-pro", "model_id": "llama-3.1-405b-instruct", "score": 0.733, "normalized_score": 0.733, "is_self_reported": true, "self_reported_source_link": 
"https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "5-shot, CoT, micro_avg/acc_char", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.458814+00:00", "updated_at": "2025-07-19T19:56:11.458814+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1572, "benchmark_id": "multilingual-mgsm-(cot)", "model_id": "llama-3.1-405b-instruct", "score": 0.916, "normalized_score": 0.916, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, CoT, em", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.409472+00:00", "updated_at": "2025-07-19T19:56:14.409472+00:00", "benchmark_name": "Multilingual MGSM (CoT)" }, { "model_benchmark_id": 1552, "benchmark_id": "multipl-e-humaneval", "model_id": "llama-3.1-405b-instruct", "score": 0.752, "normalized_score": 0.752, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.352505+00:00", "updated_at": "2025-07-19T19:56:14.352505+00:00", "benchmark_name": "Multipl-E HumanEval" }, { "model_benchmark_id": 1555, "benchmark_id": "multipl-e-mbpp", "model_id": "llama-3.1-405b-instruct", "score": 0.657, "normalized_score": 0.657, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, pass@1", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.359473+00:00", "updated_at": "2025-07-19T19:56:14.359473+00:00", "benchmark_name": "Multipl-E MBPP" }, { "model_benchmark_id": 1568, "benchmark_id": "nexus", "model_id": "llama-3.1-405b-instruct", "score": 0.587, "normalized_score": 0.587, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, macro_avg/acc", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.398966+00:00", "updated_at": "2025-07-19T19:56:14.398966+00:00", "benchmark_name": "Nexus" } ] ================================================ FILE: data/organizations/meta/models/llama-3.1-405b-instruct/model.json ================================================ { "model_id": "llama-3.1-405b-instruct", "name": "Llama 3.1 405B Instruct", "organization_id": "meta", "fine_tuned_from_model_id": null, "description": "Llama 3.1 405B Instruct is a large language model optimized for multilingual dialogue use cases. It outperforms many available open source and closed chat models on common industry benchmarks. 
The model supports 8 languages and has a 128K token context length.", "release_date": "2024-07-23", "announcement_date": "2024-07-23", "license_id": "llama_3_1_community_license", "multimodal": false, "knowledge_cutoff": null, "param_count": 405000000000, "training_tokens": 15000000000000, "available_in_zeroeval": true, "source_api_ref": "https://github.com/meta-llama/llama-models", "source_playground": "https://llama.meta.com/llama-downloads", "source_paper": null, "source_scorecard_blog_link": "https://ai.meta.com/blog/meta-llama-3-1/", "source_repo_link": "https://github.com/meta-llama/llama-models", "source_weights_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct", "created_at": "2025-07-19T19:49:05.585389+00:00", "updated_at": "2025-07-19T19:49:05.585389+00:00", "model_family_id": null } ================================================ FILE: data/organizations/meta/models/llama-3.1-70b-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 1560, "benchmark_id": "api-bank", "model_id": "llama-3.1-70b-instruct", "score": 0.9, "normalized_score": 0.9, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.378301+00:00", "updated_at": "2025-07-19T19:56:14.378301+00:00", "benchmark_name": "API-Bank" }, { "model_benchmark_id": 14, "benchmark_id": "arc-c", "model_id": "llama-3.1-70b-instruct", "score": 0.948, "normalized_score": 0.948, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": 
null, "created_at": "2025-07-19T19:56:11.113697+00:00", "updated_at": "2025-07-19T19:56:11.113697+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 846, "benchmark_id": "bfcl", "model_id": "llama-3.1-70b-instruct", "score": 0.848, "normalized_score": 0.848, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.771784+00:00", "updated_at": "2025-07-19T19:56:12.771784+00:00", "benchmark_name": "BFCL" }, { "model_benchmark_id": 948, "benchmark_id": "drop", "model_id": "llama-3.1-70b-instruct", "score": 0.796, "normalized_score": 0.796, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2407.21783", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.001514+00:00", "updated_at": "2025-07-19T19:56:13.001514+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 1563, "benchmark_id": "gorilla-benchmark-api-bench", "model_id": "llama-3.1-70b-instruct", "score": 0.297, "normalized_score": 0.297, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.386457+00:00", "updated_at": "2025-07-19T19:56:14.386457+00:00", "benchmark_name": "Gorilla Benchmark API Bench" }, { "model_benchmark_id": 288, "benchmark_id": "gpqa", "model_id": "llama-3.1-70b-instruct", "score": 0.417, "normalized_score": 0.417, 
"is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.657221+00:00", "updated_at": "2025-07-19T19:56:11.657221+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1556, "benchmark_id": "gsm-8k-(cot)", "model_id": "llama-3.1-70b-instruct", "score": 0.951, "normalized_score": 0.951, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "8-shot Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.362878+00:00", "updated_at": "2025-07-19T19:56:14.362878+00:00", "benchmark_name": "GSM-8K (CoT)" }, { "model_benchmark_id": 778, "benchmark_id": "humaneval", "model_id": "llama-3.1-70b-instruct", "score": 0.805, "normalized_score": 0.805, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.632931+00:00", "updated_at": "2025-07-19T19:56:12.632931+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 614, "benchmark_id": "ifeval", "model_id": "llama-3.1-70b-instruct", "score": 0.875, "normalized_score": 0.875, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.266791+00:00", "updated_at": "2025-07-19T19:56:12.266791+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1558, "benchmark_id": "math-(cot)", "model_id": "llama-3.1-70b-instruct", "score": 0.68, "normalized_score": 0.68, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.371489+00:00", "updated_at": "2025-07-19T19:56:14.371489+00:00", "benchmark_name": "MATH (CoT)" }, { "model_benchmark_id": 1549, "benchmark_id": "mbpp-++-base-version", "model_id": "llama-3.1-70b-instruct", "score": 0.86, "normalized_score": 0.86, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.344061+00:00", "updated_at": "2025-07-19T19:56:14.344061+00:00", "benchmark_name": "MBPP ++ base version" }, { "model_benchmark_id": 76, "benchmark_id": "mmlu", "model_id": "llama-3.1-70b-instruct", "score": 0.836, "normalized_score": 0.836, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.243294+00:00", "updated_at": "2025-07-19T19:56:11.243294+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1546, "benchmark_id": "mmlu-(cot)", 
"model_id": "llama-3.1-70b-instruct", "score": 0.86, "normalized_score": 0.86, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.334507+00:00", "updated_at": "2025-07-19T19:56:14.334507+00:00", "benchmark_name": "MMLU (CoT)" }, { "model_benchmark_id": 184, "benchmark_id": "mmlu-pro", "model_id": "llama-3.1-70b-instruct", "score": 0.664, "normalized_score": 0.664, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "5-shot Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.455089+00:00", "updated_at": "2025-07-19T19:56:11.455089+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1570, "benchmark_id": "multilingual-mgsm-(cot)", "model_id": "llama-3.1-70b-instruct", "score": 0.869, "normalized_score": 0.869, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot Chain-of-Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.405488+00:00", "updated_at": "2025-07-19T19:56:14.405488+00:00", "benchmark_name": "Multilingual MGSM (CoT)" }, { "model_benchmark_id": 1550, "benchmark_id": "multipl-e-humaneval", "model_id": "llama-3.1-70b-instruct", "score": 0.655, "normalized_score": 0.655, "is_self_reported": true, "self_reported_source_link": 
"https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.347431+00:00", "updated_at": "2025-07-19T19:56:14.347431+00:00", "benchmark_name": "Multipl-E HumanEval" }, { "model_benchmark_id": 1553, "benchmark_id": "multipl-e-mbpp", "model_id": "llama-3.1-70b-instruct", "score": 0.62, "normalized_score": 0.62, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.356043+00:00", "updated_at": "2025-07-19T19:56:14.356043+00:00", "benchmark_name": "Multipl-E MBPP" }, { "model_benchmark_id": 1566, "benchmark_id": "nexus", "model_id": "llama-3.1-70b-instruct", "score": 0.567, "normalized_score": 0.567, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.394299+00:00", "updated_at": "2025-07-19T19:56:14.394299+00:00", "benchmark_name": "Nexus" } ] ================================================ FILE: data/organizations/meta/models/llama-3.1-70b-instruct/model.json ================================================ { "model_id": "llama-3.1-70b-instruct", "name": "Llama 3.1 70B Instruct", "organization_id": "meta", "fine_tuned_from_model_id": null, "description": "Llama 3.1 70B Instruct is a large language model optimized for multilingual dialogue use cases. 
It outperforms many available open source and closed chat models on common industry benchmarks.", "release_date": "2024-07-23", "announcement_date": "2024-07-23", "license_id": "llama_3_1_community_license", "multimodal": false, "knowledge_cutoff": null, "param_count": 70000000000, "training_tokens": 15000000000000, "available_in_zeroeval": true, "source_api_ref": "https://ai.meta.com/llama/", "source_playground": null, "source_paper": "https://ai.meta.com/research/publications/llama-3-open-foundation-and-fine-tuned-chat-models/", "source_scorecard_blog_link": "https://ai.meta.com/blog/meta-llama-3-1/", "source_repo_link": "https://github.com/meta-llama/llama-models", "source_weights_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct", "created_at": "2025-07-19T19:49:05.575761+00:00", "updated_at": "2025-07-19T19:49:05.575761+00:00", "model_family_id": null } ================================================ FILE: data/organizations/meta/models/llama-3.1-8b-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 1561, "benchmark_id": "api-bank", "model_id": "llama-3.1-8b-instruct", "score": 0.826, "normalized_score": 0.826, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.380088+00:00", "updated_at": "2025-07-19T19:56:14.380088+00:00", "benchmark_name": "API-Bank" }, { "model_benchmark_id": 15, "benchmark_id": "arc-c", "model_id": "llama-3.1-8b-instruct", "score": 0.834, "normalized_score": 0.834, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.115810+00:00", "updated_at": "2025-07-19T19:56:11.115810+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 847, "benchmark_id": "bfcl", "model_id": "llama-3.1-8b-instruct", "score": 0.761, "normalized_score": 0.761, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.773659+00:00", "updated_at": "2025-07-19T19:56:12.773659+00:00", "benchmark_name": "BFCL" }, { "model_benchmark_id": 949, "benchmark_id": "drop", "model_id": "llama-3.1-8b-instruct", "score": 0.595, "normalized_score": 0.595, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2407.21783", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.003032+00:00", "updated_at": "2025-07-19T19:56:13.003032+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 1564, "benchmark_id": "gorilla-benchmark-api-bench", "model_id": "llama-3.1-8b-instruct", "score": 0.082, "normalized_score": 0.082, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.388429+00:00", "updated_at": "2025-07-19T19:56:14.388429+00:00", "benchmark_name": "Gorilla Benchmark API Bench" }, { "model_benchmark_id": 290, "benchmark_id": "gpqa", "model_id": 
"llama-3.1-8b-instruct", "score": 0.304, "normalized_score": 0.304, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.660952+00:00", "updated_at": "2025-07-19T19:56:11.660952+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1557, "benchmark_id": "gsm-8k-(cot)", "model_id": "llama-3.1-8b-instruct", "score": 0.845, "normalized_score": 0.845, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, "analysis_method": "8-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.364382+00:00", "updated_at": "2025-07-19T19:56:14.364382+00:00", "benchmark_name": "GSM-8K (CoT)" }, { "model_benchmark_id": 779, "benchmark_id": "humaneval", "model_id": "llama-3.1-8b-instruct", "score": 0.726, "normalized_score": 0.726, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.634981+00:00", "updated_at": "2025-07-19T19:56:12.634981+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 615, "benchmark_id": "ifeval", "model_id": "llama-3.1-8b-instruct", "score": 0.804, "normalized_score": 0.804, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, "analysis_method": "unspecified", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.268709+00:00", "updated_at": "2025-07-19T19:56:12.268709+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1559, "benchmark_id": "math-(cot)", "model_id": "llama-3.1-8b-instruct", "score": 0.519, "normalized_score": 0.519, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.373274+00:00", "updated_at": "2025-07-19T19:56:14.373274+00:00", "benchmark_name": "MATH (CoT)" }, { "model_benchmark_id": 1577, "benchmark_id": "mbpp-evalplus-(base)", "model_id": "llama-3.1-8b-instruct", "score": 0.728, "normalized_score": 0.728, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.424442+00:00", "updated_at": "2025-07-19T19:56:14.424442+00:00", "benchmark_name": "MBPP EvalPlus (base)" }, { "model_benchmark_id": 78, "benchmark_id": "mmlu", "model_id": "llama-3.1-8b-instruct", "score": 0.694, "normalized_score": 0.694, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.247675+00:00", "updated_at": "2025-07-19T19:56:11.247675+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1547, "benchmark_id": 
"mmlu-(cot)", "model_id": "llama-3.1-8b-instruct", "score": 0.73, "normalized_score": 0.73, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.337443+00:00", "updated_at": "2025-07-19T19:56:14.337443+00:00", "benchmark_name": "MMLU (CoT)" }, { "model_benchmark_id": 185, "benchmark_id": "mmlu-pro", "model_id": "llama-3.1-8b-instruct", "score": 0.483, "normalized_score": 0.483, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.457212+00:00", "updated_at": "2025-07-19T19:56:11.457212+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1571, "benchmark_id": "multilingual-mgsm-(cot)", "model_id": "llama-3.1-8b-instruct", "score": 0.689, "normalized_score": 0.689, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.407707+00:00", "updated_at": "2025-07-19T19:56:14.407707+00:00", "benchmark_name": "Multilingual MGSM (CoT)" }, { "model_benchmark_id": 1551, "benchmark_id": "multipl-e-humaneval", "model_id": "llama-3.1-8b-instruct", "score": 0.508, "normalized_score": 0.508, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, 
"analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.350301+00:00", "updated_at": "2025-07-19T19:56:14.350301+00:00", "benchmark_name": "Multipl-E HumanEval" }, { "model_benchmark_id": 1554, "benchmark_id": "multipl-e-mbpp", "model_id": "llama-3.1-8b-instruct", "score": 0.524, "normalized_score": 0.524, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.357886+00:00", "updated_at": "2025-07-19T19:56:14.357886+00:00", "benchmark_name": "Multipl-E MBPP" }, { "model_benchmark_id": 1567, "benchmark_id": "nexus", "model_id": "llama-3.1-8b-instruct", "score": 0.385, "normalized_score": 0.385, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.396611+00:00", "updated_at": "2025-07-19T19:56:14.396611+00:00", "benchmark_name": "Nexus" } ] ================================================ FILE: data/organizations/meta/models/llama-3.1-8b-instruct/model.json ================================================ { "model_id": "llama-3.1-8b-instruct", "name": "Llama 3.1 8B Instruct", "organization_id": "meta", "fine_tuned_from_model_id": null, "description": "Llama 3.1 8B Instruct is a multilingual large language model optimized for dialogue use cases. 
It features a 128K context length, state-of-the-art tool use, and strong reasoning capabilities.", "release_date": "2024-07-23", "announcement_date": "2024-07-23", "license_id": "llama_3_1_community_license", "multimodal": false, "knowledge_cutoff": "2023-12-31", "param_count": 8000000000, "training_tokens": 15000000000000, "available_in_zeroeval": true, "source_api_ref": "https://www.llama.com/", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://ai.meta.com/blog/meta-llama-3-1/", "source_repo_link": "https://github.com/meta-llama/llama-models", "source_weights_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct", "created_at": "2025-07-19T19:49:05.582878+00:00", "updated_at": "2025-07-19T19:49:05.582878+00:00", "model_family_id": null } ================================================ FILE: data/organizations/meta/models/llama-3.2-11b-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 1253, "benchmark_id": "ai2d", "model_id": "llama-3.2-11b-instruct", "score": 0.911, "normalized_score": 0.911, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", "verified_by_llmstats": false, "analysis_method": "Test accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.631448+00:00", "updated_at": "2025-07-19T19:56:13.631448+00:00", "benchmark_name": "AI2D" }, { "model_benchmark_id": 861, "benchmark_id": "chartqa", "model_id": "llama-3.2-11b-instruct", "score": 0.834, "normalized_score": 0.834, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", "verified_by_llmstats": false, "analysis_method": "Test, 0-shot CoT relaxed accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:12.801741+00:00", "updated_at": "2025-07-19T19:56:12.801741+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 883, "benchmark_id": "docvqa", "model_id": "llama-3.2-11b-instruct", "score": 0.884, "normalized_score": 0.884, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", "verified_by_llmstats": false, "analysis_method": "Test ANLS", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.839416+00:00", "updated_at": "2025-07-19T19:56:12.839416+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 292, "benchmark_id": "gpqa", "model_id": "llama-3.2-11b-instruct", "score": 0.328, "normalized_score": 0.328, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.663962+00:00", "updated_at": "2025-07-19T19:56:11.663962+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 395, "benchmark_id": "math", "model_id": "llama-3.2-11b-instruct", "score": 0.519, "normalized_score": 0.519, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.847598+00:00", "updated_at": "2025-07-19T19:56:11.847598+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 523, "benchmark_id": "mathvista", "model_id": "llama-3.2-11b-instruct", "score": 0.515, "normalized_score": 
0.515, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", "verified_by_llmstats": false, "analysis_method": "Test accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.086640+00:00", "updated_at": "2025-07-19T19:56:12.086640+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1284, "benchmark_id": "mgsm", "model_id": "llama-3.2-11b-instruct", "score": 0.689, "normalized_score": 0.689, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.690958+00:00", "updated_at": "2025-07-19T19:56:13.690958+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 80, "benchmark_id": "mmlu", "model_id": "llama-3.2-11b-instruct", "score": 0.73, "normalized_score": 0.73, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", "verified_by_llmstats": false, "analysis_method": "Macro average accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.251362+00:00", "updated_at": "2025-07-19T19:56:11.251362+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 566, "benchmark_id": "mmmu", "model_id": "llama-3.2-11b-instruct", "score": 0.507, "normalized_score": 0.507, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", "verified_by_llmstats": false, "analysis_method": "Val, 0-shot CoT, micro avg accuracy", "verification_provider_id": null, "verification_hardware": 
null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.164872+00:00", "updated_at": "2025-07-19T19:56:12.164872+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1530, "benchmark_id": "mmmu-pro", "model_id": "llama-3.2-11b-instruct", "score": 0.33, "normalized_score": 0.33, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", "verified_by_llmstats": false, "analysis_method": "Test accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.288730+00:00", "updated_at": "2025-07-19T19:56:14.288730+00:00", "benchmark_name": "MMMU-Pro" }, { "model_benchmark_id": 1580, "benchmark_id": "vqav2-(test)", "model_id": "llama-3.2-11b-instruct", "score": 0.752, "normalized_score": 0.752, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.434081+00:00", "updated_at": "2025-07-19T19:56:14.434081+00:00", "benchmark_name": "VQAv2 (test)" } ] ================================================ FILE: data/organizations/meta/models/llama-3.2-11b-instruct/model.json ================================================ { "model_id": "llama-3.2-11b-instruct", "name": "Llama 3.2 11B Instruct", "organization_id": "meta", "fine_tuned_from_model_id": null, "description": "Llama 3.2 11B Vision Instruct is an instruction-tuned multimodal large language model optimized for visual recognition, image reasoning, captioning, and answering general questions about an image. 
It accepts text and images as input and generates text as output.", "release_date": "2024-09-25", "announcement_date": "2024-09-25", "license_id": "llama_3_2_community_license", "multimodal": true, "knowledge_cutoff": "2023-12-31", "param_count": 10600000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/", "source_repo_link": "https://github.com/facebookresearch/llama", "source_weights_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", "created_at": "2025-07-19T19:49:05.588479+00:00", "updated_at": "2025-07-19T19:49:05.588479+00:00", "model_family_id": null } ================================================ FILE: data/organizations/meta/models/llama-3.2-3b-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 17, "benchmark_id": "arc-c", "model_id": "llama-3.2-3b-instruct", "score": 0.786, "normalized_score": 0.786, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, acc", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.120164+00:00", "updated_at": "2025-07-19T19:56:11.120164+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1583, "benchmark_id": "bfcl-v2", "model_id": "llama-3.2-3b-instruct", "score": 0.67, "normalized_score": 0.67, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, acc", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:14.446368+00:00", "updated_at": "2025-07-19T19:56:14.446368+00:00", "benchmark_name": "BFCL v2" }, { "model_benchmark_id": 293, "benchmark_id": "gpqa", "model_id": "llama-3.2-3b-instruct", "score": 0.328, "normalized_score": 0.328, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, acc", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.665423+00:00", "updated_at": "2025-07-19T19:56:11.665423+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 989, "benchmark_id": "gsm8k", "model_id": "llama-3.2-3b-instruct", "score": 0.777, "normalized_score": 0.777, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct", "verified_by_llmstats": false, "analysis_method": "8-shot, em_maj1@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.073210+00:00", "updated_at": "2025-07-19T19:56:13.073210+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 44, "benchmark_id": "hellaswag", "model_id": "llama-3.2-3b-instruct", "score": 0.698, "normalized_score": 0.698, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, acc", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.175473+00:00", "updated_at": "2025-07-19T19:56:11.175473+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 617, "benchmark_id": "ifeval", "model_id": "llama-3.2-3b-instruct", "score": 0.774, "normalized_score": 0.774, 
"is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct", "verified_by_llmstats": false, "analysis_method": "Avg(Prompt/Instruction acc Loose/Strict)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.272319+00:00", "updated_at": "2025-07-19T19:56:12.272319+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1589, "benchmark_id": "infinitebench-en.mc", "model_id": "llama-3.2-3b-instruct", "score": 0.633, "normalized_score": 0.633, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, longbook_choice/acc", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.464298+00:00", "updated_at": "2025-07-19T19:56:14.464298+00:00", "benchmark_name": "InfiniteBench/En.MC" }, { "model_benchmark_id": 1588, "benchmark_id": "infinitebench-en.qa", "model_id": "llama-3.2-3b-instruct", "score": 0.198, "normalized_score": 0.198, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, longbook_qa/f1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.460560+00:00", "updated_at": "2025-07-19T19:56:14.460560+00:00", "benchmark_name": "InfiniteBench/En.QA" }, { "model_benchmark_id": 396, "benchmark_id": "math", "model_id": "llama-3.2-3b-instruct", "score": 0.48, "normalized_score": 0.48, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, final_em", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.849582+00:00", "updated_at": "2025-07-19T19:56:11.849582+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1285, "benchmark_id": "mgsm", "model_id": "llama-3.2-3b-instruct", "score": 0.582, "normalized_score": 0.582, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct", "verified_by_llmstats": false, "analysis_method": "CoT, em", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.692573+00:00", "updated_at": "2025-07-19T19:56:13.692573+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 81, "benchmark_id": "mmlu", "model_id": "llama-3.2-3b-instruct", "score": 0.634, "normalized_score": 0.634, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct", "verified_by_llmstats": false, "analysis_method": "5-shot, macro_avg/acc", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.252797+00:00", "updated_at": "2025-07-19T19:56:11.252797+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1569, "benchmark_id": "nexus", "model_id": "llama-3.2-3b-instruct", "score": 0.343, "normalized_score": 0.343, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, macro_avg/acc", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.401027+00:00", "updated_at": "2025-07-19T19:56:14.401027+00:00", "benchmark_name": "Nexus" }, { "model_benchmark_id": 1590, "benchmark_id": 
"nih-multi-needle", "model_id": "llama-3.2-3b-instruct", "score": 0.847, "normalized_score": 0.847, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, recall", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.469424+00:00", "updated_at": "2025-07-19T19:56:14.469424+00:00", "benchmark_name": "NIH/Multi-needle" }, { "model_benchmark_id": 1581, "benchmark_id": "open-rewrite", "model_id": "llama-3.2-3b-instruct", "score": 0.401, "normalized_score": 0.401, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, micro_avg/rougeL", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.438526+00:00", "updated_at": "2025-07-19T19:56:14.438526+00:00", "benchmark_name": "Open-rewrite" }, { "model_benchmark_id": 1582, "benchmark_id": "tldr9+-(test)", "model_id": "llama-3.2-3b-instruct", "score": 0.19, "normalized_score": 0.19, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct", "verified_by_llmstats": false, "analysis_method": "1-shot, rougeL", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.443142+00:00", "updated_at": "2025-07-19T19:56:14.443142+00:00", "benchmark_name": "TLDR9+ (test)" } ] ================================================ FILE: data/organizations/meta/models/llama-3.2-3b-instruct/model.json ================================================ { "model_id": "llama-3.2-3b-instruct", "name": "Llama 3.2 3B Instruct", "organization_id": "meta", 
"fine_tuned_from_model_id": null, "description": "Llama 3.2 3B Instruct is a large language model that supports a context length of 128K tokens and are state-of-the-art in their class for on-device use cases like summarization, instruction following, and rewriting tasks running locally at the edge.", "release_date": "2024-09-25", "announcement_date": "2024-09-25", "license_id": "llama_3_2_community_license", "multimodal": false, "knowledge_cutoff": null, "param_count": 3210000000, "training_tokens": 9000000000000, "available_in_zeroeval": true, "source_api_ref": "https://github.com/meta-llama/llama-models", "source_playground": "https://llama.meta.com/llama-downloads", "source_paper": null, "source_scorecard_blog_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/", "source_repo_link": "https://github.com/meta-llama/llama-models", "source_weights_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct", "created_at": "2025-07-19T19:49:05.591372+00:00", "updated_at": "2025-07-19T19:49:05.591372+00:00", "model_family_id": null } ================================================ FILE: data/organizations/meta/models/llama-3.2-90b-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 1252, "benchmark_id": "ai2d", "model_id": "llama-3.2-90b-instruct", "score": 0.923, "normalized_score": 0.923, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.629735+00:00", "updated_at": "2025-07-19T19:56:13.629735+00:00", "benchmark_name": "AI2D" }, { "model_benchmark_id": 860, "benchmark_id": "chartqa", "model_id": "llama-3.2-90b-instruct", "score": 0.855, "normalized_score": 0.855, "is_self_reported": true, 
"self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.799861+00:00", "updated_at": "2025-07-19T19:56:12.799861+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 882, "benchmark_id": "docvqa", "model_id": "llama-3.2-90b-instruct", "score": 0.901, "normalized_score": 0.901, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.837654+00:00", "updated_at": "2025-07-19T19:56:12.837654+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 289, "benchmark_id": "gpqa", "model_id": "llama-3.2-90b-instruct", "score": 0.467, "normalized_score": 0.467, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.659193+00:00", "updated_at": "2025-07-19T19:56:11.659193+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1576, "benchmark_id": "infographicsqa", "model_id": "llama-3.2-90b-instruct", "score": 0.568, "normalized_score": 0.568, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:14.420214+00:00", "updated_at": "2025-07-19T19:56:14.420214+00:00", "benchmark_name": "InfographicsQA" }, { "model_benchmark_id": 393, "benchmark_id": "math", "model_id": "llama-3.2-90b-instruct", "score": 0.68, "normalized_score": 0.68, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.844378+00:00", "updated_at": "2025-07-19T19:56:11.844378+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 522, "benchmark_id": "mathvista", "model_id": "llama-3.2-90b-instruct", "score": 0.573, "normalized_score": 0.573, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.084321+00:00", "updated_at": "2025-07-19T19:56:12.084321+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1283, "benchmark_id": "mgsm", "model_id": "llama-3.2-90b-instruct", "score": 0.869, "normalized_score": 0.869, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.688987+00:00", "updated_at": "2025-07-19T19:56:13.688987+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 77, "benchmark_id": "mmlu", "model_id": "llama-3.2-90b-instruct", 
"score": 0.86, "normalized_score": 0.86, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.245688+00:00", "updated_at": "2025-07-19T19:56:11.245688+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 565, "benchmark_id": "mmmu", "model_id": "llama-3.2-90b-instruct", "score": 0.603, "normalized_score": 0.603, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.162828+00:00", "updated_at": "2025-07-19T19:56:12.162828+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1529, "benchmark_id": "mmmu-pro", "model_id": "llama-3.2-90b-instruct", "score": 0.452, "normalized_score": 0.452, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.287214+00:00", "updated_at": "2025-07-19T19:56:14.287214+00:00", "benchmark_name": "MMMU-Pro" }, { "model_benchmark_id": 908, "benchmark_id": "textvqa", "model_id": "llama-3.2-90b-instruct", "score": 0.735, "normalized_score": 0.735, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/", "verified_by_llmstats": false, "analysis_method": "-", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.892927+00:00", "updated_at": "2025-07-19T19:56:12.892927+00:00", "benchmark_name": "TextVQA" }, { "model_benchmark_id": 1573, "benchmark_id": "vqav2", "model_id": "llama-3.2-90b-instruct", "score": 0.781, "normalized_score": 0.781, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.412800+00:00", "updated_at": "2025-07-19T19:56:14.412800+00:00", "benchmark_name": "VQAv2" } ] ================================================ FILE: data/organizations/meta/models/llama-3.2-90b-instruct/model.json ================================================ { "model_id": "llama-3.2-90b-instruct", "name": "Llama 3.2 90B Instruct", "organization_id": "meta", "fine_tuned_from_model_id": null, "description": "Llama 3.2 90B is a large multimodal language model optimized for visual recognition, image reasoning, and captioning tasks. 
It supports a context length of 128,000 tokens and is designed for deployment on edge and mobile devices, offering state-of-the-art performance in image understanding and generative tasks.", "release_date": "2024-09-25", "announcement_date": "2024-09-25", "license_id": "llama3_2", "multimodal": true, "knowledge_cutoff": null, "param_count": 90000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/", "source_repo_link": "https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct", "source_weights_link": "https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct", "created_at": "2025-07-19T19:49:05.579590+00:00", "updated_at": "2025-07-19T19:49:05.579590+00:00", "model_family_id": null } ================================================ FILE: data/organizations/meta/models/llama-3.3-70b-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 1584, "benchmark_id": "bfcl-v2", "model_id": "llama-3.3-70b-instruct", "score": 0.773, "normalized_score": 0.773, "is_self_reported": true, "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.448863+00:00", "updated_at": "2025-07-19T19:56:14.448863+00:00", "benchmark_name": "BFCL v2" }, { "model_benchmark_id": 296, "benchmark_id": "gpqa", "model_id": "llama-3.3-70b-instruct", "score": 0.505, "normalized_score": 0.505, "is_self_reported": true, "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md", 
"verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.669923+00:00", "updated_at": "2025-07-19T19:56:11.669923+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 781, "benchmark_id": "humaneval", "model_id": "llama-3.3-70b-instruct", "score": 0.884, "normalized_score": 0.884, "is_self_reported": true, "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.637990+00:00", "updated_at": "2025-07-19T19:56:12.637990+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 618, "benchmark_id": "ifeval", "model_id": "llama-3.3-70b-instruct", "score": 0.921, "normalized_score": 0.921, "is_self_reported": true, "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.274109+00:00", "updated_at": "2025-07-19T19:56:12.274109+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 399, "benchmark_id": "math", "model_id": "llama-3.3-70b-instruct", "score": 0.77, "normalized_score": 0.77, "is_self_reported": true, "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.854268+00:00", 
"updated_at": "2025-07-19T19:56:11.854268+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1579, "benchmark_id": "mbpp-evalplus", "model_id": "llama-3.3-70b-instruct", "score": 0.876, "normalized_score": 0.876, "is_self_reported": true, "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.429699+00:00", "updated_at": "2025-07-19T19:56:14.429699+00:00", "benchmark_name": "MBPP EvalPlus" }, { "model_benchmark_id": 1288, "benchmark_id": "mgsm", "model_id": "llama-3.3-70b-instruct", "score": 0.911, "normalized_score": 0.911, "is_self_reported": true, "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.697414+00:00", "updated_at": "2025-07-19T19:56:13.697414+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 84, "benchmark_id": "mmlu", "model_id": "llama-3.3-70b-instruct", "score": 0.86, "normalized_score": 0.86, "is_self_reported": true, "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.259963+00:00", "updated_at": "2025-07-19T19:56:11.259963+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 189, "benchmark_id": "mmlu-pro", "model_id": "llama-3.3-70b-instruct", "score": 0.689, "normalized_score": 0.689, "is_self_reported": 
true, "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.463251+00:00", "updated_at": "2025-07-19T19:56:11.463251+00:00", "benchmark_name": "MMLU-Pro" } ] ================================================ FILE: data/organizations/meta/models/llama-3.3-70b-instruct/model.json ================================================ { "model_id": "llama-3.3-70b-instruct", "name": "Llama 3.3 70B Instruct", "organization_id": "meta", "fine_tuned_from_model_id": null, "description": "Llama 3.3 is a multilingual large language model optimized for dialogue use cases across multiple languages. It is a pretrained and instruction-tuned generative model with 70 billion parameters, outperforming many open-source and closed chat models on common industry benchmarks. 
Llama 3.3 supports a context length of 128,000 tokens and is designed for commercial and research use in multiple languages.", "release_date": "2024-12-06", "announcement_date": "2024-12-06", "license_id": "llama_3_3_community_license_agreement", "multimodal": false, "knowledge_cutoff": null, "param_count": 70000000000, "training_tokens": 15000000000000, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct", "source_playground": "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct", "source_paper": null, "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md", "source_weights_link": "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct", "created_at": "2025-07-19T19:49:05.603412+00:00", "updated_at": "2025-07-19T19:49:05.603412+00:00", "model_family_id": null } ================================================ FILE: data/organizations/meta/models/llama-4-maverick/benchmarks.json ================================================ [ { "model_benchmark_id": 862, "benchmark_id": "chartqa", "model_id": "llama-4-maverick", "score": 0.9, "normalized_score": 0.9, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.803334+00:00", "updated_at": "2025-07-19T19:56:12.803334+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 884, "benchmark_id": "docvqa", "model_id": "llama-4-maverick", "score": 0.944, "normalized_score": 0.944, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.841331+00:00", "updated_at": "2025-07-19T19:56:12.841331+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 294, "benchmark_id": "gpqa", "model_id": "llama-4-maverick", "score": 0.698, "normalized_score": 0.698, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.666983+00:00", "updated_at": "2025-07-19T19:56:11.666983+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1115, "benchmark_id": "livecodebench", "model_id": "llama-4-maverick", "score": 0.434, "normalized_score": 0.434, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.326624+00:00", "updated_at": "2025-07-19T19:56:13.326624+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 397, "benchmark_id": "math", "model_id": "llama-4-maverick", "score": 0.612, "normalized_score": 0.612, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", "verified_by_llmstats": false, "analysis_method": "4-shot em_maj1@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.851038+00:00", "updated_at": "2025-07-19T19:56:11.851038+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 524, "benchmark_id": "mathvista", "model_id": 
"llama-4-maverick", "score": 0.737, "normalized_score": 0.737, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.088308+00:00", "updated_at": "2025-07-19T19:56:12.088308+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1179, "benchmark_id": "mbpp", "model_id": "llama-4-maverick", "score": 0.776, "normalized_score": 0.776, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", "verified_by_llmstats": false, "analysis_method": "3-shot pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.485323+00:00", "updated_at": "2025-07-19T19:56:13.485323+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1286, "benchmark_id": "mgsm", "model_id": "llama-4-maverick", "score": 0.923, "normalized_score": 0.923, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.694238+00:00", "updated_at": "2025-07-19T19:56:13.694238+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 82, "benchmark_id": "mmlu", "model_id": "llama-4-maverick", "score": 0.855, "normalized_score": 0.855, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", "verified_by_llmstats": false, "analysis_method": "5-shot macro_avg/acc_char", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.254352+00:00", "updated_at": "2025-07-19T19:56:11.254352+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 187, "benchmark_id": "mmlu-pro", "model_id": "llama-4-maverick", "score": 0.805, "normalized_score": 0.805, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.460210+00:00", "updated_at": "2025-07-19T19:56:11.460210+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 567, "benchmark_id": "mmmu", "model_id": "llama-4-maverick", "score": 0.734, "normalized_score": 0.734, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.167124+00:00", "updated_at": "2025-07-19T19:56:12.167124+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1531, "benchmark_id": "mmmu-pro", "model_id": "llama-4-maverick", "score": 0.596, "normalized_score": 0.596, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.290598+00:00", "updated_at": "2025-07-19T19:56:14.290598+00:00", "benchmark_name": "MMMU-Pro" }, { "model_benchmark_id": 1591, "benchmark_id": "tydiqa", "model_id": "llama-4-maverick", 
"score": 0.317, "normalized_score": 0.317, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", "verified_by_llmstats": false, "analysis_method": "1-shot average/f1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.475429+00:00", "updated_at": "2025-07-19T19:56:14.475429+00:00", "benchmark_name": "TydiQA" } ] ================================================ FILE: data/organizations/meta/models/llama-4-maverick/model.json ================================================ { "model_id": "llama-4-maverick", "name": "Llama 4 Maverick", "organization_id": "meta", "fine_tuned_from_model_id": null, "description": "Llama 4 Maverick is a natively multimodal model capable of processing both text and images. It features a 17 billion active parameter mixture-of-experts (MoE) architecture with 128 experts, supporting a wide range of multimodal tasks such as conversational interaction, image analysis, and code generation. 
The model includes a 1 million token context window.", "release_date": "2025-04-05", "announcement_date": "2025-04-05", "license_id": "llama_4_community_license_agreement", "multimodal": true, "knowledge_cutoff": null, "param_count": 400000000000, "training_tokens": 22000000000000, "available_in_zeroeval": true, "source_api_ref": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/", "source_playground": "https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct", "source_paper": null, "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/meta-llama/llama-models/tree/main/models/llama4", "source_weights_link": "https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct", "created_at": "2025-07-19T19:49:05.595636+00:00", "updated_at": "2025-07-19T19:49:05.595636+00:00", "model_family_id": null } ================================================ FILE: data/organizations/meta/models/llama-4-scout/benchmarks.json ================================================ [ { "model_benchmark_id": 863, "benchmark_id": "chartqa", "model_id": "llama-4-scout", "score": 0.888, "normalized_score": 0.888, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.804916+00:00", "updated_at": "2025-07-19T19:56:12.804916+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 885, "benchmark_id": "docvqa", "model_id": "llama-4-scout", "score": 0.944, "normalized_score": 0.944, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/", "verified_by_llmstats": false, "analysis_method": "0-shot (ANLS)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:12.842838+00:00", "updated_at": "2025-07-19T19:56:12.842838+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 295, "benchmark_id": "gpqa", "model_id": "llama-4-scout", "score": 0.572, "normalized_score": 0.572, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot (accuracy)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.668436+00:00", "updated_at": "2025-07-19T19:56:11.668436+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1116, "benchmark_id": "livecodebench", "model_id": "llama-4-scout", "score": 0.328, "normalized_score": 0.328, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.328074+00:00", "updated_at": "2025-07-19T19:56:13.328074+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 398, "benchmark_id": "math", "model_id": "llama-4-scout", "score": 0.503, "normalized_score": 0.503, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", "verified_by_llmstats": false, "analysis_method": "4-shot em_maj1@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.852669+00:00", "updated_at": "2025-07-19T19:56:11.852669+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 525, "benchmark_id": "mathvista", "model_id": "llama-4-scout", "score": 0.707, "normalized_score": 0.707, 
"is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.089981+00:00", "updated_at": "2025-07-19T19:56:12.089981+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1180, "benchmark_id": "mbpp", "model_id": "llama-4-scout", "score": 0.678, "normalized_score": 0.678, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", "verified_by_llmstats": false, "analysis_method": "3-shot pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.487376+00:00", "updated_at": "2025-07-19T19:56:13.487376+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1287, "benchmark_id": "mgsm", "model_id": "llama-4-scout", "score": 0.906, "normalized_score": 0.906, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", "verified_by_llmstats": false, "analysis_method": "0-shot (average/em)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.695659+00:00", "updated_at": "2025-07-19T19:56:13.695659+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 83, "benchmark_id": "mmlu", "model_id": "llama-4-scout", "score": 0.796, "normalized_score": 0.796, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", "verified_by_llmstats": false, "analysis_method": "5-shot macro_avg/acc_char", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:11.258246+00:00", "updated_at": "2025-07-19T19:56:11.258246+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 188, "benchmark_id": "mmlu-pro", "model_id": "llama-4-scout", "score": 0.743, "normalized_score": 0.743, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/", "verified_by_llmstats": false, "analysis_method": "0-shot (macro_avg/acc)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.461726+00:00", "updated_at": "2025-07-19T19:56:11.461726+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 568, "benchmark_id": "mmmu", "model_id": "llama-4-scout", "score": 0.694, "normalized_score": 0.694, "is_self_reported": true, "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.169227+00:00", "updated_at": "2025-07-19T19:56:12.169227+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1592, "benchmark_id": "tydiqa", "model_id": "llama-4-scout", "score": 0.315, "normalized_score": 0.315, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", "verified_by_llmstats": false, "analysis_method": "1-shot average/f1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.477364+00:00", "updated_at": "2025-07-19T19:56:14.477364+00:00", "benchmark_name": "TydiQA" } ] ================================================ FILE: data/organizations/meta/models/llama-4-scout/model.json 
================================================ { "model_id": "llama-4-scout", "name": "Llama 4 Scout", "organization_id": "meta", "fine_tuned_from_model_id": null, "description": "Llama 4 Scout is a natively multimodal model capable of processing both text and images. It features a 17 billion activated parameter (109B total) mixture-of-experts (MoE) architecture with 16 experts, supporting a wide range of multimodal tasks such as conversational interaction, image analysis, and code generation. The model includes a 10 million token context window.", "release_date": "2025-04-05", "announcement_date": "2025-04-05", "license_id": "llama_4_community_license_agreement", "multimodal": true, "knowledge_cutoff": null, "param_count": 109000000000, "training_tokens": 40000000000000, "available_in_zeroeval": true, "source_api_ref": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/", "source_playground": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", "source_paper": null, "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/meta-llama/llama-models/tree/main/models/llama4", "source_weights_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", "created_at": "2025-07-19T19:49:05.599841+00:00", "updated_at": "2025-07-19T19:49:05.599841+00:00", "model_family_id": null } ================================================ FILE: data/organizations/meta/organization.json ================================================ { "organization_id": "meta", "name": "Meta", "website": "https://meta.com", "description": "Social media company with AI research", "country": "US", "created_at": "2025-07-19T19:49:05.572641+00:00", "updated_at": "2025-07-19T19:49:05.572641+00:00" } ================================================ FILE: data/organizations/microsoft/models/phi-3.5-mini-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 13, "benchmark_id": "arc-c", "model_id": 
"phi-3.5-mini-instruct", "score": 0.846, "normalized_score": 0.846, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "10-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.111398+00:00", "updated_at": "2025-07-19T19:56:11.111398+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1448, "benchmark_id": "arena-hard", "model_id": "phi-3.5-mini-instruct", "score": 0.37, "normalized_score": 0.37, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.088299+00:00", "updated_at": "2025-07-19T19:56:14.088299+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 1078, "benchmark_id": "big-bench-hard", "model_id": "phi-3.5-mini-instruct", "score": 0.69, "normalized_score": 0.69, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "0-shot chain-of-thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.245591+00:00", "updated_at": "2025-07-19T19:56:13.245591+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 1025, "benchmark_id": "boolq", "model_id": "phi-3.5-mini-instruct", "score": 0.78, "normalized_score": 0.78, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "2-shot", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.132882+00:00", "updated_at": "2025-07-19T19:56:13.132882+00:00", "benchmark_name": "BoolQ" }, { "model_benchmark_id": 1504, "benchmark_id": "govreport", "model_id": "phi-3.5-mini-instruct", "score": 0.259, "normalized_score": 0.259, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.222697+00:00", "updated_at": "2025-07-19T19:56:14.222697+00:00", "benchmark_name": "GovReport" }, { "model_benchmark_id": 285, "benchmark_id": "gpqa", "model_id": "phi-3.5-mini-instruct", "score": 0.304, "normalized_score": 0.304, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "0-shot chain-of-thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.651230+00:00", "updated_at": "2025-07-19T19:56:11.651230+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 987, "benchmark_id": "gsm8k", "model_id": "phi-3.5-mini-instruct", "score": 0.862, "normalized_score": 0.862, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "8-shot chain-of-thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.070240+00:00", "updated_at": "2025-07-19T19:56:13.070240+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 43, "benchmark_id": "hellaswag", 
"model_id": "phi-3.5-mini-instruct", "score": 0.694, "normalized_score": 0.694, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.173447+00:00", "updated_at": "2025-07-19T19:56:11.173447+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 777, "benchmark_id": "humaneval", "model_id": "phi-3.5-mini-instruct", "score": 0.628, "normalized_score": 0.628, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.631199+00:00", "updated_at": "2025-07-19T19:56:12.631199+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 392, "benchmark_id": "math", "model_id": "phi-3.5-mini-instruct", "score": 0.485, "normalized_score": 0.485, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "0-shot chain-of-thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.842901+00:00", "updated_at": "2025-07-19T19:56:11.842901+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1178, "benchmark_id": "mbpp", "model_id": "phi-3.5-mini-instruct", "score": 0.696, "normalized_score": 0.696, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "3-shot", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.481045+00:00", "updated_at": "2025-07-19T19:56:13.481045+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1494, "benchmark_id": "mega-mlqa", "model_id": "phi-3.5-mini-instruct", "score": 0.617, "normalized_score": 0.617, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.191909+00:00", "updated_at": "2025-07-19T19:56:14.191909+00:00", "benchmark_name": "MEGA MLQA" }, { "model_benchmark_id": 1496, "benchmark_id": "mega-tydi-qa", "model_id": "phi-3.5-mini-instruct", "score": 0.622, "normalized_score": 0.622, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.197084+00:00", "updated_at": "2025-07-19T19:56:14.197084+00:00", "benchmark_name": "MEGA TyDi QA" }, { "model_benchmark_id": 1498, "benchmark_id": "mega-udpos", "model_id": "phi-3.5-mini-instruct", "score": 0.465, "normalized_score": 0.465, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.203616+00:00", "updated_at": "2025-07-19T19:56:14.203616+00:00", "benchmark_name": "MEGA UDPOS" }, { "model_benchmark_id": 1500, 
"benchmark_id": "mega-xcopa", "model_id": "phi-3.5-mini-instruct", "score": 0.631, "normalized_score": 0.631, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.210364+00:00", "updated_at": "2025-07-19T19:56:14.210364+00:00", "benchmark_name": "MEGA XCOPA" }, { "model_benchmark_id": 1502, "benchmark_id": "mega-xstorycloze", "model_id": "phi-3.5-mini-instruct", "score": 0.735, "normalized_score": 0.735, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.217597+00:00", "updated_at": "2025-07-19T19:56:14.217597+00:00", "benchmark_name": "MEGA XStoryCloze" }, { "model_benchmark_id": 1282, "benchmark_id": "mgsm", "model_id": "phi-3.5-mini-instruct", "score": 0.479, "normalized_score": 0.479, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "0-shot chain-of-thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.687534+00:00", "updated_at": "2025-07-19T19:56:13.687534+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 75, "benchmark_id": "mmlu", "model_id": "phi-3.5-mini-instruct", "score": 0.69, "normalized_score": 0.69, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, 
"analysis_method": "5-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.240966+00:00", "updated_at": "2025-07-19T19:56:11.240966+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 180, "benchmark_id": "mmlu-pro", "model_id": "phi-3.5-mini-instruct", "score": 0.474, "normalized_score": 0.474, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "0-shot chain-of-thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.447960+00:00", "updated_at": "2025-07-19T19:56:11.450171+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1476, "benchmark_id": "mmmlu", "model_id": "phi-3.5-mini-instruct", "score": 0.554, "normalized_score": 0.554, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.148935+00:00", "updated_at": "2025-07-19T19:56:14.148935+00:00", "benchmark_name": "MMMLU" }, { "model_benchmark_id": 1471, "benchmark_id": "openbookqa", "model_id": "phi-3.5-mini-instruct", "score": 0.792, "normalized_score": 0.792, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "10-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.136354+00:00", "updated_at": "2025-07-19T19:56:14.136354+00:00", "benchmark_name": "OpenBookQA" 
}, { "model_benchmark_id": 1034, "benchmark_id": "piqa", "model_id": "phi-3.5-mini-instruct", "score": 0.81, "normalized_score": 0.81, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.154444+00:00", "updated_at": "2025-07-19T19:56:13.154444+00:00", "benchmark_name": "PIQA" }, { "model_benchmark_id": 1488, "benchmark_id": "qasper", "model_id": "phi-3.5-mini-instruct", "score": 0.419, "normalized_score": 0.419, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.173290+00:00", "updated_at": "2025-07-19T19:56:14.173290+00:00", "benchmark_name": "Qasper" }, { "model_benchmark_id": 1506, "benchmark_id": "qmsum", "model_id": "phi-3.5-mini-instruct", "score": 0.213, "normalized_score": 0.213, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.228389+00:00", "updated_at": "2025-07-19T19:56:14.228389+00:00", "benchmark_name": "QMSum" }, { "model_benchmark_id": 1492, "benchmark_id": "repoqa", "model_id": "phi-3.5-mini-instruct", "score": 0.77, "normalized_score": 0.77, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": 
"average", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.186426+00:00", "updated_at": "2025-07-19T19:56:14.186426+00:00", "benchmark_name": "RepoQA" }, { "model_benchmark_id": 1490, "benchmark_id": "ruler", "model_id": "phi-3.5-mini-instruct", "score": 0.841, "normalized_score": 0.841, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "128k", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.179307+00:00", "updated_at": "2025-07-19T19:56:14.179307+00:00", "benchmark_name": "RULER" }, { "model_benchmark_id": 1043, "benchmark_id": "social-iqa", "model_id": "phi-3.5-mini-instruct", "score": 0.747, "normalized_score": 0.747, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.177860+00:00", "updated_at": "2025-07-19T19:56:13.177860+00:00", "benchmark_name": "Social IQa" }, { "model_benchmark_id": 825, "benchmark_id": "squality", "model_id": "phi-3.5-mini-instruct", "score": 0.243, "normalized_score": 0.243, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.722570+00:00", "updated_at": "2025-07-19T19:56:12.722570+00:00", "benchmark_name": "SQuALITY" }, { "model_benchmark_id": 1508, 
"benchmark_id": "summscreenfd", "model_id": "phi-3.5-mini-instruct", "score": 0.16, "normalized_score": 0.16, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.234498+00:00", "updated_at": "2025-07-19T19:56:14.234498+00:00", "benchmark_name": "SummScreenFD" }, { "model_benchmark_id": 134, "benchmark_id": "truthfulqa", "model_id": "phi-3.5-mini-instruct", "score": 0.64, "normalized_score": 0.64, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "10-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.346508+00:00", "updated_at": "2025-07-19T19:56:11.346508+00:00", "benchmark_name": "TruthfulQA" }, { "model_benchmark_id": 1063, "benchmark_id": "winogrande", "model_id": "phi-3.5-mini-instruct", "score": 0.685, "normalized_score": 0.685, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.217697+00:00", "updated_at": "2025-07-19T19:56:13.217697+00:00", "benchmark_name": "Winogrande" } ] ================================================ FILE: data/organizations/microsoft/models/phi-3.5-mini-instruct/model.json ================================================ { "model_id": "phi-3.5-mini-instruct", "name": "Phi-3.5-mini-instruct", "organization_id": "microsoft", "fine_tuned_from_model_id": null, 
"description": "Phi-3.5-mini-instruct is a 3.8B-parameter model that supports up to 128K context tokens, with improved multilingual capabilities across over 20 languages. It underwent additional training and safety post-training to enhance instruction-following, reasoning, math, and code generation. Ideal for environments with memory or latency constraints, it uses an MIT license.", "release_date": "2024-08-23", "announcement_date": "2024-08-23", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 3800000000, "training_tokens": 3400000000000, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "source_playground": null, "source_paper": "https://arxiv.org/abs/2404.14219", "source_scorecard_blog_link": "https://techcommunity.microsoft.com/blog/azure-ai-services-blog/discover-the-new-multi-lingual-high-quality-phi-3-5-slms/4225280", "source_repo_link": null, "source_weights_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct", "created_at": "2025-07-19T19:49:05.559796+00:00", "updated_at": "2025-07-19T19:49:05.559796+00:00", "model_family_id": null } ================================================ FILE: data/organizations/microsoft/models/phi-3.5-moe-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 12, "benchmark_id": "arc-c", "model_id": "phi-3.5-moe-instruct", "score": 0.91, "normalized_score": 0.91, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "10-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.108027+00:00", "updated_at": "2025-07-19T19:56:11.108027+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1447, "benchmark_id": "arena-hard", "model_id": "phi-3.5-moe-instruct", "score": 
0.379, "normalized_score": 0.379, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.086453+00:00", "updated_at": "2025-07-19T19:56:14.086453+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 1077, "benchmark_id": "big-bench-hard", "model_id": "phi-3.5-moe-instruct", "score": 0.791, "normalized_score": 0.791, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "0-shot chain-of-thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.244054+00:00", "updated_at": "2025-07-19T19:56:13.244054+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 1024, "benchmark_id": "boolq", "model_id": "phi-3.5-moe-instruct", "score": 0.846, "normalized_score": 0.846, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "2-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.130867+00:00", "updated_at": "2025-07-19T19:56:13.130867+00:00", "benchmark_name": "BoolQ" }, { "model_benchmark_id": 1503, "benchmark_id": "govreport", "model_id": "phi-3.5-moe-instruct", "score": 0.264, "normalized_score": 0.264, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": 
null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.221191+00:00", "updated_at": "2025-07-19T19:56:14.221191+00:00", "benchmark_name": "GovReport" }, { "model_benchmark_id": 284, "benchmark_id": "gpqa", "model_id": "phi-3.5-moe-instruct", "score": 0.368, "normalized_score": 0.368, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "0-shot chain-of-thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.649286+00:00", "updated_at": "2025-07-19T19:56:11.649286+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 986, "benchmark_id": "gsm8k", "model_id": "phi-3.5-moe-instruct", "score": 0.887, "normalized_score": 0.887, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "8-shot chain-of-thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.068601+00:00", "updated_at": "2025-07-19T19:56:13.068601+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 42, "benchmark_id": "hellaswag", "model_id": "phi-3.5-moe-instruct", "score": 0.838, "normalized_score": 0.838, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.171621+00:00", "updated_at": "2025-07-19T19:56:11.171621+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 776, "benchmark_id": "humaneval", "model_id": "phi-3.5-moe-instruct", "score": 
0.707, "normalized_score": 0.707, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.629465+00:00", "updated_at": "2025-07-19T19:56:12.629465+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 391, "benchmark_id": "math", "model_id": "phi-3.5-moe-instruct", "score": 0.595, "normalized_score": 0.595, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "0-shot chain-of-thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.841295+00:00", "updated_at": "2025-07-19T19:56:11.841295+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1177, "benchmark_id": "mbpp", "model_id": "phi-3.5-moe-instruct", "score": 0.808, "normalized_score": 0.808, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "3-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.479387+00:00", "updated_at": "2025-07-19T19:56:13.479387+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1493, "benchmark_id": "mega-mlqa", "model_id": "phi-3.5-moe-instruct", "score": 0.653, "normalized_score": 0.653, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:14.190086+00:00", "updated_at": "2025-07-19T19:56:14.190086+00:00", "benchmark_name": "MEGA MLQA" }, { "model_benchmark_id": 1495, "benchmark_id": "mega-tydi-qa", "model_id": "phi-3.5-moe-instruct", "score": 0.671, "normalized_score": 0.671, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.195123+00:00", "updated_at": "2025-07-19T19:56:14.195123+00:00", "benchmark_name": "MEGA TyDi QA" }, { "model_benchmark_id": 1497, "benchmark_id": "mega-udpos", "model_id": "phi-3.5-moe-instruct", "score": 0.604, "normalized_score": 0.604, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.201497+00:00", "updated_at": "2025-07-19T19:56:14.201497+00:00", "benchmark_name": "MEGA UDPOS" }, { "model_benchmark_id": 1499, "benchmark_id": "mega-xcopa", "model_id": "phi-3.5-moe-instruct", "score": 0.766, "normalized_score": 0.766, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.208476+00:00", "updated_at": "2025-07-19T19:56:14.208476+00:00", "benchmark_name": "MEGA XCOPA" }, { "model_benchmark_id": 1501, "benchmark_id": "mega-xstorycloze", "model_id": 
"phi-3.5-moe-instruct", "score": 0.828, "normalized_score": 0.828, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.214764+00:00", "updated_at": "2025-07-19T19:56:14.214764+00:00", "benchmark_name": "MEGA XStoryCloze" }, { "model_benchmark_id": 1281, "benchmark_id": "mgsm", "model_id": "phi-3.5-moe-instruct", "score": 0.587, "normalized_score": 0.587, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "0-shot chain-of-thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.686017+00:00", "updated_at": "2025-07-19T19:56:13.686017+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 74, "benchmark_id": "mmlu", "model_id": "phi-3.5-moe-instruct", "score": 0.789, "normalized_score": 0.789, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.239087+00:00", "updated_at": "2025-07-19T19:56:11.239087+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 178, "benchmark_id": "mmlu-pro", "model_id": "phi-3.5-moe-instruct", "score": 0.453, "normalized_score": 0.453, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.444580+00:00", "updated_at": "2025-07-19T19:56:11.446076+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1475, "benchmark_id": "mmmlu", "model_id": "phi-3.5-moe-instruct", "score": 0.699, "normalized_score": 0.699, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.147234+00:00", "updated_at": "2025-07-19T19:56:14.147234+00:00", "benchmark_name": "MMMLU" }, { "model_benchmark_id": 1470, "benchmark_id": "openbookqa", "model_id": "phi-3.5-moe-instruct", "score": 0.896, "normalized_score": 0.896, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "10-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.134275+00:00", "updated_at": "2025-07-19T19:56:14.134275+00:00", "benchmark_name": "OpenBookQA" }, { "model_benchmark_id": 1033, "benchmark_id": "piqa", "model_id": "phi-3.5-moe-instruct", "score": 0.886, "normalized_score": 0.886, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.152199+00:00", "updated_at": "2025-07-19T19:56:13.152199+00:00", "benchmark_name": "PIQA" }, { "model_benchmark_id": 1487, "benchmark_id": "qasper", "model_id": "phi-3.5-moe-instruct", 
"score": 0.4, "normalized_score": 0.4, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.171579+00:00", "updated_at": "2025-07-19T19:56:14.171579+00:00", "benchmark_name": "Qasper" }, { "model_benchmark_id": 1505, "benchmark_id": "qmsum", "model_id": "phi-3.5-moe-instruct", "score": 0.199, "normalized_score": 0.199, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.226358+00:00", "updated_at": "2025-07-19T19:56:14.226358+00:00", "benchmark_name": "QMSum" }, { "model_benchmark_id": 1491, "benchmark_id": "repoqa", "model_id": "phi-3.5-moe-instruct", "score": 0.85, "normalized_score": 0.85, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "average", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.184432+00:00", "updated_at": "2025-07-19T19:56:14.184432+00:00", "benchmark_name": "RepoQA" }, { "model_benchmark_id": 1489, "benchmark_id": "ruler", "model_id": "phi-3.5-moe-instruct", "score": 0.871, "normalized_score": 0.871, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "long context (128K) evaluation", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.177557+00:00", "updated_at": "2025-07-19T19:56:14.177557+00:00", "benchmark_name": "RULER" }, { "model_benchmark_id": 1042, "benchmark_id": "social-iqa", "model_id": "phi-3.5-moe-instruct", "score": 0.78, "normalized_score": 0.78, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.176106+00:00", "updated_at": "2025-07-19T19:56:13.176106+00:00", "benchmark_name": "Social IQa" }, { "model_benchmark_id": 824, "benchmark_id": "squality", "model_id": "phi-3.5-moe-instruct", "score": 0.241, "normalized_score": 0.241, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.720914+00:00", "updated_at": "2025-07-19T19:56:12.720914+00:00", "benchmark_name": "SQuALITY" }, { "model_benchmark_id": 1507, "benchmark_id": "summscreenfd", "model_id": "phi-3.5-moe-instruct", "score": 0.169, "normalized_score": 0.169, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.232655+00:00", "updated_at": "2025-07-19T19:56:14.232655+00:00", "benchmark_name": "SummScreenFD" }, { "model_benchmark_id": 133, "benchmark_id": "truthfulqa", "model_id": "phi-3.5-moe-instruct", 
"score": 0.775, "normalized_score": 0.775, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "10-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.344788+00:00", "updated_at": "2025-07-19T19:56:11.344788+00:00", "benchmark_name": "TruthfulQA" }, { "model_benchmark_id": 1062, "benchmark_id": "winogrande", "model_id": "phi-3.5-moe-instruct", "score": 0.813, "normalized_score": 0.813, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.215763+00:00", "updated_at": "2025-07-19T19:56:13.215763+00:00", "benchmark_name": "Winogrande" } ] ================================================ FILE: data/organizations/microsoft/models/phi-3.5-moe-instruct/model.json ================================================ { "model_id": "phi-3.5-moe-instruct", "name": "Phi-3.5-MoE-instruct", "organization_id": "microsoft", "fine_tuned_from_model_id": null, "description": "Phi-3.5-MoE-instruct is a mixture-of-experts model with ~42B total parameters (6.6B active) and a 128K context window. It excels at reasoning, math, coding, and multilingual tasks, outperforming larger dense models in many benchmarks. It underwent a thorough safety post-training process (SFT + DPO) and is licensed under MIT. 
This model is ideal for scenarios where efficiency and high performance are both required, particularly in multi-lingual or reasoning-intensive tasks.", "release_date": "2024-08-23", "announcement_date": "2024-08-23", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 60000000000, "training_tokens": 4900000000000, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "source_playground": null, "source_paper": "https://arxiv.org/abs/2404.14219", "source_scorecard_blog_link": "https://techcommunity.microsoft.com/blog/azure-ai-services-blog/discover-the-new-multi-lingual-high-quality-phi-3-5-slms/4225280", "source_repo_link": null, "source_weights_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", "created_at": "2025-07-19T19:49:05.555819+00:00", "updated_at": "2025-07-19T19:49:05.555819+00:00", "model_family_id": null } ================================================ FILE: data/organizations/microsoft/models/phi-3.5-vision-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 1250, "benchmark_id": "ai2d", "model_id": "phi-3.5-vision-instruct", "score": 0.781, "normalized_score": 0.781, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.626694+00:00", "updated_at": "2025-07-19T19:56:13.626694+00:00", "benchmark_name": "AI2D" }, { "model_benchmark_id": 858, "benchmark_id": "chartqa", "model_id": "phi-3.5-vision-instruct", "score": 0.818, "normalized_score": 0.818, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct", "verified_by_llmstats": false, "analysis_method": "standard 
evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.795942+00:00", "updated_at": "2025-07-19T19:56:12.795942+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 1520, "benchmark_id": "intergps", "model_id": "phi-3.5-vision-instruct", "score": 0.363, "normalized_score": 0.363, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.261813+00:00", "updated_at": "2025-07-19T19:56:14.261813+00:00", "benchmark_name": "InterGPS" }, { "model_benchmark_id": 520, "benchmark_id": "mathvista", "model_id": "phi-3.5-vision-instruct", "score": 0.439, "normalized_score": 0.439, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.080462+00:00", "updated_at": "2025-07-19T19:56:12.080462+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1509, "benchmark_id": "mmbench", "model_id": "phi-3.5-vision-instruct", "score": 0.819, "normalized_score": 0.819, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.238017+00:00", "updated_at": "2025-07-19T19:56:14.238017+00:00", "benchmark_name": "MMBench" 
}, { "model_benchmark_id": 563, "benchmark_id": "mmmu", "model_id": "phi-3.5-vision-instruct", "score": 0.43, "normalized_score": 0.43, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.158730+00:00", "updated_at": "2025-07-19T19:56:12.158730+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1522, "benchmark_id": "pope", "model_id": "phi-3.5-vision-instruct", "score": 0.861, "normalized_score": 0.861, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.266959+00:00", "updated_at": "2025-07-19T19:56:14.266959+00:00", "benchmark_name": "POPE" }, { "model_benchmark_id": 1519, "benchmark_id": "scienceqa", "model_id": "phi-3.5-vision-instruct", "score": 0.913, "normalized_score": 0.913, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct", "verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.258220+00:00", "updated_at": "2025-07-19T19:56:14.258220+00:00", "benchmark_name": "ScienceQA" }, { "model_benchmark_id": 906, "benchmark_id": "textvqa", "model_id": "phi-3.5-vision-instruct", "score": 0.72, "normalized_score": 0.72, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct", 
"verified_by_llmstats": false, "analysis_method": "standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.888892+00:00", "updated_at": "2025-07-19T19:56:12.888892+00:00", "benchmark_name": "TextVQA" } ] ================================================ FILE: data/organizations/microsoft/models/phi-3.5-vision-instruct/model.json ================================================ { "model_id": "phi-3.5-vision-instruct", "name": "Phi-3.5-vision-instruct", "organization_id": "microsoft", "fine_tuned_from_model_id": null, "description": "Phi-3.5-vision-instruct is a 4.2B-parameter open multimodal model with up to 128K context tokens. It emphasizes multi-frame image understanding and reasoning, boosting performance on single-image benchmarks while enabling multi-image comparison, summarization, and even video analysis. The model underwent safety post-training for improved instruction-following, alignment, and robust handling of visual and text inputs, and is released under the MIT license.", "release_date": "2024-08-23", "announcement_date": "2024-08-23", "license_id": "mit", "multimodal": true, "knowledge_cutoff": null, "param_count": 4200000000, "training_tokens": 500000000000, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct", "source_playground": null, "source_paper": "https://arxiv.org/abs/2404.14219", "source_scorecard_blog_link": "https://techcommunity.microsoft.com/blog/azure-ai-services-blog/discover-the-new-multi-lingual-high-quality-phi-3-5-slms/4225280", "source_repo_link": null, "source_weights_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct", "created_at": "2025-07-19T19:49:05.563203+00:00", "updated_at": "2025-07-19T19:49:05.563203+00:00", "model_family_id": null } ================================================ FILE: 
data/organizations/microsoft/models/phi-4/benchmarks.json ================================================ [ { "model_benchmark_id": 1445, "benchmark_id": "arena-hard", "model_id": "phi-4", "score": 0.754, "normalized_score": 0.754, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.08905", "verified_by_llmstats": false, "analysis_method": "simple-evals", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.082804+00:00", "updated_at": "2025-07-19T19:56:14.082804+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 947, "benchmark_id": "drop", "model_id": "phi-4", "score": 0.755, "normalized_score": 0.755, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.08905", "verified_by_llmstats": false, "analysis_method": "simple-evals", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.999411+00:00", "updated_at": "2025-07-19T19:56:12.999411+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 282, "benchmark_id": "gpqa", "model_id": "phi-4", "score": 0.561, "normalized_score": 0.561, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.08905", "verified_by_llmstats": false, "analysis_method": "simple-evals", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.644574+00:00", "updated_at": "2025-07-19T19:56:11.644574+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 775, "benchmark_id": "humaneval", "model_id": "phi-4", "score": 0.826, "normalized_score": 0.826, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.08905", "verified_by_llmstats": false, "analysis_method": "simple-evals", "verification_provider_id": 
null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.628035+00:00", "updated_at": "2025-07-19T19:56:12.628035+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 1437, "benchmark_id": "humaneval+", "model_id": "phi-4", "score": 0.828, "normalized_score": 0.828, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.08905", "verified_by_llmstats": false, "analysis_method": "simple-evals", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.064824+00:00", "updated_at": "2025-07-19T19:56:14.064824+00:00", "benchmark_name": "HumanEval+" }, { "model_benchmark_id": 611, "benchmark_id": "ifeval", "model_id": "phi-4", "score": 0.63, "normalized_score": 0.63, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.08905", "verified_by_llmstats": false, "analysis_method": "simple-evals", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.261770+00:00", "updated_at": "2025-07-19T19:56:12.261770+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 746, "benchmark_id": "livebench", "model_id": "phi-4", "score": 0.476, "normalized_score": 0.476, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.08905", "verified_by_llmstats": false, "analysis_method": "simple-evals", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.569213+00:00", "updated_at": "2025-07-19T19:56:12.569213+00:00", "benchmark_name": "LiveBench" }, { "model_benchmark_id": 389, "benchmark_id": "math", "model_id": "phi-4", "score": 0.804, "normalized_score": 0.804, "is_self_reported": true, "self_reported_source_link": 
"https://arxiv.org/pdf/2412.08905", "verified_by_llmstats": false, "analysis_method": "simple-evals", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.837602+00:00", "updated_at": "2025-07-19T19:56:11.837602+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1279, "benchmark_id": "mgsm", "model_id": "phi-4", "score": 0.806, "normalized_score": 0.806, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.08905", "verified_by_llmstats": false, "analysis_method": "simple-evals", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.681417+00:00", "updated_at": "2025-07-19T19:56:13.681417+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 72, "benchmark_id": "mmlu", "model_id": "phi-4", "score": 0.848, "normalized_score": 0.848, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.08905", "verified_by_llmstats": false, "analysis_method": "simple-evals", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.236043+00:00", "updated_at": "2025-07-19T19:56:11.236043+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 176, "benchmark_id": "mmlu-pro", "model_id": "phi-4", "score": 0.704, "normalized_score": 0.704, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.08905", "verified_by_llmstats": false, "analysis_method": "simple-evals", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.441164+00:00", "updated_at": "2025-07-19T19:56:11.441164+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1466, "benchmark_id": "phibench", "model_id": 
"phi-4", "score": 0.562, "normalized_score": 0.562, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.08905", "verified_by_llmstats": false, "analysis_method": "simple-evals", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.124860+00:00", "updated_at": "2025-07-19T19:56:14.124860+00:00", "benchmark_name": "PhiBench" }, { "model_benchmark_id": 233, "benchmark_id": "simpleqa", "model_id": "phi-4", "score": 0.03, "normalized_score": 0.03, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2412.08905", "verified_by_llmstats": false, "analysis_method": "simple-evals", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.546523+00:00", "updated_at": "2025-07-19T19:56:11.546523+00:00", "benchmark_name": "SimpleQA" } ] ================================================ FILE: data/organizations/microsoft/models/phi-4/model.json ================================================ { "model_id": "phi-4", "name": "Phi 4", "organization_id": "microsoft", "fine_tuned_from_model_id": null, "description": "phi-4 is a state-of-the-art open model built to excel at advanced reasoning, coding, and knowledge tasks. 
It leverages a blend of synthetic data, filtered web data, academic texts, and supervised fine-tuning for precision, alignment, and safety.", "release_date": "2024-12-12", "announcement_date": "2024-12-12", "license_id": "mit", "multimodal": false, "knowledge_cutoff": "2024-06-01", "param_count": 14700000000, "training_tokens": 9800000000000, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/microsoft/phi-4", "source_playground": null, "source_paper": "https://arxiv.org/pdf/2412.08905", "source_scorecard_blog_link": "https://techcommunity.microsoft.com/blog/aiplatformblog/introducing-phi-4-microsoft%E2%80%99s-newest-small-language-model-specializing-in-comple/4357090", "source_repo_link": null, "source_weights_link": "https://huggingface.co/microsoft/phi-4", "created_at": "2025-07-19T19:49:05.549276+00:00", "updated_at": "2025-07-19T19:49:05.549276+00:00", "model_family_id": null } ================================================ FILE: data/organizations/microsoft/models/phi-4-mini/benchmarks.json ================================================ [ { "model_benchmark_id": 11, "benchmark_id": "arc-c", "model_id": "phi-4-mini", "score": 0.837, "normalized_score": 0.837, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, "analysis_method": "10-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.105059+00:00", "updated_at": "2025-07-19T19:56:11.105059+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1446, "benchmark_id": "arena-hard", "model_id": "phi-4-mini", "score": 0.328, "normalized_score": 0.328, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.084727+00:00", "updated_at": "2025-07-19T19:56:14.084727+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 1076, "benchmark_id": "big-bench-hard", "model_id": "phi-4-mini", "score": 0.704, "normalized_score": 0.704, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.242363+00:00", "updated_at": "2025-07-19T19:56:13.242363+00:00", "benchmark_name": "BIG-Bench Hard" }, { "model_benchmark_id": 1023, "benchmark_id": "boolq", "model_id": "phi-4-mini", "score": 0.812, "normalized_score": 0.812, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, "analysis_method": "2-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.129244+00:00", "updated_at": "2025-07-19T19:56:13.129244+00:00", "benchmark_name": "BoolQ" }, { "model_benchmark_id": 283, "benchmark_id": "gpqa", "model_id": "phi-4-mini", "score": 0.252, "normalized_score": 0.252, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.646470+00:00", "updated_at": "2025-07-19T19:56:11.646470+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 985, "benchmark_id": "gsm8k", "model_id": "phi-4-mini", "score": 0.886, "normalized_score": 0.886, 
"is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, "analysis_method": "8-shot, CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.066927+00:00", "updated_at": "2025-07-19T19:56:13.066927+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 41, "benchmark_id": "hellaswag", "model_id": "phi-4-mini", "score": 0.691, "normalized_score": 0.691, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.169983+00:00", "updated_at": "2025-07-19T19:56:11.169983+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 390, "benchmark_id": "math", "model_id": "phi-4-mini", "score": 0.64, "normalized_score": 0.64, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.839081+00:00", "updated_at": "2025-07-19T19:56:11.839081+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1280, "benchmark_id": "mgsm", "model_id": "phi-4-mini", "score": 0.639, "normalized_score": 0.639, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.683394+00:00", "updated_at": 
"2025-07-19T19:56:13.683394+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 73, "benchmark_id": "mmlu", "model_id": "phi-4-mini", "score": 0.673, "normalized_score": 0.673, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.237489+00:00", "updated_at": "2025-07-19T19:56:11.237489+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 177, "benchmark_id": "mmlu-pro", "model_id": "phi-4-mini", "score": 0.528, "normalized_score": 0.528, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, "analysis_method": "0-shot, CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.443019+00:00", "updated_at": "2025-07-19T19:56:11.443019+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1473, "benchmark_id": "multilingual-mmlu", "model_id": "phi-4-mini", "score": 0.493, "normalized_score": 0.493, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.141886+00:00", "updated_at": "2025-07-19T19:56:14.141886+00:00", "benchmark_name": "Multilingual MMLU" }, { "model_benchmark_id": 1469, "benchmark_id": "openbookqa", "model_id": "phi-4-mini", "score": 0.792, "normalized_score": 0.792, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, 
"analysis_method": "10-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.132301+00:00", "updated_at": "2025-07-19T19:56:14.132301+00:00", "benchmark_name": "OpenBookQA" }, { "model_benchmark_id": 1032, "benchmark_id": "piqa", "model_id": "phi-4-mini", "score": 0.776, "normalized_score": 0.776, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.150113+00:00", "updated_at": "2025-07-19T19:56:13.150113+00:00", "benchmark_name": "PIQA" }, { "model_benchmark_id": 1041, "benchmark_id": "social-iqa", "model_id": "phi-4-mini", "score": 0.725, "normalized_score": 0.725, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.172567+00:00", "updated_at": "2025-07-19T19:56:13.172567+00:00", "benchmark_name": "Social IQa" }, { "model_benchmark_id": 132, "benchmark_id": "truthfulqa", "model_id": "phi-4-mini", "score": 0.664, "normalized_score": 0.664, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, "analysis_method": "MC2, 10-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.343180+00:00", "updated_at": "2025-07-19T19:56:11.343180+00:00", "benchmark_name": "TruthfulQA" }, { "model_benchmark_id": 149, "benchmark_id": "winogrande", 
"model_id": "phi-4-mini", "score": 0.67, "normalized_score": 0.67, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.382335+00:00", "updated_at": "2025-07-19T19:56:11.382335+00:00", "benchmark_name": "Winogrande" } ] ================================================ FILE: data/organizations/microsoft/models/phi-4-mini/model.json ================================================ { "model_id": "phi-4-mini", "name": "Phi 4 Mini", "organization_id": "microsoft", "fine_tuned_from_model_id": null, "description": "Phi 4 Mini Instruct is a lightweight (3.8B parameters) open model built upon synthetic data and filtered web data, focusing on high-quality reasoning. It supports a 128K token context length and is enhanced for instruction adherence and safety via supervised fine-tuning and direct preference optimization.", "release_date": "2025-02-01", "announcement_date": "2025-02-01", "license_id": "mit", "multimodal": false, "knowledge_cutoff": "2024-06-01", "param_count": 3840000000, "training_tokens": 5000000000000, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": null, "source_paper": "https://arxiv.org/pdf/2503.01743", "source_scorecard_blog_link": "https://azure.microsoft.com/en-us/blog/empowering-innovation-the-next-generation-of-the-phi-family/", "source_repo_link": null, "source_weights_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct", "created_at": "2025-07-19T19:49:05.552796+00:00", "updated_at": "2025-07-19T19:49:05.552796+00:00", "model_family_id": null } ================================================ FILE: data/organizations/microsoft/models/phi-4-mini-reasoning/benchmarks.json ================================================ [ { "model_benchmark_id": 
1436, "benchmark_id": "aime", "model_id": "phi-4-mini-reasoning", "score": 0.575, "normalized_score": 0.575, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-reasoning", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.061299+00:00", "updated_at": "2025-07-19T19:56:14.061299+00:00", "benchmark_name": "AIME" }, { "model_benchmark_id": 281, "benchmark_id": "gpqa", "model_id": "phi-4-mini-reasoning", "score": 0.52, "normalized_score": 0.52, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-reasoning", "verified_by_llmstats": false, "analysis_method": "Diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.642870+00:00", "updated_at": "2025-07-19T19:56:11.642870+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 494, "benchmark_id": "math-500", "model_id": "phi-4-mini-reasoning", "score": 0.946, "normalized_score": 0.946, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-reasoning", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.032863+00:00", "updated_at": "2025-07-19T19:56:12.032863+00:00", "benchmark_name": "MATH-500" } ] ================================================ FILE: data/organizations/microsoft/models/phi-4-mini-reasoning/model.json ================================================ { "model_id": "phi-4-mini-reasoning", "name": "Phi 4 Mini Reasoning", "organization_id": "microsoft", "fine_tuned_from_model_id": null, "description": 
"Phi-4-mini-reasoning is designed for multi-step, logic-intensive mathematical problem-solving tasks under memory/compute constrained environments and latency bound scenarios. Some of the use cases include formal proof generation, symbolic computation, advanced word problems, and a wide range of mathematical reasoning scenarios. These models excel at maintaining context across steps, applying structured logic, and delivering accurate, reliable solutions in domains that require deep analytical thinking.", "release_date": "2025-04-30", "announcement_date": "2025-04-30", "license_id": "mit", "multimodal": false, "knowledge_cutoff": "2025-02-01", "param_count": 3800000000, "training_tokens": 150000000000, "available_in_zeroeval": true, "source_api_ref": "https://learn.microsoft.com/en-us/windows/ai/apis/phi-silica?tabs=csharp0,csharp1,csharp2,csharp3", "source_playground": null, "source_paper": "https://arxiv.org/pdf/2504.21233", "source_scorecard_blog_link": "https://azure.microsoft.com/en-us/blog/one-year-of-phi-small-language-models-making-big-leaps-in-ai/", "source_repo_link": null, "source_weights_link": "https://huggingface.co/microsoft/Phi-4-mini-reasoning", "created_at": "2025-07-19T19:49:05.545846+00:00", "updated_at": "2025-07-19T19:49:05.545846+00:00", "model_family_id": null } ================================================ FILE: data/organizations/microsoft/models/phi-4-multimodal-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 1251, "benchmark_id": "ai2d", "model_id": "phi-4-multimodal-instruct", "score": 0.823, "normalized_score": 0.823, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.628230+00:00", 
"updated_at": "2025-07-19T19:56:13.628230+00:00", "benchmark_name": "AI2D" }, { "model_benchmark_id": 1545, "benchmark_id": "blink", "model_id": "phi-4-multimodal-instruct", "score": 0.613, "normalized_score": 0.613, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.329567+00:00", "updated_at": "2025-07-19T19:56:14.329567+00:00", "benchmark_name": "BLINK" }, { "model_benchmark_id": 859, "benchmark_id": "chartqa", "model_id": "phi-4-multimodal-instruct", "score": 0.814, "normalized_score": 0.814, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.797898+00:00", "updated_at": "2025-07-19T19:56:12.797898+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 881, "benchmark_id": "docvqa", "model_id": "phi-4-multimodal-instruct", "score": 0.932, "normalized_score": 0.932, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.836095+00:00", "updated_at": "2025-07-19T19:56:12.836095+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 1241, "benchmark_id": "infovqa", "model_id": "phi-4-multimodal-instruct", "score": 0.727, "normalized_score": 0.727, "is_self_reported": true, 
"self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.609397+00:00", "updated_at": "2025-07-19T19:56:13.609397+00:00", "benchmark_name": "InfoVQA" }, { "model_benchmark_id": 1521, "benchmark_id": "intergps", "model_id": "phi-4-multimodal-instruct", "score": 0.486, "normalized_score": 0.486, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "verified_by_llmstats": false, "analysis_method": "testmini", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.263464+00:00", "updated_at": "2025-07-19T19:56:14.263464+00:00", "benchmark_name": "InterGPS" }, { "model_benchmark_id": 521, "benchmark_id": "mathvista", "model_id": "phi-4-multimodal-instruct", "score": 0.624, "normalized_score": 0.624, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "verified_by_llmstats": false, "analysis_method": "testmini", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.082453+00:00", "updated_at": "2025-07-19T19:56:12.082453+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1510, "benchmark_id": "mmbench", "model_id": "phi-4-multimodal-instruct", "score": 0.867, "normalized_score": 0.867, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "verified_by_llmstats": false, "analysis_method": "dev-en", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, 
"created_at": "2025-07-19T19:56:14.240071+00:00", "updated_at": "2025-07-19T19:56:14.240071+00:00", "benchmark_name": "MMBench" }, { "model_benchmark_id": 564, "benchmark_id": "mmmu", "model_id": "phi-4-multimodal-instruct", "score": 0.551, "normalized_score": 0.551, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.161302+00:00", "updated_at": "2025-07-19T19:56:12.161302+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1528, "benchmark_id": "mmmu-pro", "model_id": "phi-4-multimodal-instruct", "score": 0.385, "normalized_score": 0.385, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "verified_by_llmstats": false, "analysis_method": "std/vision", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.285447+00:00", "updated_at": "2025-07-19T19:56:14.285447+00:00", "benchmark_name": "MMMU-Pro" }, { "model_benchmark_id": 1538, "benchmark_id": "ocrbench", "model_id": "phi-4-multimodal-instruct", "score": 0.844, "normalized_score": 0.844, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.309778+00:00", "updated_at": "2025-07-19T19:56:14.309778+00:00", "benchmark_name": "OCRBench" }, { "model_benchmark_id": 1523, "benchmark_id": "pope", "model_id": "phi-4-multimodal-instruct", "score": 0.856, "normalized_score": 
0.856, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.268923+00:00", "updated_at": "2025-07-19T19:56:14.268923+00:00", "benchmark_name": "POPE" }, { "model_benchmark_id": 1537, "benchmark_id": "scienceqa-visual", "model_id": "phi-4-multimodal-instruct", "score": 0.975, "normalized_score": 0.975, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "verified_by_llmstats": false, "analysis_method": "img-test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.303456+00:00", "updated_at": "2025-07-19T19:56:14.303456+00:00", "benchmark_name": "ScienceQA Visual" }, { "model_benchmark_id": 907, "benchmark_id": "textvqa", "model_id": "phi-4-multimodal-instruct", "score": 0.756, "normalized_score": 0.756, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "verified_by_llmstats": false, "analysis_method": "Standard Evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.890738+00:00", "updated_at": "2025-07-19T19:56:12.890738+00:00", "benchmark_name": "TextVQA" }, { "model_benchmark_id": 1383, "benchmark_id": "video-mme", "model_id": "phi-4-multimodal-instruct", "score": 0.55, "normalized_score": 0.55, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "verified_by_llmstats": false, "analysis_method": "16 frames", "verification_provider_id": null, "verification_hardware": 
null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.911859+00:00", "updated_at": "2025-07-19T19:56:13.911859+00:00", "benchmark_name": "Video-MME" } ] ================================================ FILE: data/organizations/microsoft/models/phi-4-multimodal-instruct/model.json ================================================ { "model_id": "phi-4-multimodal-instruct", "name": "Phi-4-multimodal-instruct", "organization_id": "microsoft", "fine_tuned_from_model_id": null, "description": "Phi-4-multimodal-instruct is a lightweight (5.57B parameters) open multimodal foundation model that leverages research and datasets from Phi-3.5 and 4.0. It processes text, image, and audio inputs to generate text outputs, supporting a 128K token context length. Enhanced via SFT, DPO, and RLHF for instruction following and safety.", "release_date": "2025-02-01", "announcement_date": "2025-02-01", "license_id": "mit", "multimodal": true, "knowledge_cutoff": "2024-06-01", "param_count": 5600000000, "training_tokens": 5000000000000, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://ai.azure.com/explore/models?selectedCollection=phi&tid=72f988bf-86f1-41af-91ab-2d7cd011db47", "source_paper": "https://arxiv.org/abs/2503.01743", "source_scorecard_blog_link": "https://azure.microsoft.com/en-us/blog/empowering-innovation-the-next-generation-of-the-phi-family/", "source_repo_link": null, "source_weights_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "created_at": "2025-07-19T19:49:05.571307+00:00", "updated_at": "2025-07-19T19:49:05.571307+00:00", "model_family_id": null } ================================================ FILE: data/organizations/microsoft/models/phi-4-reasoning/benchmarks.json ================================================ [ { "model_benchmark_id": 450, "benchmark_id": "aime-2024", "model_id": "phi-4-reasoning", "score": 0.753, "normalized_score": 0.753, "is_self_reported": 
true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.955706+00:00", "updated_at": "2025-07-19T19:56:11.955706+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 688, "benchmark_id": "aime-2025", "model_id": "phi-4-reasoning", "score": 0.629, "normalized_score": 0.629, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.444086+00:00", "updated_at": "2025-07-19T19:56:12.444086+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1450, "benchmark_id": "arena-hard", "model_id": "phi-4-reasoning", "score": 0.733, "normalized_score": 0.733, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.091856+00:00", "updated_at": "2025-07-19T19:56:14.091856+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 1527, "benchmark_id": "flenqa", "model_id": "phi-4-reasoning", "score": 0.977, "normalized_score": 0.977, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "3K-token subset", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:14.281300+00:00", "updated_at": "2025-07-19T19:56:14.281300+00:00", "benchmark_name": "FlenQA" }, { "model_benchmark_id": 287, "benchmark_id": "gpqa", "model_id": "phi-4-reasoning", "score": 0.658, "normalized_score": 0.658, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning", "verified_by_llmstats": false, "analysis_method": "Diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.654843+00:00", "updated_at": "2025-07-19T19:56:11.654843+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1439, "benchmark_id": "humaneval+", "model_id": "phi-4-reasoning", "score": 0.929, "normalized_score": 0.929, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.068831+00:00", "updated_at": "2025-07-19T19:56:14.068831+00:00", "benchmark_name": "HumanEval+" }, { "model_benchmark_id": 613, "benchmark_id": "ifeval", "model_id": "phi-4-reasoning", "score": 0.834, "normalized_score": 0.834, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "Strict", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.265033+00:00", "updated_at": "2025-07-19T19:56:12.265033+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1114, "benchmark_id": "livecodebench", "model_id": "phi-4-reasoning", "score": 0.538, "normalized_score": 0.538, "is_self_reported": true, "self_reported_source_link": 
"https://huggingface.co/microsoft/Phi-4-reasoning", "verified_by_llmstats": false, "analysis_method": "8/1/24\u20132/1/25", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.324523+00:00", "updated_at": "2025-07-19T19:56:13.324523+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 183, "benchmark_id": "mmlu-pro", "model_id": "phi-4-reasoning", "score": 0.743, "normalized_score": 0.743, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.453150+00:00", "updated_at": "2025-07-19T19:56:11.453150+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1525, "benchmark_id": "omnimath", "model_id": "phi-4-reasoning", "score": 0.766, "normalized_score": 0.766, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.276205+00:00", "updated_at": "2025-07-19T19:56:14.276205+00:00", "benchmark_name": "OmniMath" }, { "model_benchmark_id": 1468, "benchmark_id": "phibench", "model_id": "phi-4-reasoning", "score": 0.706, "normalized_score": 0.706, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "2.21", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.127989+00:00", "updated_at": 
"2025-07-19T19:56:14.127989+00:00", "benchmark_name": "PhiBench" } ] ================================================ FILE: data/organizations/microsoft/models/phi-4-reasoning/model.json ================================================ { "model_id": "phi-4-reasoning", "name": "Phi 4 Reasoning", "organization_id": "microsoft", "fine_tuned_from_model_id": "phi-4", "description": "Phi-4-reasoning is a state-of-the-art open-weight reasoning model finetuned from Phi-4 using supervised fine-tuning on a dataset of chain-of-thought traces and reinforcement learning. It focuses on math, science, and coding skills.", "release_date": "2025-04-30", "announcement_date": "2025-04-30", "license_id": "mit", "multimodal": false, "knowledge_cutoff": "2025-03-01", "param_count": 14000000000, "training_tokens": 16000000000, "available_in_zeroeval": true, "source_api_ref": "https://learn.microsoft.com/en-us/windows/ai/apis/phi-silica?tabs=csharp0,csharp1,csharp2,csharp3", "source_playground": null, "source_paper": "https://arxiv.org/abs/2504.21318", "source_scorecard_blog_link": "https://azure.microsoft.com/en-us/blog/one-year-of-phi-small-language-models-making-big-leaps-in-ai/", "source_repo_link": null, "source_weights_link": "https://huggingface.co/microsoft/Phi-4-reasoning", "created_at": "2025-07-19T19:49:05.879382+00:00", "updated_at": "2025-07-19T19:49:05.879382+00:00", "model_family_id": null } ================================================ FILE: data/organizations/microsoft/models/phi-4-reasoning-plus/benchmarks.json ================================================ [ { "model_benchmark_id": 449, "benchmark_id": "aime-2024", "model_id": "phi-4-reasoning-plus", "score": 0.813, "normalized_score": 0.813, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:11.953709+00:00", "updated_at": "2025-07-19T19:56:11.953709+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 687, "benchmark_id": "aime-2025", "model_id": "phi-4-reasoning-plus", "score": 0.78, "normalized_score": 0.78, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.440995+00:00", "updated_at": "2025-07-19T19:56:12.440995+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1449, "benchmark_id": "arena-hard", "model_id": "phi-4-reasoning-plus", "score": 0.79, "normalized_score": 0.79, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.090173+00:00", "updated_at": "2025-07-19T19:56:14.090173+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 1526, "benchmark_id": "flenqa", "model_id": "phi-4-reasoning-plus", "score": 0.979, "normalized_score": 0.979, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "3K-token subset", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.279654+00:00", "updated_at": "2025-07-19T19:56:14.279654+00:00", "benchmark_name": "FlenQA" }, { "model_benchmark_id": 286, "benchmark_id": "gpqa", "model_id": "phi-4-reasoning-plus", "score": 0.689, 
"normalized_score": 0.689, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "Diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.652983+00:00", "updated_at": "2025-07-19T19:56:11.652983+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1438, "benchmark_id": "humaneval+", "model_id": "phi-4-reasoning-plus", "score": 0.923, "normalized_score": 0.923, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.066904+00:00", "updated_at": "2025-07-19T19:56:14.066904+00:00", "benchmark_name": "HumanEval+" }, { "model_benchmark_id": 612, "benchmark_id": "ifeval", "model_id": "phi-4-reasoning-plus", "score": 0.849, "normalized_score": 0.849, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "Strict", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.263243+00:00", "updated_at": "2025-07-19T19:56:12.263243+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1113, "benchmark_id": "livecodebench", "model_id": "phi-4-reasoning-plus", "score": 0.531, "normalized_score": 0.531, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "8/1/24\u20132/1/25", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:13.322076+00:00", "updated_at": "2025-07-19T19:56:13.322076+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 182, "benchmark_id": "mmlu-pro", "model_id": "phi-4-reasoning-plus", "score": 0.76, "normalized_score": 0.76, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.451685+00:00", "updated_at": "2025-07-19T19:56:11.451685+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1524, "benchmark_id": "omnimath", "model_id": "phi-4-reasoning-plus", "score": 0.819, "normalized_score": 0.819, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.274539+00:00", "updated_at": "2025-07-19T19:56:14.274539+00:00", "benchmark_name": "OmniMath" }, { "model_benchmark_id": 1467, "benchmark_id": "phibench", "model_id": "phi-4-reasoning-plus", "score": 0.742, "normalized_score": 0.742, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "verified_by_llmstats": false, "analysis_method": "2.21", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.126449+00:00", "updated_at": "2025-07-19T19:56:14.126449+00:00", "benchmark_name": "PhiBench" } ] ================================================ FILE: data/organizations/microsoft/models/phi-4-reasoning-plus/model.json 
================================================ { "model_id": "phi-4-reasoning-plus", "name": "Phi 4 Reasoning Plus", "organization_id": "microsoft", "fine_tuned_from_model_id": null, "description": "Phi-4-reasoning-plus is a state-of-the-art open-weight reasoning model finetuned from Phi-4 using supervised fine-tuning and reinforcement learning. It focuses on math, science, and coding skills. This 'plus' version has higher accuracy due to additional RL training but may have higher latency.", "release_date": "2025-04-30", "announcement_date": "2025-04-30", "license_id": "mit", "multimodal": false, "knowledge_cutoff": "2025-03-01", "param_count": 14000000000, "training_tokens": 16000000000, "available_in_zeroeval": true, "source_api_ref": "https://learn.microsoft.com/en-us/windows/ai/apis/phi-silica?tabs=csharp0,csharp1,csharp2,csharp3", "source_playground": null, "source_paper": "https://arxiv.org/abs/2504.21318", "source_scorecard_blog_link": "https://azure.microsoft.com/en-us/blog/one-year-of-phi-small-language-models-making-big-leaps-in-ai/", "source_repo_link": null, "source_weights_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus", "created_at": "2025-07-19T19:49:05.567534+00:00", "updated_at": "2025-07-19T19:49:05.567534+00:00", "model_family_id": null } ================================================ FILE: data/organizations/microsoft/organization.json ================================================ { "organization_id": "microsoft", "name": "Microsoft", "website": "https://microsoft.com", "description": "Technology company", "country": "US", "created_at": "2025-07-19T19:49:05.543205+00:00", "updated_at": "2025-07-19T19:49:05.543205+00:00" } ================================================ FILE: data/organizations/mistral/models/codestral-22b/benchmarks.json ================================================ [ { "model_benchmark_id": 1823, "benchmark_id": "cruxeval-o", "model_id": "codestral-22b", "score": 0.513, "normalized_score": 0.513, 
"is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/codestral/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.151317+00:00", "updated_at": "2025-07-19T19:56:15.151317+00:00", "benchmark_name": "CruxEval-O" }, { "model_benchmark_id": 809, "benchmark_id": "humaneval", "model_id": "codestral-22b", "score": 0.811, "normalized_score": 0.811, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/codestral/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.685855+00:00", "updated_at": "2025-07-19T19:56:12.685855+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 1827, "benchmark_id": "humaneval-average", "model_id": "codestral-22b", "score": 0.615, "normalized_score": 0.615, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/codestral/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.174206+00:00", "updated_at": "2025-07-19T19:56:15.174206+00:00", "benchmark_name": "HumanEval-Average" }, { "model_benchmark_id": 1826, "benchmark_id": "humanevalfim-average", "model_id": "codestral-22b", "score": 0.916, "normalized_score": 0.916, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/codestral/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.169908+00:00", "updated_at": 
"2025-07-19T19:56:15.169908+00:00", "benchmark_name": "HumanEvalFIM-Average" }, { "model_benchmark_id": 1196, "benchmark_id": "mbpp", "model_id": "codestral-22b", "score": 0.782, "normalized_score": 0.782, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/codestral/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.517772+00:00", "updated_at": "2025-07-19T19:56:13.517772+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1824, "benchmark_id": "repobench", "model_id": "codestral-22b", "score": 0.34, "normalized_score": 0.34, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/codestral/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.155008+00:00", "updated_at": "2025-07-19T19:56:15.155008+00:00", "benchmark_name": "RepoBench" }, { "model_benchmark_id": 1825, "benchmark_id": "spider", "model_id": "codestral-22b", "score": 0.635, "normalized_score": 0.635, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/codestral/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.159626+00:00", "updated_at": "2025-07-19T19:56:15.159626+00:00", "benchmark_name": "Spider" } ] ================================================ FILE: data/organizations/mistral/models/codestral-22b/model.json ================================================ { "model_id": "codestral-22b", "name": "Codestral-22B", "organization_id": "mistral", "fine_tuned_from_model_id": null, "description": "A 22B parameter code 
generation model trained on 80+ programming languages including Python, Java, C, C++, JavaScript, and Bash. Supports both instruction-following and fill-in-the-middle (FIM) capabilities for code completion and generation tasks.", "release_date": "2024-05-29", "announcement_date": "2024-05-29", "license_id": "mnpl_0_1", "multimodal": false, "knowledge_cutoff": null, "param_count": 22200000000, "training_tokens": null, "available_in_zeroeval": false, "source_api_ref": "https://docs.mistral.ai/api/", "source_playground": "https://chat.mistral.ai/", "source_paper": null, "source_scorecard_blog_link": "https://mistral.ai/news/codestral/", "source_repo_link": null, "source_weights_link": "https://huggingface.co/mistralai/Codestral-22B-v0.1", "created_at": "2025-07-19T19:49:05.805621+00:00", "updated_at": "2025-07-19T19:49:05.805621+00:00", "model_family_id": null } ================================================ FILE: data/organizations/mistral/models/devstral-medium-2507/benchmarks.json ================================================ [ { "model_benchmark_id": 1352, "benchmark_id": "swe-bench-verified", "model_id": "devstral-medium-2507", "score": 0.616, "normalized_score": 0.616, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/devstral-2507", "verified_by_llmstats": false, "analysis_method": "N/A", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.845635+00:00", "updated_at": "2025-07-19T19:56:13.845635+00:00", "benchmark_name": "SWE-Bench Verified" } ] ================================================ FILE: data/organizations/mistral/models/devstral-medium-2507/model.json ================================================ { "model_id": "devstral-medium-2507", "name": "Devstral Medium", "organization_id": "mistral", "fine_tuned_from_model_id": null, "description": "Devstral Medium builds upon the strengths of Devstral Small and takes 
performance to the next level with a score of 61.6% on SWE-Bench Verified. Devstral Medium is available through the Mistral public API, and offers exceptional performance at a competitive price point, making it an ideal choice for businesses and developers looking for a high-quality, cost-effective model.", "release_date": "2025-07-10", "announcement_date": "2025-07-10", "license_id": "proprietary", "multimodal": false, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://console.mistral.ai", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://mistral.ai/news/devstral-2507", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.783461+00:00", "updated_at": "2025-07-19T19:49:05.783461+00:00", "model_family_id": null } ================================================ FILE: data/organizations/mistral/models/devstral-small-2507/benchmarks.json ================================================ [ { "model_benchmark_id": 1353, "benchmark_id": "swe-bench-verified", "model_id": "devstral-small-2507", "score": 0.536, "normalized_score": 0.536, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Devstral-Small-2507", "verified_by_llmstats": false, "analysis_method": "OpenHands scaffold", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.847228+00:00", "updated_at": "2025-07-19T19:56:13.847228+00:00", "benchmark_name": "SWE-Bench Verified" } ] ================================================ FILE: data/organizations/mistral/models/devstral-small-2507/model.json ================================================ { "model_id": "devstral-small-2507", "name": "Devstral Small 1.1", "organization_id": "mistral", "fine_tuned_from_model_id": null, "description": "Devstral Small 1.1 (also called 
devstral-small-2507) is based on the Mistral-Small-3.1 foundation model and contains approximately 24 billion parameters. It supports a 128k token context window, which allows it to handle multi-file code inputs and long prompts typical in software engineering workflows. The model is fine-tuned specifically for structured outputs, including XML and function-calling formats. This makes it compatible with agent frameworks such as OpenHands and suitable for tasks like program navigation, multi-step edits, and code search. It is licensed under Apache 2.0 and available for both research and commercial use.", "release_date": "2025-07-11", "announcement_date": "2025-07-11", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 24000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://console.mistral.ai", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://huggingface.co/mistralai/Devstral-Small-2507", "source_repo_link": null, "source_weights_link": "https://huggingface.co/mistralai/Devstral-Small-2507/blob/main/model.safetensors.index.json", "created_at": "2025-07-19T19:49:05.797947+00:00", "updated_at": "2025-07-19T19:49:05.797947+00:00", "model_family_id": null } ================================================ FILE: data/organizations/mistral/models/magistral-medium/benchmarks.json ================================================ [ { "model_benchmark_id": 665, "benchmark_id": "aider-polyglot", "model_id": "magistral-medium", "score": 0.471, "normalized_score": 0.471, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2506.10910", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.379075+00:00", "updated_at": "2025-07-19T19:56:12.379075+00:00", "benchmark_name": 
"Aider-Polyglot" }, { "model_benchmark_id": 480, "benchmark_id": "aime-2024", "model_id": "magistral-medium", "score": 0.736, "normalized_score": 0.736, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2506.10910", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.011044+00:00", "updated_at": "2025-07-19T19:56:12.011044+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 704, "benchmark_id": "aime-2025", "model_id": "magistral-medium", "score": 0.649, "normalized_score": 0.649, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2506.10910", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.473748+00:00", "updated_at": "2025-07-19T19:56:12.473748+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 343, "benchmark_id": "gpqa", "model_id": "magistral-medium", "score": 0.708, "normalized_score": 0.708, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2506.10910", "verified_by_llmstats": false, "analysis_method": "Diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.745089+00:00", "updated_at": "2025-07-19T19:56:11.745089+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 724, "benchmark_id": "humanity's-last-exam", "model_id": "magistral-medium", "score": 0.09, "normalized_score": 0.09, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2506.10910", "verified_by_llmstats": false, "analysis_method": "text subset", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.525031+00:00", "updated_at": "2025-07-19T19:56:12.525031+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 1145, "benchmark_id": "livecodebench", "model_id": "magistral-medium", "score": 0.503, "normalized_score": 0.503, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/pdf/2506.10910", "verified_by_llmstats": false, "analysis_method": "v6", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.408465+00:00", "updated_at": "2025-07-19T19:56:13.410002+00:00", "benchmark_name": "LiveCodeBench" } ] ================================================ FILE: data/organizations/mistral/models/magistral-medium/model.json ================================================ { "model_id": "magistral-medium", "name": "Magistral Medium", "organization_id": "mistral", "fine_tuned_from_model_id": null, "description": "Trained solely with reinforcement learning on top of Mistral Medium 3, Magistral Medium is a reasoning model that achieves strong performance on complex math and code tasks without relying on distillation from existing reasoning models. 
The training uses an RLVR framework with modifications to GRPO, enabling improved reasoning ability and multilingual consistency.", "release_date": "2025-06-10", "announcement_date": "2025-06-10", "license_id": "apache_2_0", "multimodal": true, "knowledge_cutoff": "2025-06-01", "param_count": 24000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.mistral.ai/api/", "source_playground": "https://chat.mistral.ai/", "source_paper": "https://arxiv.org/pdf/2506.10910", "source_scorecard_blog_link": "https://mistral.ai/news/magistral", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.780565+00:00", "updated_at": "2025-07-19T19:49:05.780565+00:00", "model_family_id": null } ================================================ FILE: data/organizations/mistral/models/magistral-small-2506/benchmarks.json ================================================ [ { "model_benchmark_id": 479, "benchmark_id": "aime-2024", "model_id": "magistral-small-2506", "score": 0.7068, "normalized_score": 0.7068, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Magistral-Small-2506", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.009597+00:00", "updated_at": "2025-07-19T19:56:12.009597+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 703, "benchmark_id": "aime-2025", "model_id": "magistral-small-2506", "score": 0.6276, "normalized_score": 0.6276, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Magistral-Small-2506", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.471565+00:00", "updated_at": 
"2025-07-19T19:56:12.471565+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 342, "benchmark_id": "gpqa", "model_id": "magistral-small-2506", "score": 0.6818, "normalized_score": 0.6818, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Magistral-Small-2506", "verified_by_llmstats": false, "analysis_method": "Diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.743610+00:00", "updated_at": "2025-07-19T19:56:11.743610+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1144, "benchmark_id": "livecodebench", "model_id": "magistral-small-2506", "score": 0.513, "normalized_score": 0.513, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/codestral/", "verified_by_llmstats": false, "analysis_method": "v5", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.406640+00:00", "updated_at": "2025-07-19T19:56:13.406640+00:00", "benchmark_name": "LiveCodeBench" } ] ================================================ FILE: data/organizations/mistral/models/magistral-small-2506/model.json ================================================ { "model_id": "magistral-small-2506", "name": "Magistral Small 2506", "organization_id": "mistral", "fine_tuned_from_model_id": null, "description": "Building upon Mistral Small 3.1 (2503), with added reasoning capabilities, undergoing SFT from Magistral Medium traces and RL on top, it's a small, efficient reasoning model with 24B parameters. 
Magistral Small can be deployed locally, fitting within a single RTX 4090 or a 32GB RAM MacBook once quantized.", "release_date": "2025-06-10", "announcement_date": "2025-06-10", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": "2025-06-01", "param_count": 24000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.mistral.ai/api/", "source_playground": "https://chat.mistral.ai/", "source_paper": "https://arxiv.org/pdf/2506.10910", "source_scorecard_blog_link": "https://mistral.ai/news/magistral", "source_repo_link": null, "source_weights_link": "https://huggingface.co/mistralai/Magistral-Small-2506", "created_at": "2025-07-19T19:49:05.777162+00:00", "updated_at": "2025-07-19T19:49:05.777162+00:00", "model_family_id": null } ================================================ FILE: data/organizations/mistral/models/ministral-8b-instruct-2410/benchmarks.json ================================================ [ { "model_benchmark_id": 1410, "benchmark_id": "agieval", "model_id": "ministral-8b-instruct-2410", "score": 0.483, "normalized_score": 0.483, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.978647+00:00", "updated_at": "2025-07-19T19:56:13.978647+00:00", "benchmark_name": "AGIEval" }, { "model_benchmark_id": 30, "benchmark_id": "arc-c", "model_id": "ministral-8b-instruct-2410", "score": 0.719, "normalized_score": 0.719, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:11.142536+00:00", "updated_at": "2025-07-19T19:56:11.142536+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1464, "benchmark_id": "arena-hard", "model_id": "ministral-8b-instruct-2410", "score": 0.709, "normalized_score": 0.709, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410", "verified_by_llmstats": false, "analysis_method": "", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.118772+00:00", "updated_at": "2025-07-19T19:56:14.118772+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 1820, "benchmark_id": "french-mmlu", "model_id": "ministral-8b-instruct-2410", "score": 0.575, "normalized_score": 0.575, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.137792+00:00", "updated_at": "2025-07-19T19:56:15.137792+00:00", "benchmark_name": "French MMLU" }, { "model_benchmark_id": 806, "benchmark_id": "humaneval", "model_id": "ministral-8b-instruct-2410", "score": 0.348, "normalized_score": 0.348, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.681246+00:00", "updated_at": "2025-07-19T19:56:12.681246+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 422, "benchmark_id": "math", "model_id": "ministral-8b-instruct-2410", "score": 0.545, "normalized_score": 0.545, "is_self_reported": true, 
"self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.895272+00:00", "updated_at": "2025-07-19T19:56:11.895272+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1821, "benchmark_id": "mbpp-pass@1", "model_id": "ministral-8b-instruct-2410", "score": 0.7, "normalized_score": 0.7, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.141858+00:00", "updated_at": "2025-07-19T19:56:15.141858+00:00", "benchmark_name": "MBPP pass@1" }, { "model_benchmark_id": 112, "benchmark_id": "mmlu", "model_id": "ministral-8b-instruct-2410", "score": 0.65, "normalized_score": 0.65, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.309619+00:00", "updated_at": "2025-07-19T19:56:11.309619+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1612, "benchmark_id": "mt-bench", "model_id": "ministral-8b-instruct-2410", "score": 0.83, "normalized_score": 0.83, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:14.535003+00:00", "updated_at": "2025-07-19T19:56:14.535003+00:00", "benchmark_name": "MT-Bench" }, { "model_benchmark_id": 253, "benchmark_id": "triviaqa", "model_id": "ministral-8b-instruct-2410", "score": 0.655, "normalized_score": 0.655, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.582765+00:00", "updated_at": "2025-07-19T19:56:11.582765+00:00", "benchmark_name": "TriviaQA" }, { "model_benchmark_id": 155, "benchmark_id": "winogrande", "model_id": "ministral-8b-instruct-2410", "score": 0.753, "normalized_score": 0.753, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.394106+00:00", "updated_at": "2025-07-19T19:56:11.394106+00:00", "benchmark_name": "Winogrande" } ] ================================================ FILE: data/organizations/mistral/models/ministral-8b-instruct-2410/model.json ================================================ { "model_id": "ministral-8b-instruct-2410", "name": "Ministral 8B Instruct", "organization_id": "mistral", "fine_tuned_from_model_id": null, "description": "The Ministral-8B-Instruct-2410 is an instruct fine-tuned model for local intelligence, on-device computing, and at-the-edge use cases, significantly outperforming existing models of similar size.", "release_date": "2024-10-16", "announcement_date": "2024-10-16", "license_id": "mistral_research_license", "multimodal": false, "knowledge_cutoff": null, "param_count": 8019808256, "training_tokens": null, 
"available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://mistral.ai/news/ministraux/", "source_repo_link": null, "source_weights_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410", "created_at": "2025-07-19T19:49:05.786083+00:00", "updated_at": "2025-07-19T19:49:05.786083+00:00", "model_family_id": null } ================================================ FILE: data/organizations/mistral/models/mistral-large-2-2407/benchmarks.json ================================================ [ { "model_benchmark_id": 1014, "benchmark_id": "gsm8k", "model_id": "mistral-large-2-2407", "score": 0.93, "normalized_score": 0.93, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.113392+00:00", "updated_at": "2025-07-19T19:56:13.113392+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 810, "benchmark_id": "humaneval", "model_id": "mistral-large-2-2407", "score": 0.92, "normalized_score": 0.92, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.687406+00:00", "updated_at": "2025-07-19T19:56:12.687406+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 116, "benchmark_id": "mmlu", "model_id": "mistral-large-2-2407", "score": 0.84, "normalized_score": 0.84, "is_self_reported": true, "self_reported_source_link": 
"https://mistral.ai/news/mistral-large-2407/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.316024+00:00", "updated_at": "2025-07-19T19:56:11.316024+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1828, "benchmark_id": "mmlu-french", "model_id": "mistral-large-2-2407", "score": 0.828, "normalized_score": 0.828, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.178056+00:00", "updated_at": "2025-07-19T19:56:15.178056+00:00", "benchmark_name": "MMLU French" }, { "model_benchmark_id": 1615, "benchmark_id": "mt-bench", "model_id": "mistral-large-2-2407", "score": 0.863, "normalized_score": 0.863, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.541051+00:00", "updated_at": "2025-07-19T19:56:14.541051+00:00", "benchmark_name": "MT-Bench" } ] ================================================ FILE: data/organizations/mistral/models/mistral-large-2-2407/model.json ================================================ { "model_id": "mistral-large-2-2407", "name": "Mistral Large 2", "organization_id": "mistral", "fine_tuned_from_model_id": null, "description": "A 123B parameter model with strong capabilities in code generation, mathematics, and reasoning. 
Features enhanced multilingual support across dozens of languages, 128k context window, and advanced function calling capabilities. Excels in instruction-following and maintains concise outputs.", "release_date": "2024-07-24", "announcement_date": "2024-07-24", "license_id": "mistral_research_license", "multimodal": false, "knowledge_cutoff": null, "param_count": 123000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.mistral.ai/", "source_playground": "https://chat.mistral.ai/", "source_paper": null, "source_scorecard_blog_link": "https://mistral.ai/news/mistral-large-2407/", "source_repo_link": null, "source_weights_link": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407", "created_at": "2025-07-19T19:49:05.813974+00:00", "updated_at": "2025-07-19T19:49:05.813974+00:00", "model_family_id": null } ================================================ FILE: data/organizations/mistral/models/mistral-nemo-instruct-2407/benchmarks.json ================================================ [ { "model_benchmark_id": 1819, "benchmark_id": "commonsenseqa", "model_id": "mistral-nemo-instruct-2407", "score": 0.704, "normalized_score": 0.704, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.133096+00:00", "updated_at": "2025-07-19T19:56:15.133096+00:00", "benchmark_name": "CommonSenseQA" }, { "model_benchmark_id": 54, "benchmark_id": "hellaswag", "model_id": "mistral-nemo-instruct-2407", "score": 0.835, "normalized_score": 0.835, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.196732+00:00", "updated_at": "2025-07-19T19:56:11.196732+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 111, "benchmark_id": "mmlu", "model_id": "mistral-nemo-instruct-2407", "score": 0.68, "normalized_score": 0.68, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407", "verified_by_llmstats": false, "analysis_method": "5-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.308247+00:00", "updated_at": "2025-07-19T19:56:11.308247+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1050, "benchmark_id": "natural-questions", "model_id": "mistral-nemo-instruct-2407", "score": 0.312, "normalized_score": 0.312, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407", "verified_by_llmstats": false, "analysis_method": "5-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.191770+00:00", "updated_at": "2025-07-19T19:56:13.191770+00:00", "benchmark_name": "Natural Questions" }, { "model_benchmark_id": 1472, "benchmark_id": "openbookqa", "model_id": "mistral-nemo-instruct-2407", "score": 0.606, "normalized_score": 0.606, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.138075+00:00", "updated_at": "2025-07-19T19:56:14.138075+00:00", "benchmark_name": 
"OpenBookQA" }, { "model_benchmark_id": 252, "benchmark_id": "triviaqa", "model_id": "mistral-nemo-instruct-2407", "score": 0.738, "normalized_score": 0.738, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407", "verified_by_llmstats": false, "analysis_method": "5-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.581108+00:00", "updated_at": "2025-07-19T19:56:11.581108+00:00", "benchmark_name": "TriviaQA" }, { "model_benchmark_id": 146, "benchmark_id": "truthfulqa", "model_id": "mistral-nemo-instruct-2407", "score": 0.503, "normalized_score": 0.503, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.369082+00:00", "updated_at": "2025-07-19T19:56:11.369082+00:00", "benchmark_name": "TruthfulQA" }, { "model_benchmark_id": 154, "benchmark_id": "winogrande", "model_id": "mistral-nemo-instruct-2407", "score": 0.768, "normalized_score": 0.768, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407", "verified_by_llmstats": false, "analysis_method": "0-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.392106+00:00", "updated_at": "2025-07-19T19:56:11.392106+00:00", "benchmark_name": "Winogrande" } ] ================================================ FILE: data/organizations/mistral/models/mistral-nemo-instruct-2407/model.json ================================================ { "model_id": "mistral-nemo-instruct-2407", 
"name": "Mistral NeMo Instruct", "organization_id": "mistral", "fine_tuned_from_model_id": null, "description": "A state-of-the-art 12B multilingual model with a 128k context window, designed for global applications and strong in multiple languages.", "release_date": "2024-07-18", "announcement_date": "2024-07-18", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 12000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.mistral.ai/getting-started/models/models_overview/", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://mistral.ai/news/mistral-nemo/", "source_repo_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407", "source_weights_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407", "created_at": "2025-07-19T19:49:05.773595+00:00", "updated_at": "2025-07-19T19:49:05.773595+00:00", "model_family_id": null } ================================================ FILE: data/organizations/mistral/models/mistral-small-2409/model.json ================================================ { "model_id": "mistral-small-2409", "name": "Mistral Small", "organization_id": "mistral", "fine_tuned_from_model_id": null, "description": "An enterprise-grade 22B parameter model optimized for tasks like translation, summarization, and sentiment analysis. 
Offers significant improvements in human alignment, reasoning capabilities, and code generation compared to previous versions.", "release_date": "2024-09-17", "announcement_date": "2024-09-17", "license_id": "mistral_research_license", "multimodal": false, "knowledge_cutoff": null, "param_count": 22000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.mistral.ai/api/", "source_playground": "https://console.mistral.ai/", "source_paper": null, "source_scorecard_blog_link": "https://mistral.ai/news/september-24-release/", "source_repo_link": null, "source_weights_link": "https://huggingface.co/mistralai/Mistral-Small-Instruct-2409", "created_at": "2025-07-19T19:49:05.809465+00:00", "updated_at": "2025-07-19T19:49:05.809465+00:00", "model_family_id": null } ================================================ FILE: data/organizations/mistral/models/mistral-small-24b-base-2501/benchmarks.json ================================================ [ { "model_benchmark_id": 1411, "benchmark_id": "agieval", "model_id": "mistral-small-24b-base-2501", "score": 0.658, "normalized_score": 0.658, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.980585+00:00", "updated_at": "2025-07-19T19:56:13.980585+00:00", "benchmark_name": "AGIEval" }, { "model_benchmark_id": 31, "benchmark_id": "arc-c", "model_id": "mistral-small-24b-base-2501", "score": 0.9129, "normalized_score": 0.9129, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": 
null, "created_at": "2025-07-19T19:56:11.143960+00:00", "updated_at": "2025-07-19T19:56:11.143960+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 345, "benchmark_id": "gpqa", "model_id": "mistral-small-24b-base-2501", "score": 0.3437, "normalized_score": 0.3437, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501", "verified_by_llmstats": false, "analysis_method": "5-shot, CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.748111+00:00", "updated_at": "2025-07-19T19:56:11.748111+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1013, "benchmark_id": "gsm8k", "model_id": "mistral-small-24b-base-2501", "score": 0.8073, "normalized_score": 0.8073, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501", "verified_by_llmstats": false, "analysis_method": "5-shot, maj@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.111924+00:00", "updated_at": "2025-07-19T19:56:13.111924+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 424, "benchmark_id": "math", "model_id": "mistral-small-24b-base-2501", "score": 0.4598, "normalized_score": 0.4598, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501", "verified_by_llmstats": false, "analysis_method": "5-shot, MaJ", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.898806+00:00", "updated_at": "2025-07-19T19:56:11.898806+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1195, "benchmark_id": "mbpp", "model_id": "mistral-small-24b-base-2501", "score": 0.6964, "normalized_score": 
0.6964, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.516399+00:00", "updated_at": "2025-07-19T19:56:13.516399+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 113, "benchmark_id": "mmlu", "model_id": "mistral-small-24b-base-2501", "score": 0.8073, "normalized_score": 0.8073, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.311218+00:00", "updated_at": "2025-07-19T19:56:11.311218+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 217, "benchmark_id": "mmlu-pro", "model_id": "mistral-small-24b-base-2501", "score": 0.5437, "normalized_score": 0.5437, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.511957+00:00", "updated_at": "2025-07-19T19:56:11.511957+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 254, "benchmark_id": "triviaqa", "model_id": "mistral-small-24b-base-2501", "score": 0.8032, "normalized_score": 0.8032, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:11.585944+00:00", "updated_at": "2025-07-19T19:56:11.585944+00:00", "benchmark_name": "TriviaQA" } ] ================================================ FILE: data/organizations/mistral/models/mistral-small-24b-base-2501/model.json ================================================ { "model_id": "mistral-small-24b-base-2501", "name": "Mistral Small 3 24B Base", "organization_id": "mistral", "fine_tuned_from_model_id": null, "description": "Mistral Small 3 is competitive with larger models such as Llama 3.3 70B or Qwen 32B, and is an excellent open replacement for opaque proprietary models like GPT4o-mini. Mistral Small 3 is on par with Llama 3.3 70B instruct, while being more than 3x faster on the same hardware.", "release_date": "2025-01-30", "announcement_date": "2025-01-30", "license_id": "apache_2_0", "multimodal": true, "knowledge_cutoff": "2023-10-01", "param_count": 23600000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://console.mistral.ai/", "source_paper": null, "source_scorecard_blog_link": "https://mistral.ai/news/mistral-small-3", "source_repo_link": null, "source_weights_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501", "created_at": "2025-07-19T19:49:05.791166+00:00", "updated_at": "2025-07-19T19:49:05.791166+00:00", "model_family_id": null } ================================================ FILE: data/organizations/mistral/models/mistral-small-24b-instruct-2501/benchmarks.json ================================================ [ { "model_benchmark_id": 1465, "benchmark_id": "arena-hard", "model_id": "mistral-small-24b-instruct-2501", "score": 0.876, "normalized_score": 0.876, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": 
null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.120697+00:00", "updated_at": "2025-07-19T19:56:14.120697+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 344, "benchmark_id": "gpqa", "model_id": "mistral-small-24b-instruct-2501", "score": 0.453, "normalized_score": 0.453, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501", "verified_by_llmstats": false, "analysis_method": "5 shot COT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.746578+00:00", "updated_at": "2025-07-19T19:56:11.746578+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 807, "benchmark_id": "humaneval", "model_id": "mistral-small-24b-instruct-2501", "score": 0.848, "normalized_score": 0.848, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501", "verified_by_llmstats": false, "analysis_method": "5 shot COT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.682647+00:00", "updated_at": "2025-07-19T19:56:12.682647+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 630, "benchmark_id": "ifeval", "model_id": "mistral-small-24b-instruct-2501", "score": 0.829, "normalized_score": 0.829, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.295754+00:00", "updated_at": "2025-07-19T19:56:12.295754+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 423, "benchmark_id": "math", 
"model_id": "mistral-small-24b-instruct-2501", "score": 0.706, "normalized_score": 0.706, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501", "verified_by_llmstats": false, "analysis_method": "instruct", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.896887+00:00", "updated_at": "2025-07-19T19:56:11.896887+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 216, "benchmark_id": "mmlu-pro", "model_id": "mistral-small-24b-instruct-2501", "score": 0.663, "normalized_score": 0.663, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501", "verified_by_llmstats": false, "analysis_method": "5 shot COT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.510254+00:00", "updated_at": "2025-07-19T19:56:11.510254+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1613, "benchmark_id": "mt-bench", "model_id": "mistral-small-24b-instruct-2501", "score": 0.835, "normalized_score": 0.835, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.537073+00:00", "updated_at": "2025-07-19T19:56:14.537073+00:00", "benchmark_name": "MT-Bench" }, { "model_benchmark_id": 1818, "benchmark_id": "wild-bench", "model_id": "mistral-small-24b-instruct-2501", "score": 0.522, "normalized_score": 0.522, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501", "verified_by_llmstats": false, 
"analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.128734+00:00", "updated_at": "2025-07-19T19:56:15.128734+00:00", "benchmark_name": "Wild Bench" } ] ================================================ FILE: data/organizations/mistral/models/mistral-small-24b-instruct-2501/model.json ================================================ { "model_id": "mistral-small-24b-instruct-2501", "name": "Mistral Small 3 24B Instruct", "organization_id": "mistral", "fine_tuned_from_model_id": null, "description": "Mistral Small 3 is a 24B-parameter LLM licensed under Apache-2.0. It focuses on low-latency, high-efficiency instruction following, maintaining performance comparable to larger models. It provides quick, accurate responses for conversational agents, function calling, and domain-specific fine-tuning. Suitable for local inference when quantized, it rivals models 2\u20133\u00d7 its size while using significantly fewer compute resources.", "release_date": "2025-01-30", "announcement_date": "2025-01-30", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": "2023-10-01", "param_count": 24000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.mistral.ai/api/", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://mistral.ai/news/mistral-small-3/", "source_repo_link": null, "source_weights_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501", "created_at": "2025-07-19T19:49:05.788628+00:00", "updated_at": "2025-07-19T19:49:05.788628+00:00", "model_family_id": null } ================================================ FILE: data/organizations/mistral/models/mistral-small-3.1-24b-base-2503/benchmarks.json ================================================ [ { "model_benchmark_id": 346, "benchmark_id": "gpqa", "model_id": 
"mistral-small-3.1-24b-base-2503", "score": 0.375, "normalized_score": 0.375, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.749533+00:00", "updated_at": "2025-07-19T19:56:11.749533+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 114, "benchmark_id": "mmlu", "model_id": "mistral-small-3.1-24b-base-2503", "score": 0.8101, "normalized_score": 0.8101, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.312907+00:00", "updated_at": "2025-07-19T19:56:11.312907+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 218, "benchmark_id": "mmlu-pro", "model_id": "mistral-small-3.1-24b-base-2503", "score": 0.5603, "normalized_score": 0.5603, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.513719+00:00", "updated_at": "2025-07-19T19:56:11.513719+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 587, "benchmark_id": "mmmu", "model_id": "mistral-small-3.1-24b-base-2503", "score": 0.5927, "normalized_score": 0.5927, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503", "verified_by_llmstats": false, "analysis_method": "CoT 
accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.207080+00:00", "updated_at": "2025-07-19T19:56:12.207080+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 255, "benchmark_id": "triviaqa", "model_id": "mistral-small-3.1-24b-base-2503", "score": 0.805, "normalized_score": 0.805, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.587622+00:00", "updated_at": "2025-07-19T19:56:11.587622+00:00", "benchmark_name": "TriviaQA" } ] ================================================ FILE: data/organizations/mistral/models/mistral-small-3.1-24b-base-2503/model.json ================================================ { "model_id": "mistral-small-3.1-24b-base-2503", "name": "Mistral Small 3.1 24B Base", "organization_id": "mistral", "fine_tuned_from_model_id": null, "description": "Pretrained base model version of Mistral Small 3.1. Features improved text performance, multimodal understanding, multilingual capabilities, and an expanded 128k token context window compared to Mistral Small 3. 
Designed for fine-tuning.", "release_date": "2025-03-17", "announcement_date": "2025-03-17", "license_id": "apache_2_0", "multimodal": true, "knowledge_cutoff": null, "param_count": 24000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://console.mistral.ai/", "source_paper": null, "source_scorecard_blog_link": "https://mistral.ai/news/mistral-small-3-1", "source_repo_link": null, "source_weights_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503", "created_at": "2025-07-19T19:49:05.793911+00:00", "updated_at": "2025-07-19T19:49:05.793911+00:00", "model_family_id": null } ================================================ FILE: data/organizations/mistral/models/mistral-small-3.1-24b-instruct-2503/benchmarks.json ================================================ [ { "model_benchmark_id": 340, "benchmark_id": "gpqa", "model_id": "mistral-small-3.1-24b-instruct-2503", "score": 0.4596, "normalized_score": 0.4596, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503", "verified_by_llmstats": false, "analysis_method": "Diamond, 5-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.740584+00:00", "updated_at": "2025-07-19T19:56:11.741944+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 805, "benchmark_id": "humaneval", "model_id": "mistral-small-3.1-24b-instruct-2503", "score": 0.8841, "normalized_score": 0.8841, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.677771+00:00", "updated_at": 
"2025-07-19T19:56:12.677771+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 421, "benchmark_id": "math", "model_id": "mistral-small-3.1-24b-instruct-2503", "score": 0.693, "normalized_score": 0.693, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.893255+00:00", "updated_at": "2025-07-19T19:56:11.893255+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1194, "benchmark_id": "mbpp", "model_id": "mistral-small-3.1-24b-instruct-2503", "score": 0.7471, "normalized_score": 0.7471, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.514872+00:00", "updated_at": "2025-07-19T19:56:13.514872+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 110, "benchmark_id": "mmlu", "model_id": "mistral-small-3.1-24b-instruct-2503", "score": 0.8062, "normalized_score": 0.8062, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.306426+00:00", "updated_at": "2025-07-19T19:56:11.306426+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 215, "benchmark_id": "mmlu-pro", "model_id": "mistral-small-3.1-24b-instruct-2503", "score": 0.6676, "normalized_score": 0.6676, "is_self_reported": true, 
"self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503", "verified_by_llmstats": false, "analysis_method": "5-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.508555+00:00", "updated_at": "2025-07-19T19:56:11.508555+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 585, "benchmark_id": "mmmu", "model_id": "mistral-small-3.1-24b-instruct-2503", "score": 0.5927, "normalized_score": 0.5927, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503", "verified_by_llmstats": false, "analysis_method": "CoT accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.203401+00:00", "updated_at": "2025-07-19T19:56:12.203401+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 237, "benchmark_id": "simpleqa", "model_id": "mistral-small-3.1-24b-instruct-2503", "score": 0.1043, "normalized_score": 0.1043, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503", "verified_by_llmstats": false, "analysis_method": "TotalAcc, Correct", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.552923+00:00", "updated_at": "2025-07-19T19:56:11.552923+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 251, "benchmark_id": "triviaqa", "model_id": "mistral-small-3.1-24b-instruct-2503", "score": 0.805, "normalized_score": 0.805, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.579482+00:00", "updated_at": "2025-07-19T19:56:11.579482+00:00", "benchmark_name": "TriviaQA" } ] ================================================ FILE: data/organizations/mistral/models/mistral-small-3.1-24b-instruct-2503/model.json ================================================ { "model_id": "mistral-small-3.1-24b-instruct-2503", "name": "Mistral Small 3.1 24B Instruct", "organization_id": "mistral", "fine_tuned_from_model_id": null, "description": "Building upon Mistral Small 3 (2501), Mistral Small 3.1 (2503) adds state-of-the-art vision understanding and enhances long context capabilities up to 128k tokens without compromising text performance. With 24 billion parameters, this model achieves top-tier capabilities in both text and vision tasks.", "release_date": "2025-03-17", "announcement_date": "2025-03-17", "license_id": "apache_2_0", "multimodal": true, "knowledge_cutoff": null, "param_count": 24000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://console.mistral.ai/", "source_paper": null, "source_scorecard_blog_link": "https://mistral.ai/news/mistral-small-3-1", "source_repo_link": null, "source_weights_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503", "created_at": "2025-07-19T19:49:05.770816+00:00", "updated_at": "2025-07-19T19:49:05.770816+00:00", "model_family_id": null } ================================================ FILE: data/organizations/mistral/models/mistral-small-3.2-24b-instruct-2506/benchmarks.json ================================================ [ { "model_benchmark_id": 16767, "benchmark_id": "ai2d", "model_id": "mistral-small-3.2-24b-instruct-2506", "score": 0.9291, "normalized_score": 0.9291, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", 
"verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:15.105841+00:00", "updated_at": "2025-08-03T22:06:15.105841+00:00", "benchmark_name": "AI2D" }, { "model_benchmark_id": 16768, "benchmark_id": "arena-hard", "model_id": "mistral-small-3.2-24b-instruct-2506", "score": 0.431, "normalized_score": 0.431, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", "verified_by_llmstats": false, "analysis_method": "v2", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:15.107885+00:00", "updated_at": "2025-08-03T22:06:15.107885+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 16769, "benchmark_id": "chartqa", "model_id": "mistral-small-3.2-24b-instruct-2506", "score": 0.874, "normalized_score": 0.874, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:15.109760+00:00", "updated_at": "2025-08-03T22:06:15.109760+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 16770, "benchmark_id": "docvqa", "model_id": "mistral-small-3.2-24b-instruct-2506", "score": 0.9486, "normalized_score": 0.9486, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:15.111977+00:00", "updated_at": 
"2025-08-03T22:06:15.111977+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 16771, "benchmark_id": "gpqa", "model_id": "mistral-small-3.2-24b-instruct-2506", "score": 0.4422, "normalized_score": 0.4422, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", "verified_by_llmstats": false, "analysis_method": "5-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:15.113518+00:00", "updated_at": "2025-08-03T22:06:15.113518+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 16772, "benchmark_id": "gpqa", "model_id": "mistral-small-3.2-24b-instruct-2506", "score": 0.4613, "normalized_score": 0.4613, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", "verified_by_llmstats": false, "analysis_method": "5-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:15.115179+00:00", "updated_at": "2025-08-03T22:06:15.115179+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 16773, "benchmark_id": "humaneval-plus", "model_id": "mistral-small-3.2-24b-instruct-2506", "score": 0.929, "normalized_score": 0.929, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", "verified_by_llmstats": false, "analysis_method": "Pass@5", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:15.116763+00:00", "updated_at": "2025-08-03T22:06:15.116763+00:00", "benchmark_name": "HumanEval Plus" }, { "model_benchmark_id": 16774, "benchmark_id": "if", "model_id": "mistral-small-3.2-24b-instruct-2506", "score": 0.8478, "normalized_score": 0.8478, 
"is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:15.118250+00:00", "updated_at": "2025-08-03T22:06:15.118250+00:00", "benchmark_name": "IF" }, { "model_benchmark_id": 16775, "benchmark_id": "math", "model_id": "mistral-small-3.2-24b-instruct-2506", "score": 0.6942, "normalized_score": 0.6942, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:15.119723+00:00", "updated_at": "2025-08-03T22:06:15.119723+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 16776, "benchmark_id": "mathvista", "model_id": "mistral-small-3.2-24b-instruct-2506", "score": 0.6709, "normalized_score": 0.6709, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:15.121246+00:00", "updated_at": "2025-08-03T22:06:15.121246+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 16777, "benchmark_id": "mbpp-plus", "model_id": "mistral-small-3.2-24b-instruct-2506", "score": 0.7833, "normalized_score": 0.7833, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", "verified_by_llmstats": false, "analysis_method": "Pass@5", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:15.122828+00:00", "updated_at": "2025-08-03T22:06:15.122828+00:00", "benchmark_name": "MBPP Plus" }, { "model_benchmark_id": 16778, "benchmark_id": "mmlu", "model_id": "mistral-small-3.2-24b-instruct-2506", "score": 0.805, "normalized_score": 0.805, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:15.124220+00:00", "updated_at": "2025-08-03T22:06:15.124220+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 16779, "benchmark_id": "mmlu-pro", "model_id": "mistral-small-3.2-24b-instruct-2506", "score": 0.6906, "normalized_score": 0.6906, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", "verified_by_llmstats": false, "analysis_method": "5-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:15.125972+00:00", "updated_at": "2025-08-03T22:06:15.125972+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 16780, "benchmark_id": "mmmu", "model_id": "mistral-small-3.2-24b-instruct-2506", "score": 0.625, "normalized_score": 0.625, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", "verified_by_llmstats": false, "analysis_method": "-", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:15.127425+00:00", "updated_at": "2025-08-03T22:06:15.127425+00:00", "benchmark_name": "MMMU" }, { 
"model_benchmark_id": 16781, "benchmark_id": "simpleqa", "model_id": "mistral-small-3.2-24b-instruct-2506", "score": 0.121, "normalized_score": 0.121, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", "verified_by_llmstats": false, "analysis_method": "TotalAcc", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:15.129114+00:00", "updated_at": "2025-08-03T22:06:15.129114+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 16782, "benchmark_id": "wild-bench", "model_id": "mistral-small-3.2-24b-instruct-2506", "score": 0.6533, "normalized_score": 0.6533, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", "verified_by_llmstats": false, "analysis_method": "v2", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:15.130665+00:00", "updated_at": "2025-08-03T22:06:15.130665+00:00", "benchmark_name": "Wild Bench" } ] ================================================ FILE: data/organizations/mistral/models/mistral-small-3.2-24b-instruct-2506/model.json ================================================ { "model_id": "mistral-small-3.2-24b-instruct-2506", "name": "Mistral Small 3.2 24B Instruct", "organization_id": "mistral", "fine_tuned_from_model_id": "mistral-small-3.1-24b-base-2503", "description": "Mistral-Small-3.2-24B-Instruct-2506 is a minor update of Mistral-Small-3.1-24B-Instruct-2503.", "release_date": "2025-06-20", "announcement_date": "2025-06-20", "license_id": "apache_2_0", "multimodal": true, "knowledge_cutoff": "2023-10-01", "param_count": 23600000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://console.mistral.ai/", "source_paper": 
null, "source_scorecard_blog_link": null, "source_repo_link": null, "source_weights_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506", "created_at": "2025-08-03T22:06:11.933573+00:00", "updated_at": "2025-08-03T22:06:11.933573+00:00", "model_family_id": null } ================================================ FILE: data/organizations/mistral/models/pixtral-12b-2409/benchmarks.json ================================================ [ { "model_benchmark_id": 874, "benchmark_id": "chartqa", "model_id": "pixtral-12b-2409", "score": 0.818, "normalized_score": 0.818, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/", "verified_by_llmstats": false, "analysis_method": "Chain of Thought (CoT)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.822444+00:00", "updated_at": "2025-07-19T19:56:12.822444+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 899, "benchmark_id": "docvqa", "model_id": "pixtral-12b-2409", "score": 0.907, "normalized_score": 0.907, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/", "verified_by_llmstats": false, "analysis_method": "ANLS", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.871485+00:00", "updated_at": "2025-07-19T19:56:12.871485+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 808, "benchmark_id": "humaneval", "model_id": "pixtral-12b-2409", "score": 0.72, "normalized_score": 0.72, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:12.684555+00:00", "updated_at": "2025-07-19T19:56:12.684555+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 631, "benchmark_id": "ifeval", "model_id": "pixtral-12b-2409", "score": 0.613, "normalized_score": 0.613, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/", "verified_by_llmstats": false, "analysis_method": "Text Instruction Following Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.297384+00:00", "updated_at": "2025-07-19T19:56:12.297384+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 425, "benchmark_id": "math", "model_id": "pixtral-12b-2409", "score": 0.481, "normalized_score": 0.481, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.900275+00:00", "updated_at": "2025-07-19T19:56:11.900275+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 537, "benchmark_id": "mathvista", "model_id": "pixtral-12b-2409", "score": 0.58, "normalized_score": 0.58, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/", "verified_by_llmstats": false, "analysis_method": "Chain of Thought (CoT)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.111272+00:00", "updated_at": "2025-07-19T19:56:12.111272+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1822, "benchmark_id": "mm-if-eval", "model_id": "pixtral-12b-2409", "score": 0.527, "normalized_score": 0.527, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/", 
"verified_by_llmstats": false, "analysis_method": "Multimodal Instruction Following Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.145578+00:00", "updated_at": "2025-07-19T19:56:15.145578+00:00", "benchmark_name": "MM IF-Eval" }, { "model_benchmark_id": 115, "benchmark_id": "mmlu", "model_id": "pixtral-12b-2409", "score": 0.692, "normalized_score": 0.692, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.314507+00:00", "updated_at": "2025-07-19T19:56:11.314507+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1733, "benchmark_id": "mm-mt-bench", "model_id": "pixtral-12b-2409", "score": 0.605, "normalized_score": 0.605, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/", "verified_by_llmstats": false, "analysis_method": "Multimodal MT-Bench Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.887276+00:00", "updated_at": "2025-07-19T19:56:14.887276+00:00", "benchmark_name": "MM-MT-Bench" }, { "model_benchmark_id": 588, "benchmark_id": "mmmu", "model_id": "pixtral-12b-2409", "score": 0.525, "normalized_score": 0.525, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/", "verified_by_llmstats": false, "analysis_method": "Chain of Thought (CoT)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.209409+00:00", "updated_at": "2025-07-19T19:56:12.209409+00:00", "benchmark_name": "MMMU" }, { 
"model_benchmark_id": 1614, "benchmark_id": "mt-bench", "model_id": "pixtral-12b-2409", "score": 0.768, "normalized_score": 0.768, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/", "verified_by_llmstats": false, "analysis_method": "Text MT-Bench Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.539185+00:00", "updated_at": "2025-07-19T19:56:14.539185+00:00", "benchmark_name": "MT-Bench" }, { "model_benchmark_id": 1575, "benchmark_id": "vqav2", "model_id": "pixtral-12b-2409", "score": 0.786, "normalized_score": 0.786, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/", "verified_by_llmstats": false, "analysis_method": "VQA Match", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.416120+00:00", "updated_at": "2025-07-19T19:56:14.416120+00:00", "benchmark_name": "VQAv2" } ] ================================================ FILE: data/organizations/mistral/models/pixtral-12b-2409/model.json ================================================ { "model_id": "pixtral-12b-2409", "name": "Pixtral-12B", "organization_id": "mistral", "fine_tuned_from_model_id": null, "description": "A 12B parameter multimodal model with a 400M parameter vision encoder, capable of understanding both natural images and documents. Excels at multimodal tasks while maintaining strong text-only performance. 
Supports variable image sizes and multiple images in context.", "release_date": "2024-09-17", "announcement_date": "2024-09-17", "license_id": "apache_2_0", "multimodal": true, "knowledge_cutoff": null, "param_count": 12400000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.mistral.ai/platform/endpoints/", "source_playground": "https://chat.mistral.ai", "source_paper": null, "source_scorecard_blog_link": "https://mistral.ai/news/pixtral-12b/", "source_repo_link": "https://huggingface.co/mistralai/Pixtral-12B-2409", "source_weights_link": "https://huggingface.co/mistralai/Pixtral-12B-2409", "created_at": "2025-07-19T19:49:05.802013+00:00", "updated_at": "2025-07-19T19:49:05.802013+00:00", "model_family_id": null } ================================================ FILE: data/organizations/mistral/models/pixtral-large/benchmarks.json ================================================ [ { "model_benchmark_id": 1261, "benchmark_id": "ai2d", "model_id": "pixtral-large", "score": 0.938, "normalized_score": 0.938, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-large/", "verified_by_llmstats": false, "analysis_method": "BBox", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.645378+00:00", "updated_at": "2025-07-19T19:56:13.645378+00:00", "benchmark_name": "AI2D" }, { "model_benchmark_id": 873, "benchmark_id": "chartqa", "model_id": "pixtral-large", "score": 0.881, "normalized_score": 0.881, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-large/", "verified_by_llmstats": false, "analysis_method": "CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.820802+00:00", "updated_at": "2025-07-19T19:56:12.820802+00:00", "benchmark_name": "ChartQA" }, 
{ "model_benchmark_id": 898, "benchmark_id": "docvqa", "model_id": "pixtral-large", "score": 0.933, "normalized_score": 0.933, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-large/", "verified_by_llmstats": false, "analysis_method": "ANLS", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.869454+00:00", "updated_at": "2025-07-19T19:56:12.869454+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 536, "benchmark_id": "mathvista", "model_id": "pixtral-large", "score": 0.694, "normalized_score": 0.694, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-large/", "verified_by_llmstats": false, "analysis_method": "CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.109764+00:00", "updated_at": "2025-07-19T19:56:12.109764+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1732, "benchmark_id": "mm-mt-bench", "model_id": "pixtral-large", "score": 0.74, "normalized_score": 0.74, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-large/", "verified_by_llmstats": false, "analysis_method": "GPT-4o Judge", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.885715+00:00", "updated_at": "2025-07-19T19:56:14.885715+00:00", "benchmark_name": "MM-MT-Bench" }, { "model_benchmark_id": 586, "benchmark_id": "mmmu", "model_id": "pixtral-large", "score": 0.64, "normalized_score": 0.64, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-large/", "verified_by_llmstats": false, "analysis_method": "CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:12.205240+00:00", "updated_at": "2025-07-19T19:56:12.205240+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1574, "benchmark_id": "vqav2", "model_id": "pixtral-large", "score": 0.809, "normalized_score": 0.809, "is_self_reported": true, "self_reported_source_link": "https://mistral.ai/news/pixtral-large/", "verified_by_llmstats": false, "analysis_method": "VQA Match", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.414450+00:00", "updated_at": "2025-07-19T19:56:14.414450+00:00", "benchmark_name": "VQAv2" } ] ================================================ FILE: data/organizations/mistral/models/pixtral-large/model.json ================================================ { "model_id": "pixtral-large", "name": "Pixtral Large", "organization_id": "mistral", "fine_tuned_from_model_id": "mistral-large-2-2407", "description": "A 124B parameter multimodal model built on top of Mistral Large 2, featuring frontier-level image understanding capabilities. Excels at understanding documents, charts, and natural images while maintaining strong text-only performance. 
Features a 123B multimodal decoder and 1B parameter vision encoder with a 128K context window supporting up to 30 high-resolution images.", "release_date": "2024-11-18", "announcement_date": "2024-11-18", "license_id": "mistral_research_license_(mrl)_for_research;_mistral_commercial_license_for_commercial_use", "multimodal": true, "knowledge_cutoff": null, "param_count": 124000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://mistral.ai/", "source_playground": "https://chat.mistral.ai/", "source_paper": null, "source_scorecard_blog_link": "https://mistral.ai/news/pixtral-large/", "source_repo_link": null, "source_weights_link": "https://huggingface.co/mistralai/Pixtral-Large-Instruct-2411", "created_at": "2025-07-19T19:49:05.913427+00:00", "updated_at": "2025-07-19T19:49:05.913427+00:00", "model_family_id": null } ================================================ FILE: data/organizations/mistral/organization.json ================================================ { "organization_id": "mistral", "name": "Mistral AI", "website": "https://mistral.ai", "description": "French AI company", "country": "FR", "created_at": "2025-07-19T19:49:05.769198+00:00", "updated_at": "2025-07-19T19:49:05.769198+00:00" } ================================================ FILE: data/organizations/moonshotai/models/kimi-k1.5/benchmarks.json ================================================ [ { "model_benchmark_id": 444, "benchmark_id": "aime-2024", "model_id": "kimi-k1.5", "score": 0.775, "normalized_score": 0.775, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.945090+00:00", "updated_at": "2025-07-19T19:56:11.945090+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 435, 
"benchmark_id": "c-eval", "model_id": "kimi-k1.5", "score": 0.883, "normalized_score": 0.883, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5", "verified_by_llmstats": false, "analysis_method": "Exact Match", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.922484+00:00", "updated_at": "2025-07-19T19:56:11.922484+00:00", "benchmark_name": "C-Eval" }, { "model_benchmark_id": 599, "benchmark_id": "cluewsc", "model_id": "kimi-k1.5", "score": 0.914, "normalized_score": 0.914, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5", "verified_by_llmstats": false, "analysis_method": "Exact Match", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.236097+00:00", "updated_at": "2025-07-19T19:56:12.236097+00:00", "benchmark_name": "CLUEWSC" }, { "model_benchmark_id": 602, "benchmark_id": "ifeval", "model_id": "kimi-k1.5", "score": 0.872, "normalized_score": 0.872, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5", "verified_by_llmstats": false, "analysis_method": "Exact Match", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.244895+00:00", "updated_at": "2025-07-19T19:56:12.244895+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 514, "benchmark_id": "livecodebench-v5-24.12-25.2", "model_id": "kimi-k1.5", "score": 0.625, "normalized_score": 0.625, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": 
null, "created_at": "2025-07-19T19:56:12.068737+00:00", "updated_at": "2025-07-19T19:56:12.068737+00:00", "benchmark_name": "LiveCodeBench v5 24.12-25.2" }, { "model_benchmark_id": 492, "benchmark_id": "math-500", "model_id": "kimi-k1.5", "score": 0.962, "normalized_score": 0.962, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5", "verified_by_llmstats": false, "analysis_method": "Exact Match", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.029931+00:00", "updated_at": "2025-07-19T19:56:12.029931+00:00", "benchmark_name": "MATH-500" }, { "model_benchmark_id": 515, "benchmark_id": "mathvista", "model_id": "kimi-k1.5", "score": 0.749, "normalized_score": 0.749, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.071814+00:00", "updated_at": "2025-07-19T19:56:12.071814+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 58, "benchmark_id": "mmlu", "model_id": "kimi-k1.5", "score": 0.874, "normalized_score": 0.874, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5", "verified_by_llmstats": false, "analysis_method": "Exact Match", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.207582+00:00", "updated_at": "2025-07-19T19:56:11.207582+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 549, "benchmark_id": "mmmu", "model_id": "kimi-k1.5", "score": 0.7, "normalized_score": 0.7, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5", 
"verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.132422+00:00", "updated_at": "2025-07-19T19:56:12.132422+00:00", "benchmark_name": "MMMU" } ] ================================================ FILE: data/organizations/moonshotai/models/kimi-k1.5/model.json ================================================ { "model_id": "kimi-k1.5", "name": "Kimi-k1.5", "organization_id": "moonshotai", "fine_tuned_from_model_id": null, "description": "Kimi 1.5 is a next-generation multimodal large language model developed by Moonshot AI. It incorporates advanced reinforcement learning (RL) and scalable multimodal reasoning, delivering state-of-the-art performance in math, code, vision, and long-context reasoning tasks.", "release_date": "2025-01-20", "announcement_date": "2025-01-20", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.moonshot.cn/docs/api-reference", "source_playground": "https://kimi.ai/", "source_paper": "https://arxiv.org/abs/2501.12599", "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/MoonshotAI/Kimi-k1.5", "source_weights_link": null, "created_at": "2025-07-19T19:49:05.426406+00:00", "updated_at": "2025-07-19T19:49:05.426406+00:00", "model_family_id": null } ================================================ FILE: data/organizations/moonshotai/models/kimi-k2-0905/benchmarks.json ================================================ [ { "model_benchmark_id": 9001, "benchmark_id": "gpqa", "model_id": "kimi-k2-0905", "score": 0.758, "normalized_score": 0.758, "is_self_reported": true, "self_reported_source_link": "https://moonshot.cn/blog/kimi-k2-0905", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2024-09-05T00:00:00.000000+00:00", "updated_at": "2024-09-15T00:00:00.000000+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 9002, "benchmark_id": "mmlu", "model_id": "kimi-k2-0905", "score": 0.902, "normalized_score": 0.902, "is_self_reported": true, "self_reported_source_link": "https://moonshot.cn/blog/kimi-k2-0905", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2024-09-05T00:00:00.000000+00:00", "updated_at": "2024-09-15T00:00:00.000000+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 9003, "benchmark_id": "math", "model_id": "kimi-k2-0905", "score": 0.891, "normalized_score": 0.891, "is_self_reported": true, "self_reported_source_link": "https://moonshot.cn/blog/kimi-k2-0905", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2024-09-05T00:00:00.000000+00:00", "updated_at": "2024-09-15T00:00:00.000000+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 9004, "benchmark_id": "humaneval", "model_id": "kimi-k2-0905", "score": 0.945, "normalized_score": 0.945, "is_self_reported": true, "self_reported_source_link": "https://moonshot.cn/blog/kimi-k2-0905", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2024-09-05T00:00:00.000000+00:00", "updated_at": "2024-09-15T00:00:00.000000+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 9005, "benchmark_id": "mmlu-pro", "model_id": "kimi-k2-0905", "score": 0.825, "normalized_score": 0.825, "is_self_reported": true, "self_reported_source_link": 
"https://moonshot.cn/blog/kimi-k2-0905", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2024-09-05T00:00:00.000000+00:00", "updated_at": "2024-09-15T00:00:00.000000+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 9006, "benchmark_id": "aime-2024", "model_id": "kimi-k2-0905", "score": 0.72, "normalized_score": 0.72, "is_self_reported": true, "self_reported_source_link": "https://moonshot.cn/blog/kimi-k2-0905", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2024-09-05T00:00:00.000000+00:00", "updated_at": "2024-09-15T00:00:00.000000+00:00", "benchmark_name": "AIME 2024" } ] ================================================ FILE: data/organizations/moonshotai/models/kimi-k2-0905/model.json ================================================ { "model_id": "kimi-k2-0905", "name": "Kimi K2 0905", "organization_id": "moonshotai", "fine_tuned_from_model_id": "kimi-k2-instruct", "description": "Kimi K2 0905 is the September update of Kimi K2 0711. It is a large-scale Mixture-of-Experts (MoE) language model developed by Moonshot AI, featuring 1 trillion total parameters with 32 billion active per forward pass. It supports long-context inference up to 256k tokens, extended from the previous 128k. This update improves agentic coding with higher accuracy and better generalization across scaffolds, and enhances frontend coding with more aesthetic and functional outputs for web, 3D, and related tasks. 
The model is trained with a novel stack incorporating the MuonClip optimizer for stable large-scale MoE training.", "release_date": "2025-09-05", "announcement_date": "2025-09-05", "license_id": "proprietary", "multimodal": false, "knowledge_cutoff": null, "param_count": 1000000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.moonshot.cn/", "source_playground": "https://kimi.moonshot.cn/", "source_paper": null, "source_scorecard_blog_link": "https://moonshot.cn/blog/kimi-k2-0905", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-09-14T00:00:00.000000+00:00", "updated_at": "2025-09-14T00:00:00.000000+00:00", "model_family_id": null } ================================================ FILE: data/organizations/moonshotai/models/kimi-k2-base/benchmarks.json ================================================ [ { "model_benchmark_id": 434, "benchmark_id": "c-eval", "model_id": "kimi-k2-base", "score": 0.925, "normalized_score": 0.925, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.920573+00:00", "updated_at": "2025-07-19T19:56:11.920573+00:00", "benchmark_name": "C-Eval" }, { "model_benchmark_id": 440, "benchmark_id": "csimpleqa", "model_id": "kimi-k2-base", "score": 0.776, "normalized_score": 0.776, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2", "verified_by_llmstats": false, "analysis_method": "Correct", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.934566+00:00", "updated_at": "2025-07-19T19:56:11.934566+00:00", "benchmark_name": "CSimpleQA" }, { "model_benchmark_id": 369, "benchmark_id": 
"evalplus", "model_id": "kimi-k2-base", "score": 0.803, "normalized_score": 0.803, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.796250+00:00", "updated_at": "2025-07-19T19:56:11.796250+00:00", "benchmark_name": "EvalPlus" }, { "model_benchmark_id": 256, "benchmark_id": "gpqa", "model_id": "kimi-k2-base", "score": 0.481, "normalized_score": 0.481, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2", "verified_by_llmstats": false, "analysis_method": "Diamond Avg@8", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.591508+00:00", "updated_at": "2025-07-19T19:56:11.591508+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 158, "benchmark_id": "gsm8k", "model_id": "kimi-k2-base", "score": 0.921, "normalized_score": 0.921, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.403308+00:00", "updated_at": "2025-07-19T19:56:11.403308+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 367, "benchmark_id": "livecodebench-v6", "model_id": "kimi-k2-base", "score": 0.263, "normalized_score": 0.263, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:11.789592+00:00", "updated_at": "2025-07-19T19:56:11.789592+00:00", "benchmark_name": "LiveCodeBench v6" }, { "model_benchmark_id": 373, "benchmark_id": "math", "model_id": "kimi-k2-base", "score": 0.702, "normalized_score": 0.702, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.808795+00:00", "updated_at": "2025-07-19T19:56:11.808795+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 57, "benchmark_id": "mmlu", "model_id": "kimi-k2-base", "score": 0.878, "normalized_score": 0.878, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.205746+00:00", "updated_at": "2025-07-19T19:56:11.205746+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 161, "benchmark_id": "mmlu-pro", "model_id": "kimi-k2-base", "score": 0.692, "normalized_score": 0.692, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.410852+00:00", "updated_at": "2025-07-19T19:56:11.410852+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 221, "benchmark_id": "mmlu-redux-2.0", "model_id": "kimi-k2-base", "score": 0.902, "normalized_score": 0.902, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2", "verified_by_llmstats": false, "analysis_method": "EM", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.520883+00:00", "updated_at": "2025-07-19T19:56:11.520883+00:00", "benchmark_name": "MMLU-redux-2.0" }, { "model_benchmark_id": 222, "benchmark_id": "simpleqa", "model_id": "kimi-k2-base", "score": 0.353, "normalized_score": 0.353, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2", "verified_by_llmstats": false, "analysis_method": "Correct", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.524097+00:00", "updated_at": "2025-07-19T19:56:11.524097+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 364, "benchmark_id": "supergpqa", "model_id": "kimi-k2-base", "score": 0.447, "normalized_score": 0.447, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.781413+00:00", "updated_at": "2025-07-19T19:56:11.781413+00:00", "benchmark_name": "SuperGPQA" }, { "model_benchmark_id": 243, "benchmark_id": "triviaqa", "model_id": "kimi-k2-base", "score": 0.851, "normalized_score": 0.851, "is_self_reported": true, "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.566226+00:00", "updated_at": "2025-07-19T19:56:11.566226+00:00", "benchmark_name": "TriviaQA" } ] ================================================ FILE: data/organizations/moonshotai/models/kimi-k2-base/model.json 
================================================ { "model_id": "kimi-k2-base", "name": "Kimi K2 Base", "organization_id": "moonshotai", "fine_tuned_from_model_id": null, "description": "Kimi K2 base model is a state-of-the-art mixture-of-experts (MoE) language model with 32 billion activated parameters and 1 trillion total parameters. Trained on 15.5 trillion tokens with the MuonClip optimizer, this is the foundation model before instruction tuning. It demonstrates strong performance on knowledge, reasoning, and coding benchmarks while being optimized for agentic capabilities.", "release_date": "2025-07-11", "announcement_date": "2025-07-11", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 1000000000000, "training_tokens": 15500000000000, "available_in_zeroeval": true, "source_api_ref": "https://platform.moonshot.ai", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://moonshotai.github.io/Kimi-K2/", "source_repo_link": "https://github.com/MoonshotAI/Kimi-K2", "source_weights_link": "https://huggingface.co/moonshotai/Kimi-K2-Base", "created_at": "2025-07-19T19:49:05.422399+00:00", "updated_at": "2025-07-19T19:49:05.422399+00:00", "model_family_id": null } ================================================ FILE: data/organizations/moonshotai/models/kimi-k2-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 676, "benchmark_id": "acebench", "model_id": "kimi-k2-instruct", "score": 0.765, "normalized_score": 0.765, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.408910+00:00", "updated_at": "2025-07-19T19:56:12.408910+00:00", "benchmark_name": "AceBench" }, { "model_benchmark_id": 657, 
"benchmark_id": "aider-polyglot", "model_id": "kimi-k2-instruct", "score": 0.6, "normalized_score": 0.6, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.362819+00:00", "updated_at": "2025-07-19T19:56:12.362819+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 445, "benchmark_id": "aime-2024", "model_id": "kimi-k2-instruct", "score": 0.696, "normalized_score": 0.696, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Avg@64", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.946639+00:00", "updated_at": "2025-07-19T19:56:11.946639+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 677, "benchmark_id": "aime-2025", "model_id": "kimi-k2-instruct", "score": 0.495, "normalized_score": 0.495, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Avg@64", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.412395+00:00", "updated_at": "2025-07-19T19:56:12.412395+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 715, "benchmark_id": "autologi", "model_id": "kimi-k2-instruct", "score": 0.895, "normalized_score": 0.895, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:12.506457+00:00", "updated_at": "2025-07-19T19:56:12.506457+00:00", "benchmark_name": "AutoLogi" }, { "model_benchmark_id": 757, "benchmark_id": "cbnsl", "model_id": "kimi-k2-instruct", "score": 0.956, "normalized_score": 0.956, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.594017+00:00", "updated_at": "2025-07-19T19:56:12.594017+00:00", "benchmark_name": "CBNSL" }, { "model_benchmark_id": 709, "benchmark_id": "cnmo-2024", "model_id": "kimi-k2-instruct", "score": 0.743, "normalized_score": 0.743, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Avg@16", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.489469+00:00", "updated_at": "2025-07-19T19:56:12.489469+00:00", "benchmark_name": "CNMO 2024" }, { "model_benchmark_id": 441, "benchmark_id": "csimpleqa", "model_id": "kimi-k2-instruct", "score": 0.784, "normalized_score": 0.784, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Correct", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.936097+00:00", "updated_at": "2025-07-19T19:56:11.936097+00:00", "benchmark_name": "CSimpleQA" }, { "model_benchmark_id": 257, "benchmark_id": "gpqa", "model_id": "kimi-k2-instruct", "score": 0.751, "normalized_score": 0.751, "is_self_reported": true, "self_reported_source_link": 
"https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Diamond Avg@8", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.593256+00:00", "updated_at": "2025-07-19T19:56:11.593256+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 159, "benchmark_id": "gsm8k", "model_id": "kimi-k2-instruct", "score": 0.973, "normalized_score": 0.973, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.405113+00:00", "updated_at": "2025-07-19T19:56:11.405113+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 707, "benchmark_id": "hmmt-2025", "model_id": "kimi-k2-instruct", "score": 0.388, "normalized_score": 0.388, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Avg@32", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.482540+00:00", "updated_at": "2025-07-19T19:56:12.482540+00:00", "benchmark_name": "HMMT 2025" }, { "model_benchmark_id": 758, "benchmark_id": "humaneval", "model_id": "kimi-k2-instruct", "score": 0.933, "normalized_score": 0.933, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.598519+00:00", "updated_at": "2025-07-19T19:56:12.598519+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 
819, "benchmark_id": "humaneval-er", "model_id": "kimi-k2-instruct", "score": 0.811, "normalized_score": 0.811, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.707650+00:00", "updated_at": "2025-07-19T19:56:12.707650+00:00", "benchmark_name": "HumanEval-ER" }, { "model_benchmark_id": 716, "benchmark_id": "humanity's-last-exam", "model_id": "kimi-k2-instruct", "score": 0.047, "normalized_score": 0.047, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy (Text Only)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.510122+00:00", "updated_at": "2025-07-19T19:56:12.510122+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 603, "benchmark_id": "ifeval", "model_id": "kimi-k2-instruct", "score": 0.898, "normalized_score": 0.898, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Prompt Strict", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.247003+00:00", "updated_at": "2025-07-19T19:56:12.247003+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 745, "benchmark_id": "livebench", "model_id": "kimi-k2-instruct", "score": 0.764, "normalized_score": 0.764, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.567525+00:00", "updated_at": "2025-07-19T19:56:12.567525+00:00", "benchmark_name": "LiveBench" }, { "model_benchmark_id": 368, "benchmark_id": "livecodebench-v6", "model_id": "kimi-k2-instruct", "score": 0.537, "normalized_score": 0.537, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.791826+00:00", "updated_at": "2025-07-19T19:56:11.791826+00:00", "benchmark_name": "LiveCodeBench v6" }, { "model_benchmark_id": 493, "benchmark_id": "math-500", "model_id": "kimi-k2-instruct", "score": 0.974, "normalized_score": 0.974, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.031465+00:00", "updated_at": "2025-07-19T19:56:12.031465+00:00", "benchmark_name": "MATH-500" }, { "model_benchmark_id": 59, "benchmark_id": "mmlu", "model_id": "kimi-k2-instruct", "score": 0.895, "normalized_score": 0.895, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.209924+00:00", "updated_at": "2025-07-19T19:56:11.209924+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 162, "benchmark_id": "mmlu-pro", "model_id": "kimi-k2-instruct", "score": 0.811, "normalized_score": 0.811, "is_self_reported": true, "self_reported_source_link": 
"https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.412849+00:00", "updated_at": "2025-07-19T19:56:11.412849+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 727, "benchmark_id": "mmlu-redux", "model_id": "kimi-k2-instruct", "score": 0.927, "normalized_score": 0.927, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.531649+00:00", "updated_at": "2025-07-19T19:56:12.531649+00:00", "benchmark_name": "MMLU-Redux" }, { "model_benchmark_id": 739, "benchmark_id": "multichallenge", "model_id": "kimi-k2-instruct", "score": 0.541, "normalized_score": 0.541, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.554319+00:00", "updated_at": "2025-07-19T19:56:12.554319+00:00", "benchmark_name": "MultiChallenge" }, { "model_benchmark_id": 639, "benchmark_id": "multipl-e", "model_id": "kimi-k2-instruct", "score": 0.857, "normalized_score": 0.857, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.314432+00:00", "updated_at": "2025-07-19T19:56:12.314432+00:00", "benchmark_name": "MultiPL-E" }, { 
"model_benchmark_id": 820, "benchmark_id": "musr", "model_id": "kimi-k2-instruct", "score": 0.764, "normalized_score": 0.764, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.711252+00:00", "updated_at": "2025-07-19T19:56:12.711252+00:00", "benchmark_name": "MuSR" }, { "model_benchmark_id": 638, "benchmark_id": "ojbench", "model_id": "kimi-k2-instruct", "score": 0.271, "normalized_score": 0.271, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.310963+00:00", "updated_at": "2025-07-19T19:56:12.310963+00:00", "benchmark_name": "OJBench" }, { "model_benchmark_id": 713, "benchmark_id": "polymath-en", "model_id": "kimi-k2-instruct", "score": 0.651, "normalized_score": 0.651, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Avg@4", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.499339+00:00", "updated_at": "2025-07-19T19:56:12.499339+00:00", "benchmark_name": "PolyMath-en" }, { "model_benchmark_id": 223, "benchmark_id": "simpleqa", "model_id": "kimi-k2-instruct", "score": 0.31, "normalized_score": 0.31, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Correct", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:11.526736+00:00", "updated_at": "2025-07-19T19:56:11.526736+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 365, "benchmark_id": "supergpqa", "model_id": "kimi-k2-instruct", "score": 0.572, "normalized_score": 0.572, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.782850+00:00", "updated_at": "2025-07-19T19:56:11.782850+00:00", "benchmark_name": "SuperGPQA" }, { "model_benchmark_id": 651, "benchmark_id": "swe-bench-multilingual", "model_id": "kimi-k2-instruct", "score": 0.473, "normalized_score": 0.473, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Single Attempt", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.343981+00:00", "updated_at": "2025-07-19T19:56:12.343981+00:00", "benchmark_name": "SWE-bench Multilingual" }, { "model_benchmark_id": 649, "benchmark_id": "swe-bench-verified-(agentic-coding)", "model_id": "kimi-k2-instruct", "score": 0.658, "normalized_score": 0.658, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Single Attempt", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.333761+00:00", "updated_at": "2025-07-19T19:56:12.333761+00:00", "benchmark_name": "SWE-bench Verified (Agentic Coding)" }, { "model_benchmark_id": 648, "benchmark_id": "swe-bench-verified-(agentless)", "model_id": "kimi-k2-instruct", 
"score": 0.518, "normalized_score": 0.518, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Single Patch without Test", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.330548+00:00", "updated_at": "2025-07-19T19:56:12.330548+00:00", "benchmark_name": "SWE-bench Verified (Agentless)" }, { "model_benchmark_id": 650, "benchmark_id": "swe-bench-verified-(multiple-attempts)", "model_id": "kimi-k2-instruct", "score": 0.716, "normalized_score": 0.716, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Multiple Attempts with parallel test-time compute", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.339305+00:00", "updated_at": "2025-07-19T19:56:12.339305+00:00", "benchmark_name": "SWE-bench Verified (Multiple Attempts)" }, { "model_benchmark_id": 674, "benchmark_id": "tau2-airline", "model_id": "kimi-k2-instruct", "score": 0.565, "normalized_score": 0.565, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Avg@4", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.401229+00:00", "updated_at": "2025-07-19T19:56:12.401229+00:00", "benchmark_name": "Tau2 airline" }, { "model_benchmark_id": 673, "benchmark_id": "tau2-retail", "model_id": "kimi-k2-instruct", "score": 0.706, "normalized_score": 0.706, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Avg@4", "verification_provider_id": 
null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.395604+00:00", "updated_at": "2025-07-19T19:56:12.395604+00:00", "benchmark_name": "Tau2 retail" }, { "model_benchmark_id": 675, "benchmark_id": "tau2-telecom", "model_id": "kimi-k2-instruct", "score": 0.658, "normalized_score": 0.658, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Avg@4", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.405145+00:00", "updated_at": "2025-07-19T19:56:12.405145+00:00", "benchmark_name": "Tau2 telecom" }, { "model_benchmark_id": 652, "benchmark_id": "terminal-bench", "model_id": "kimi-k2-instruct", "score": 0.3, "normalized_score": 0.3, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Inhouse Framework", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.348003+00:00", "updated_at": "2025-07-19T19:56:12.348003+00:00", "benchmark_name": "Terminal-bench" }, { "model_benchmark_id": 656, "benchmark_id": "terminus", "model_id": "kimi-k2-instruct", "score": 0.25, "normalized_score": 0.25, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.358921+00:00", "updated_at": "2025-07-19T19:56:12.358921+00:00", "benchmark_name": "Terminus" }, { "model_benchmark_id": 714, "benchmark_id": "zebralogic", "model_id": "kimi-k2-instruct", "score": 0.89, "normalized_score": 
0.89, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.502879+00:00", "updated_at": "2025-07-19T19:56:12.502879+00:00", "benchmark_name": "ZebraLogic" } ] ================================================ FILE: data/organizations/moonshotai/models/kimi-k2-instruct/model.json ================================================ { "model_id": "kimi-k2-instruct", "name": "Kimi K2 Instruct", "organization_id": "moonshotai", "fine_tuned_from_model_id": "kimi-k2-base", "description": "Kimi K2 is a state-of-the-art mixture-of-experts (MoE) language model with 32 billion activated parameters and 1 trillion total parameters. Trained with the MuonClip optimizer, it achieves exceptional performance across frontier knowledge, reasoning, and coding tasks while being meticulously optimized for agentic capabilities. 
The instruct variant is post-trained for drop-in, general-purpose chat and agentic experiences without long thinking.", "release_date": "2025-07-11", "announcement_date": "2025-07-11", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 1000000000000, "training_tokens": 15500000000000, "available_in_zeroeval": true, "source_api_ref": "https://platform.moonshot.ai", "source_playground": "https://kimi.com", "source_paper": null, "source_scorecard_blog_link": "https://moonshotai.github.io/Kimi-K2/", "source_repo_link": "https://github.com/MoonshotAI/Kimi-K2", "source_weights_link": "https://huggingface.co/moonshotai/Kimi-K2-Instruct", "created_at": "2025-07-19T19:49:05.875884+00:00", "updated_at": "2025-07-19T19:49:05.875884+00:00", "model_family_id": null } ================================================ FILE: data/organizations/moonshotai/models/kimi-k2-instruct-0905/benchmarks.json ================================================ [ { "model_benchmark_id": 10001, "benchmark_id": "swe-bench-verified", "model_id": "kimi-k2-instruct-0905", "score": 0.658, "normalized_score": 0.658, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Agentic Coding - Single Attempt", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "65.8% single attempt, 71.6% multiple", "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Swe Bench Verified" }, { "model_benchmark_id": 10002, "benchmark_id": "swe-bench-multilingual", "model_id": "kimi-k2-instruct-0905", "score": 0.473, "normalized_score": 0.473, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Agentic Coding - Single Attempt", "verification_provider_id": null, "verification_hardware": 
null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Swe Bench Multilingual" }, { "model_benchmark_id": 10003, "benchmark_id": "terminal-bench", "model_id": "kimi-k2-instruct-0905", "score": 0.25, "normalized_score": 0.25, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Terminus", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Terminal Bench" }, { "model_benchmark_id": 10004, "benchmark_id": "livecodebench", "model_id": "kimi-k2-instruct-0905", "score": 0.537, "normalized_score": 0.537, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "v6 (Aug 24-May 25) Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Livecodebench" }, { "model_benchmark_id": 10005, "benchmark_id": "ojbench", "model_id": "kimi-k2-instruct-0905", "score": 0.271, "normalized_score": 0.271, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Ojbench" }, { "model_benchmark_id": 10006, "benchmark_id": "multipl-e", "model_id": "kimi-k2-instruct-0905", "score": 0.857, 
"normalized_score": 0.857, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Multiple" }, { "model_benchmark_id": 10007, "benchmark_id": "aider-polyglot", "model_id": "kimi-k2-instruct-0905", "score": 0.6, "normalized_score": 0.6, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Aider Polyglot" }, { "model_benchmark_id": 10008, "benchmark_id": "tau2-retail", "model_id": "kimi-k2-instruct-0905", "score": 0.706, "normalized_score": 0.706, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Avg@4", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Tau2 Retail" }, { "model_benchmark_id": 10009, "benchmark_id": "tau2-airline", "model_id": "kimi-k2-instruct-0905", "score": 0.565, "normalized_score": 0.565, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Avg@4", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Tau2 Airline" }, { "model_benchmark_id": 10010, "benchmark_id": "tau2-telecom", "model_id": "kimi-k2-instruct-0905", "score": 0.658, "normalized_score": 0.658, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Avg@4", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Tau2 Telecom" }, { "model_benchmark_id": 10011, "benchmark_id": "acebench", "model_id": "kimi-k2-instruct-0905", "score": 0.765, "normalized_score": 0.765, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Acebench" }, { "model_benchmark_id": 10012, "benchmark_id": "aime-2024", "model_id": "kimi-k2-instruct-0905", "score": 0.696, "normalized_score": 0.696, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Avg@64", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Aime 2024" }, { "model_benchmark_id": 10013, "benchmark_id": "aime-2025", "model_id": "kimi-k2-instruct-0905", "score": 0.495, "normalized_score": 0.495, "is_self_reported": true, "self_reported_source_link": 
"https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Avg@64", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Aime 2025" }, { "model_benchmark_id": 10014, "benchmark_id": "math-500", "model_id": "kimi-k2-instruct-0905", "score": 0.974, "normalized_score": 0.974, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Math 500" }, { "model_benchmark_id": 10015, "benchmark_id": "hmmt-2025", "model_id": "kimi-k2-instruct-0905", "score": 0.388, "normalized_score": 0.388, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Avg@32", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Hmmt 2025" }, { "model_benchmark_id": 10016, "benchmark_id": "cnmo-2024", "model_id": "kimi-k2-instruct-0905", "score": 0.743, "normalized_score": 0.743, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Avg@16", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Cnmo 2024" 
}, { "model_benchmark_id": 10017, "benchmark_id": "polymath-en", "model_id": "kimi-k2-instruct-0905", "score": 0.651, "normalized_score": 0.651, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Avg@4", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Polymath En" }, { "model_benchmark_id": 10018, "benchmark_id": "zebralogic", "model_id": "kimi-k2-instruct-0905", "score": 0.89, "normalized_score": 0.89, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Zebralogic" }, { "model_benchmark_id": 10019, "benchmark_id": "autologi", "model_id": "kimi-k2-instruct-0905", "score": 0.895, "normalized_score": 0.895, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Autologi" }, { "model_benchmark_id": 10020, "benchmark_id": "gpqa", "model_id": "kimi-k2-instruct-0905", "score": 0.751, "normalized_score": 0.751, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Diamond - Avg@8", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Gpqa" }, { "model_benchmark_id": 10021, "benchmark_id": "supergpqa", "model_id": "kimi-k2-instruct-0905", "score": 0.572, "normalized_score": 0.572, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Supergpqa" }, { "model_benchmark_id": 10022, "benchmark_id": "hle", "model_id": "kimi-k2-instruct-0905", "score": 0.047, "normalized_score": 0.047, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Text Only", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Hle" }, { "model_benchmark_id": 10023, "benchmark_id": "mmlu", "model_id": "kimi-k2-instruct-0905", "score": 0.895, "normalized_score": 0.895, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Mmlu" }, { "model_benchmark_id": 10024, "benchmark_id": "mmlu-redux", "model_id": "kimi-k2-instruct-0905", "score": 0.927, "normalized_score": 0.927, 
"is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Mmlu Redux" }, { "model_benchmark_id": 10025, "benchmark_id": "mmlu-pro", "model_id": "kimi-k2-instruct-0905", "score": 0.811, "normalized_score": 0.811, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Mmlu Pro" }, { "model_benchmark_id": 10026, "benchmark_id": "ifeval", "model_id": "kimi-k2-instruct-0905", "score": 0.898, "normalized_score": 0.898, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Prompt Strict", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Ifeval" }, { "model_benchmark_id": 10027, "benchmark_id": "multichallenge", "model_id": "kimi-k2-instruct-0905", "score": 0.541, "normalized_score": 0.541, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": 
"2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Multichallenge" }, { "model_benchmark_id": 10028, "benchmark_id": "simpleqa", "model_id": "kimi-k2-instruct-0905", "score": 0.31, "normalized_score": 0.31, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "Correct", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Simpleqa" }, { "model_benchmark_id": 10029, "benchmark_id": "livebench", "model_id": "kimi-k2-instruct-0905", "score": 0.764, "normalized_score": 0.764, "is_self_reported": true, "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/", "verified_by_llmstats": false, "analysis_method": "2024/11/25 Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Livebench" } ] ================================================ FILE: data/organizations/moonshotai/models/kimi-k2-instruct-0905/model.json ================================================ { "model_id": "kimi-k2-instruct-0905", "name": "Kimi K2-Instruct-0905", "organization_id": "moonshotai", "model_family_id": null, "fine_tuned_from_model_id": null, "description": "Kimi K2-Instruct-0905 is the latest, most capable version of Kimi K2, achieving state-of-the-art performance in frontier knowledge, math, and coding among non-thinking models. This Mixture-of-Experts model features 32 billion activated parameters and 1 trillion total parameters, meticulously optimized for agentic tasks. 
Key features include enhanced agentic coding intelligence, extended context length to 256K tokens, and a hybrid architecture trained with MuonClip optimizer on 15.5T tokens. The model achieves 65.8% on SWE-bench Verified (single attempt), 47.3% on SWE-bench Multilingual, and excels at tool use with 70.6% on Tau2-retail. It is a reflex-grade model without long thinking, designed to act and execute complex tasks seamlessly.", "release_date": "2025-09-05", "announcement_date": "2025-09-05", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 1000000000000, "training_tokens": 15500000000000, "available_in_zeroeval": true, "source_api_ref": "https://platform.moonshot.ai", "source_playground": "https://kimi.moonshot.cn/", "source_paper": "https://github.com/MoonshotAI/Kimi-K2/blob/main/tech_report.pdf", "source_scorecard_blog_link": "https://moonshotai.github.io/Kimi-K2/", "source_repo_link": "https://github.com/MoonshotAI/Kimi-K2", "source_weights_link": "https://huggingface.co/MoonshotAI", "created_at": "2025-09-05T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00" } ================================================ FILE: data/organizations/moonshotai/organization.json ================================================ { "organization_id": "moonshotai", "name": "Moonshot AI", "website": "https://moonshot.cn", "description": "Chinese AI company developing the Kimi series of large language models, including state-of-the-art mixture-of-experts models with long-context capabilities", "country": "CN", "created_at": "2025-07-19T19:49:05.419295+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00" } ================================================ FILE: data/organizations/nvidia/models/llama-3.1-nemotron-70b-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 24, "benchmark_id": "arc-c", "model_id": "llama-3.1-nemotron-70b-instruct", "score": 0.692, "normalized_score": 0.692, 
"is_self_reported": true, "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.133318+00:00", "updated_at": "2025-07-19T19:56:11.133318+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1005, "benchmark_id": "gsm8k", "model_id": "llama-3.1-nemotron-70b-instruct", "score": 0.9143, "normalized_score": 0.9143, "is_self_reported": true, "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.099846+00:00", "updated_at": "2025-07-19T19:56:13.099846+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 1811, "benchmark_id": "gsm8k-chat", "model_id": "llama-3.1-nemotron-70b-instruct", "score": 0.8188, "normalized_score": 0.8188, "is_self_reported": true, "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/", "verified_by_llmstats": false, "analysis_method": "Chat evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.104394+00:00", "updated_at": "2025-07-19T19:56:15.104394+00:00", "benchmark_name": "GSM8K Chat" }, { "model_benchmark_id": 50, "benchmark_id": "hellaswag", "model_id": "llama-3.1-nemotron-70b-instruct", "score": 0.8558, "normalized_score": 0.8558, "is_self_reported": true, "self_reported_source_link": 
"https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.188734+00:00", "updated_at": "2025-07-19T19:56:11.188734+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 1812, "benchmark_id": "instruct-humaneval", "model_id": "llama-3.1-nemotron-70b-instruct", "score": 0.7384, "normalized_score": 0.7384, "is_self_reported": true, "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/", "verified_by_llmstats": false, "analysis_method": "Code evaluation (n=20)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.108307+00:00", "updated_at": "2025-07-19T19:56:15.108307+00:00", "benchmark_name": "Instruct HumanEval" }, { "model_benchmark_id": 102, "benchmark_id": "mmlu", "model_id": "llama-3.1-nemotron-70b-instruct", "score": 0.802, "normalized_score": 0.802, "is_self_reported": true, "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.292516+00:00", "updated_at": "2025-07-19T19:56:11.292516+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1810, "benchmark_id": "mmlu-chat", "model_id": "llama-3.1-nemotron-70b-instruct", "score": 0.8058, "normalized_score": 0.8058, "is_self_reported": true, "self_reported_source_link": 
"https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/", "verified_by_llmstats": false, "analysis_method": "Chat evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.100072+00:00", "updated_at": "2025-07-19T19:56:15.100072+00:00", "benchmark_name": "MMLU Chat" }, { "model_benchmark_id": 1611, "benchmark_id": "mt-bench", "model_id": "llama-3.1-nemotron-70b-instruct", "score": 0.0899, "normalized_score": 0.0899, "is_self_reported": true, "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/", "verified_by_llmstats": false, "analysis_method": "Chat evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.532800+00:00", "updated_at": "2025-07-19T19:56:14.532800+00:00", "benchmark_name": "MT-Bench" }, { "model_benchmark_id": 143, "benchmark_id": "truthfulqa", "model_id": "llama-3.1-nemotron-70b-instruct", "score": 0.5863, "normalized_score": 0.5863, "is_self_reported": true, "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.363751+00:00", "updated_at": "2025-07-19T19:56:11.363751+00:00", "benchmark_name": "TruthfulQA" }, { "model_benchmark_id": 153, "benchmark_id": "winogrande", "model_id": "llama-3.1-nemotron-70b-instruct", "score": 0.8453, "normalized_score": 0.8453, "is_self_reported": true, "self_reported_source_link": 
"https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.390043+00:00", "updated_at": "2025-07-19T19:56:11.390043+00:00", "benchmark_name": "Winogrande" }, { "model_benchmark_id": 1809, "benchmark_id": "xlsum-english", "model_id": "llama-3.1-nemotron-70b-instruct", "score": 0.3161, "normalized_score": 0.3161, "is_self_reported": true, "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/", "verified_by_llmstats": false, "analysis_method": "Standard evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.094560+00:00", "updated_at": "2025-07-19T19:56:15.094560+00:00", "benchmark_name": "XLSum English" } ] ================================================ FILE: data/organizations/nvidia/models/llama-3.1-nemotron-70b-instruct/model.json ================================================ { "model_id": "llama-3.1-nemotron-70b-instruct", "name": "Llama 3.1 Nemotron 70B Instruct", "organization_id": "nvidia", "fine_tuned_from_model_id": "llama-3.1-70b-instruct", "description": "A large language model customized by NVIDIA to improve the helpfulness of LLM generated responses. It is a fine-tuned version of Llama 3.1 70B Instruct. 
The model was trained using RLHF (REINFORCE) with HelpSteer2-Preference prompts.", "release_date": "2024-10-01", "announcement_date": "2024-10-01", "license_id": "llama_3_1_community_license", "multimodal": false, "knowledge_cutoff": "2023-12-01", "param_count": 70000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-70b-instruct", "source_playground": null, "source_paper": "https://arxiv.org/abs/2410.01257", "source_scorecard_blog_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/", "source_repo_link": null, "source_weights_link": "https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct", "created_at": "2025-07-19T19:49:05.908923+00:00", "updated_at": "2025-07-19T19:49:05.908923+00:00", "model_family_id": null } ================================================ FILE: data/organizations/nvidia/models/llama-3.1-nemotron-nano-8b-v1/benchmarks.json ================================================ [ { "model_benchmark_id": 698, "benchmark_id": "aime-2025", "model_id": "llama-3.1-nemotron-nano-8b-v1", "score": 0.471, "normalized_score": 0.471, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Pass@1, Reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.461794+00:00", "updated_at": "2025-07-19T19:56:12.461794+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1586, "benchmark_id": "bfcl-v2", "model_id": "llama-3.1-nemotron-nano-8b-v1", "score": 0.636, "normalized_score": 0.636, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Score, 
Reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.454860+00:00", "updated_at": "2025-07-19T19:56:14.454860+00:00", "benchmark_name": "BFCL v2" }, { "model_benchmark_id": 327, "benchmark_id": "gpqa", "model_id": "llama-3.1-nemotron-nano-8b-v1", "score": 0.541, "normalized_score": 0.541, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Diamond, Pass@1, Reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.719213+00:00", "updated_at": "2025-07-19T19:56:11.719213+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 627, "benchmark_id": "ifeval", "model_id": "llama-3.1-nemotron-nano-8b-v1", "score": 0.793, "normalized_score": 0.793, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Strict Accuracy, Reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.289960+00:00", "updated_at": "2025-07-19T19:56:12.289960+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 510, "benchmark_id": "math-500", "model_id": "llama-3.1-nemotron-nano-8b-v1", "score": 0.954, "normalized_score": 0.954, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Pass@1, Reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.059893+00:00", "updated_at": 
"2025-07-19T19:56:12.059893+00:00", "benchmark_name": "MATH-500" }, { "model_benchmark_id": 1193, "benchmark_id": "mbpp", "model_id": "llama-3.1-nemotron-nano-8b-v1", "score": 0.846, "normalized_score": 0.846, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "0-shot, Pass@1, Reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.512976+00:00", "updated_at": "2025-07-19T19:56:13.512976+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1610, "benchmark_id": "mt-bench", "model_id": "llama-3.1-nemotron-nano-8b-v1", "score": 0.81, "normalized_score": 0.81, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Score, Reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.530016+00:00", "updated_at": "2025-07-19T19:56:14.530016+00:00", "benchmark_name": "MT-Bench" } ] ================================================ FILE: data/organizations/nvidia/models/llama-3.1-nemotron-nano-8b-v1/model.json ================================================ { "model_id": "llama-3.1-nemotron-nano-8b-v1", "name": "Llama 3.1 Nemotron Nano 8B V1", "organization_id": "nvidia", "fine_tuned_from_model_id": null, "description": "Llama-3.1-Nemotron-Nano-8B-v1 is a large language model (LLM) which is a derivative of Meta Llama-3.1-8B-Instruct (AKA the reference model). 
It is a reasoning model that is post trained for reasoning, human chat preferences, and tasks, such as RAG and tool calling.", "release_date": "2025-03-18", "announcement_date": "2025-03-18", "license_id": "llama_3_1_community_license", "multimodal": false, "knowledge_cutoff": "2023-12-31", "param_count": 8000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1", "source_paper": "https://arxiv.org/abs/2502.00203", "source_scorecard_blog_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard", "source_repo_link": null, "source_weights_link": "https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-8B-v1", "created_at": "2025-07-19T19:49:05.733231+00:00", "updated_at": "2025-07-19T19:49:05.733231+00:00", "model_family_id": null } ================================================ FILE: data/organizations/nvidia/models/llama-3.1-nemotron-ultra-253b-v1/benchmarks.json ================================================ [ { "model_benchmark_id": 699, "benchmark_id": "aime-2025", "model_id": "llama-3.1-nemotron-ultra-253b-v1", "score": 0.725, "normalized_score": 0.725, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Pass@1, Reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.463355+00:00", "updated_at": "2025-07-19T19:56:12.463355+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1587, "benchmark_id": "bfcl-v2", "model_id": "llama-3.1-nemotron-ultra-253b-v1", "score": 0.741, "normalized_score": 0.741, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1/modelcard", "verified_by_llmstats": false, 
"analysis_method": "Score, Reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.456840+00:00", "updated_at": "2025-07-19T19:56:14.456840+00:00", "benchmark_name": "BFCL v2" }, { "model_benchmark_id": 328, "benchmark_id": "gpqa", "model_id": "llama-3.1-nemotron-ultra-253b-v1", "score": 0.7601, "normalized_score": 0.7601, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Pass@1, Reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.721348+00:00", "updated_at": "2025-07-19T19:56:11.721348+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 628, "benchmark_id": "ifeval", "model_id": "llama-3.1-nemotron-ultra-253b-v1", "score": 0.8945, "normalized_score": 0.8945, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Strict Accuracy, Reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.292359+00:00", "updated_at": "2025-07-19T19:56:12.292359+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1143, "benchmark_id": "livecodebench", "model_id": "llama-3.1-nemotron-ultra-253b-v1", "score": 0.6631, "normalized_score": 0.6631, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Pass@1, Reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:13.404565+00:00", "updated_at": "2025-07-19T19:56:13.404565+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 511, "benchmark_id": "math-500", "model_id": "llama-3.1-nemotron-ultra-253b-v1", "score": 0.97, "normalized_score": 0.97, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Pass@1, Reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.061892+00:00", "updated_at": "2025-07-19T19:56:12.061892+00:00", "benchmark_name": "MATH-500" } ] ================================================ FILE: data/organizations/nvidia/models/llama-3.1-nemotron-ultra-253b-v1/model.json ================================================ { "model_id": "llama-3.1-nemotron-ultra-253b-v1", "name": "Llama 3.1 Nemotron Ultra 253B v1", "organization_id": "nvidia", "fine_tuned_from_model_id": null, "description": "A 253B parameter derivative of Meta Llama 3.1 405B Instruct, developed by NVIDIA using Neural Architecture Search (NAS) and vertical compression. It underwent multi-phase post-training (SFT for Math, Code, Reasoning, Chat, Tool Calling; RL with GRPO) to enhance reasoning and instruction-following. Optimized for accuracy/efficiency tradeoff on NVIDIA GPUs. 
Supports 128k context.", "release_date": "2025-04-07", "announcement_date": "2025-04-07", "license_id": "llama_3_1_community_license", "multimodal": false, "knowledge_cutoff": "2023-12-01", "param_count": 253000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1", "source_paper": "https://arxiv.org/abs/2502.00203", "source_scorecard_blog_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1/modelcard", "source_repo_link": null, "source_weights_link": "https://huggingface.co/nvidia/Llama-3_1-Nemotron-Ultra-253B-v1", "created_at": "2025-07-19T19:49:05.735588+00:00", "updated_at": "2025-07-19T19:49:05.735588+00:00", "model_family_id": null } ================================================ FILE: data/organizations/nvidia/models/llama-3.3-nemotron-super-49b-v1/benchmarks.json ================================================ [ { "model_benchmark_id": 697, "benchmark_id": "aime-2025", "model_id": "llama-3.3-nemotron-super-49b-v1", "score": 0.584, "normalized_score": 0.584, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Pass@1, Reasoning On", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.459628+00:00", "updated_at": "2025-07-19T19:56:12.459628+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1461, "benchmark_id": "arena-hard", "model_id": "llama-3.3-nemotron-super-49b-v1", "score": 0.883, "normalized_score": 0.883, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Score, Reasoning Off", "verification_provider_id": null, "verification_hardware": 
null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.113375+00:00", "updated_at": "2025-07-19T19:56:14.113375+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 1585, "benchmark_id": "bfcl-v2", "model_id": "llama-3.3-nemotron-super-49b-v1", "score": 0.737, "normalized_score": 0.737, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Score, Reasoning On", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.452681+00:00", "updated_at": "2025-07-19T19:56:14.452681+00:00", "benchmark_name": "BFCL v2" }, { "model_benchmark_id": 326, "benchmark_id": "gpqa", "model_id": "llama-3.3-nemotron-super-49b-v1", "score": 0.6667, "normalized_score": 0.6667, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Pass@1, Reasoning On", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.717785+00:00", "updated_at": "2025-07-19T19:56:11.717785+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 509, "benchmark_id": "math-500", "model_id": "llama-3.3-nemotron-super-49b-v1", "score": 0.966, "normalized_score": 0.966, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Pass@1, Reasoning On", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.058280+00:00", "updated_at": "2025-07-19T19:56:12.058280+00:00", "benchmark_name": "MATH-500" 
}, { "model_benchmark_id": 1192, "benchmark_id": "mbpp", "model_id": "llama-3.3-nemotron-super-49b-v1", "score": 0.913, "normalized_score": 0.913, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Pass@1, Reasoning On", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.511549+00:00", "updated_at": "2025-07-19T19:56:13.511549+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1609, "benchmark_id": "mt-bench", "model_id": "llama-3.3-nemotron-super-49b-v1", "score": 0.917, "normalized_score": 0.917, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard", "verified_by_llmstats": false, "analysis_method": "Score, Reasoning On", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.527840+00:00", "updated_at": "2025-07-19T19:56:14.527840+00:00", "benchmark_name": "MT-Bench" } ] ================================================ FILE: data/organizations/nvidia/models/llama-3.3-nemotron-super-49b-v1/model.json ================================================ { "model_id": "llama-3.3-nemotron-super-49b-v1", "name": "Llama-3.3 Nemotron Super 49B v1", "organization_id": "nvidia", "fine_tuned_from_model_id": null, "description": "Llama-3.3-Nemotron-Super-49B-v1 is a large language model (LLM) derived from Meta Llama-3.3-70B-Instruct. It's post-trained for reasoning, chat, RAG, and tool calling, offering a balance between accuracy and efficiency (optimized for single H100). 
It underwent multi-phase post-training including SFT and RL (RLOO, RPO).", "release_date": "2025-03-18", "announcement_date": "2025-03-18", "license_id": "llama_3_1_community_license", "multimodal": false, "knowledge_cutoff": "2023-12-31", "param_count": 49900000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1", "source_paper": "https://arxiv.org/abs/2502.00203", "source_scorecard_blog_link": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard", "source_repo_link": null, "source_weights_link": "https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1", "created_at": "2025-07-19T19:49:05.730826+00:00", "updated_at": "2025-07-19T19:49:05.730826+00:00", "model_family_id": null } ================================================ FILE: data/organizations/nvidia/models/nemotron-nano-9b-v2/benchmarks.json ================================================ [ { "model_benchmark_id": 12345, "benchmark_id": "aime-2025", "model_id": "nvidia-nemotron-nano-9b-v2", "score": 0.721, "normalized_score": 0.721, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard", "verified_by_llmstats": false, "analysis_method": "Score, Reasoning On", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-04T16:07:30.482+00:00", "updated_at": "2025-10-04T16:07:30.482+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 12345, "benchmark_id": "math-500", "model_id": "nvidia-nemotron-nano-9b-v2", "score": 0.978, "normalized_score": 0.978, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard", "verified_by_llmstats": false, "analysis_method": "Score, Reasoning On", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-04T16:07:30.482+00:00", "updated_at": "2025-10-04T16:07:30.482+00:00", "benchmark_name": "MATH-500" }, { "model_benchmark_id": 12345, "benchmark_id": "gpqa", "model_id": "nvidia-nemotron-nano-9b-v2", "score": 0.640, "normalized_score": 0.640, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard", "verified_by_llmstats": false, "analysis_method": "Score, Reasoning On", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-04T16:07:30.482+00:00", "updated_at": "2025-10-04T16:07:30.482+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 12345, "benchmark_id": "livecodebench", "model_id": "nvidia-nemotron-nano-9b-v2", "score": 0.711, "normalized_score": 0.711, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard", "verified_by_llmstats": false, "analysis_method": "Score, Reasoning On", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-04T16:07:30.482+00:00", "updated_at": "2025-10-04T16:07:30.482+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 12345, "benchmark_id": "bfcl-v3-multiturn", "model_id": "nvidia-nemotron-nano-9b-v2", "score": 0.669, "normalized_score": 0.669, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard", "verified_by_llmstats": false, "analysis_method": "Score, Reasoning On", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-04T16:07:30.482+00:00", "updated_at": "2025-10-04T16:07:30.482+00:00", "benchmark_name": "BFCL v3" }, { 
"model_benchmark_id": 12345, "benchmark_id": "ifeval", "model_id": "nvidia-nemotron-nano-9b-v2", "score": 0.903, "normalized_score": 0.903, "is_self_reported": true, "self_reported_source_link": "https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard", "verified_by_llmstats": false, "analysis_method": "Score, Reasoning On", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-04T16:07:30.482+00:00", "updated_at": "2025-10-04T16:07:30.482+00:00", "benchmark_name": "IFEval" } ] ================================================ FILE: data/organizations/nvidia/models/nemotron-nano-9b-v2/model.json ================================================ { "model_id": "nvidia-nemotron-nano-9b-v2", "name": "Nemotron Nano 9B v2", "organization_id": "nvidia", "fine_tuned_from_model_id": null, "description": "NVIDIA-Nemotron-Nano-9B-v2 is a large language model (LLM) trained from scratch by NVIDIA, and designed as a unified model for both reasoning and non-reasoning tasks. It responds to user queries and tasks by first generating a reasoning trace and then concluding with a final response. The model's reasoning capabilities can be controlled via a system prompt. If the user prefers the model to provide its final answer without intermediate reasoning traces, it can be configured to do so, albeit with a slight decrease in accuracy for harder prompts that require reasoning. 
Conversely, allowing the model to generate reasoning traces first generally results in higher-quality final solutions to queries and tasks.", "release_date": "2025-08-18", "announcement_date": "2025-08-18", "license_id": "nvidia_open_model_license_agreement", "multimodal": false, "knowledge_cutoff": "2024-09", "param_count": 8900000000, "training_tokens": 21100000000000, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2", "source_paper": "https://arxiv.org/abs/2508.14444", "source_scorecard_blog_link": "https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard", "source_repo_link": null, "source_weights_link": "https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2", "created_at": "2025-10-02T21:51:16.835+00:00", "updated_at": "2025-10-02T21:51:16.835+00:00", "model_family_id": null } ================================================ FILE: data/organizations/nvidia/organization.json ================================================ { "organization_id": "nvidia", "name": "NVIDIA", "website": "https://nvidia.com", "description": "GPU and AI company", "country": "US", "created_at": "2025-07-19T19:49:05.728519+00:00", "updated_at": "2025-07-19T19:49:05.728519+00:00" } ================================================ FILE: data/organizations/openai/models/gpt-3.5-turbo-0125/benchmarks.json ================================================ [ { "model_benchmark_id": 963, "benchmark_id": "drop", "model_id": "gpt-3.5-turbo-0125", "score": 0.702, "normalized_score": 0.702, "is_self_reported": false, "self_reported_source_link": "https://example.com/benchmark-image", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.025267+00:00", "updated_at": "2025-07-19T19:56:13.025267+00:00", "benchmark_name": "DROP" }, { 
"model_benchmark_id": 359, "benchmark_id": "gpqa", "model_id": "gpt-3.5-turbo-0125", "score": 0.308, "normalized_score": 0.308, "is_self_reported": false, "self_reported_source_link": "https://example.com/benchmark-image", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.770449+00:00", "updated_at": "2025-07-19T19:56:11.770449+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 815, "benchmark_id": "humaneval", "model_id": "gpt-3.5-turbo-0125", "score": 0.68, "normalized_score": 0.68, "is_self_reported": false, "self_reported_source_link": "https://example.com/benchmark-image", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.697970+00:00", "updated_at": "2025-07-19T19:56:12.697970+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 429, "benchmark_id": "math", "model_id": "gpt-3.5-turbo-0125", "score": 0.431, "normalized_score": 0.431, "is_self_reported": false, "self_reported_source_link": "https://example.com/benchmark-image", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.906977+00:00", "updated_at": "2025-07-19T19:56:11.906977+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 547, "benchmark_id": "mathvista", "model_id": "gpt-3.5-turbo-0125", "score": 0.0, "normalized_score": 0.0, "is_self_reported": false, "self_reported_source_link": "https://example.com/benchmark-image", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:12.127494+00:00", "updated_at": "2025-07-19T19:56:12.127494+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1299, "benchmark_id": "mgsm", "model_id": "gpt-3.5-turbo-0125", "score": 0.563, "normalized_score": 0.563, "is_self_reported": false, "self_reported_source_link": "https://example.com/benchmark-image", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.717321+00:00", "updated_at": "2025-07-19T19:56:13.717321+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 126, "benchmark_id": "mmlu", "model_id": "gpt-3.5-turbo-0125", "score": 0.698, "normalized_score": 0.698, "is_self_reported": false, "self_reported_source_link": "https://example.com/benchmark-image", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.331664+00:00", "updated_at": "2025-07-19T19:56:11.331664+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 597, "benchmark_id": "mmmu", "model_id": "gpt-3.5-turbo-0125", "score": 0.0, "normalized_score": 0.0, "is_self_reported": false, "self_reported_source_link": "https://example.com/benchmark-image", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.230222+00:00", "updated_at": "2025-07-19T19:56:12.230222+00:00", "benchmark_name": "MMMU" } ] ================================================ FILE: data/organizations/openai/models/gpt-3.5-turbo-0125/model.json ================================================ { "model_id": "gpt-3.5-turbo-0125", "name": "GPT-3.5 Turbo", 
"organization_id": "openai", "fine_tuned_from_model_id": null, "description": "The latest GPT-3.5 Turbo model with higher accuracy at responding in requested formats and a fix for a bug which caused a text encoding issue for non-English language function calls.", "release_date": "2023-03-21", "announcement_date": "2023-03-21", "license_id": "proprietary", "multimodal": false, "knowledge_cutoff": "2021-09-30", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/models/gpt-3-5-turbo", "source_playground": "https://platform.openai.com/playground", "source_paper": null, "source_scorecard_blog_link": "https://openai.com/blog/new-models-and-developer-products-announced-at-devday", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.858492+00:00", "updated_at": "2025-07-19T19:49:05.858492+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/gpt-4-0613/benchmarks.json ================================================ [ { "model_benchmark_id": 1917, "benchmark_id": "ai2-reasoning-challenge-(arc)", "model_id": "gpt-4-0613", "score": 0.963, "normalized_score": 0.963, "is_self_reported": true, "self_reported_source_link": "https://openai.com/research/gpt-4", "verified_by_llmstats": false, "analysis_method": "25-shot, Grade-school multiple choice science questions (Challenge-set)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.421959+00:00", "updated_at": "2025-07-19T19:56:15.421959+00:00", "benchmark_name": "AI2 Reasoning Challenge (ARC)" }, { "model_benchmark_id": 965, "benchmark_id": "drop", "model_id": "gpt-4-0613", "score": 0.809, "normalized_score": 0.809, "is_self_reported": true, "self_reported_source_link": "https://openai.com/research/gpt-4", "verified_by_llmstats": false, 
"analysis_method": "3-shot, Reading comprehension & arithmetic (f1 score)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.028099+00:00", "updated_at": "2025-07-19T19:56:13.028099+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 362, "benchmark_id": "gpqa", "model_id": "gpt-4-0613", "score": 0.357, "normalized_score": 0.357, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/", "verified_by_llmstats": false, "analysis_method": "5-shot, Commonsense reasoning", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.775863+00:00", "updated_at": "2025-07-19T19:56:11.775863+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 55, "benchmark_id": "hellaswag", "model_id": "gpt-4-0613", "score": 0.953, "normalized_score": 0.953, "is_self_reported": true, "self_reported_source_link": "https://openai.com/research/gpt-4", "verified_by_llmstats": false, "analysis_method": "10-shot, Commonsense reasoning around everyday events", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.199031+00:00", "updated_at": "2025-07-19T19:56:11.199031+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 817, "benchmark_id": "humaneval", "model_id": "gpt-4-0613", "score": 0.67, "normalized_score": 0.67, "is_self_reported": true, "self_reported_source_link": "https://openai.com/research/gpt-4", "verified_by_llmstats": false, "analysis_method": "0-shot, Python coding tasks", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.702020+00:00", "updated_at": "2025-07-19T19:56:12.702020+00:00", "benchmark_name": "HumanEval" 
}, { "model_benchmark_id": 1915, "benchmark_id": "lsat", "model_id": "gpt-4-0613", "score": 0.88, "normalized_score": 0.88, "is_self_reported": true, "self_reported_source_link": "https://openai.com/research/gpt-4", "verified_by_llmstats": false, "analysis_method": "Percentile score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.413295+00:00", "updated_at": "2025-07-19T19:56:15.413295+00:00", "benchmark_name": "LSAT" }, { "model_benchmark_id": 432, "benchmark_id": "math", "model_id": "gpt-4-0613", "score": 0.42, "normalized_score": 0.42, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/", "verified_by_llmstats": false, "analysis_method": "Mathematics problem-solving", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.913379+00:00", "updated_at": "2025-07-19T19:56:11.913379+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1302, "benchmark_id": "mgsm", "model_id": "gpt-4-0613", "score": 0.745, "normalized_score": 0.745, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/", "verified_by_llmstats": false, "analysis_method": "Mathematics problem-solving", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.721873+00:00", "updated_at": "2025-07-19T19:56:13.721873+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 129, "benchmark_id": "mmlu", "model_id": "gpt-4-0613", "score": 0.864, "normalized_score": 0.864, "is_self_reported": true, "self_reported_source_link": "https://openai.com/research/gpt-4", "verified_by_llmstats": false, "analysis_method": "5-shot, Multiple-choice questions in 57 subjects (professional & academic)", "verification_provider_id": 
null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.336601+00:00", "updated_at": "2025-07-19T19:56:11.336601+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1916, "benchmark_id": "sat-math", "model_id": "gpt-4-0613", "score": 0.89, "normalized_score": 0.89, "is_self_reported": true, "self_reported_source_link": "https://openai.com/research/gpt-4", "verified_by_llmstats": false, "analysis_method": "Estimated from reported score of 710 out of 800", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.417889+00:00", "updated_at": "2025-07-19T19:56:15.417889+00:00", "benchmark_name": "SAT Math" }, { "model_benchmark_id": 1914, "benchmark_id": "uniform-bar-exam", "model_id": "gpt-4-0613", "score": 0.9, "normalized_score": 0.9, "is_self_reported": true, "self_reported_source_link": "https://openai.com/research/gpt-4", "verified_by_llmstats": false, "analysis_method": "Percentage score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.408427+00:00", "updated_at": "2025-07-19T19:56:15.408427+00:00", "benchmark_name": "Uniform Bar Exam" }, { "model_benchmark_id": 156, "benchmark_id": "winogrande", "model_id": "gpt-4-0613", "score": 0.875, "normalized_score": 0.875, "is_self_reported": true, "self_reported_source_link": "https://openai.com/research/gpt-4", "verified_by_llmstats": false, "analysis_method": "5-shot, Commonsense reasoning around pronoun resolution", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.396099+00:00", "updated_at": "2025-07-19T19:56:11.396099+00:00", "benchmark_name": "Winogrande" } ] ================================================ FILE: 
data/organizations/openai/models/gpt-4-0613/model.json ================================================ { "model_id": "gpt-4-0613", "name": "GPT-4", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "GPT-4 is a large multimodal model capable of processing both image and text inputs and generating human-like text outputs. It demonstrates human-level performance on various professional and academic benchmarks.", "release_date": "2023-06-13", "announcement_date": "2023-06-13", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2022-12-31", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/api-reference/chat", "source_playground": "https://platform.openai.com/playground", "source_paper": "https://arxiv.org/abs/2303.08774", "source_scorecard_blog_link": "https://openai.com/research/gpt-4", "source_repo_link": "https://github.com/openai/gpt-4", "source_weights_link": null, "created_at": "2025-07-19T19:49:05.869531+00:00", "updated_at": "2025-07-19T19:49:05.869531+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/gpt-4-turbo-2024-04-09/benchmarks.json ================================================ [ { "model_benchmark_id": 966, "benchmark_id": "drop", "model_id": "gpt-4-turbo-2024-04-09", "score": 0.86, "normalized_score": 0.86, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/", "verified_by_llmstats": false, "analysis_method": "Reading comprehension & arithmetic (f1 score)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.030041+00:00", "updated_at": "2025-07-19T19:56:13.030041+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 363, "benchmark_id": "gpqa", "model_id": "gpt-4-turbo-2024-04-09", "score": 0.48, 
"normalized_score": 0.48, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/", "verified_by_llmstats": false, "analysis_method": "General-Purpose Question Answering", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.777899+00:00", "updated_at": "2025-07-19T19:56:11.777899+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 818, "benchmark_id": "humaneval", "model_id": "gpt-4-turbo-2024-04-09", "score": 0.871, "normalized_score": 0.871, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/", "verified_by_llmstats": false, "analysis_method": "Python coding tasks", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.703615+00:00", "updated_at": "2025-07-19T19:56:12.703615+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 433, "benchmark_id": "math", "model_id": "gpt-4-turbo-2024-04-09", "score": 0.726, "normalized_score": 0.726, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/", "verified_by_llmstats": false, "analysis_method": "Mathematics problem-solving", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.916360+00:00", "updated_at": "2025-07-19T19:56:11.916360+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1303, "benchmark_id": "mgsm", "model_id": "gpt-4-turbo-2024-04-09", "score": 0.885, "normalized_score": 0.885, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/", "verified_by_llmstats": false, "analysis_method": "Grade School Math Word Problems", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:13.723556+00:00", "updated_at": "2025-07-19T19:56:13.723556+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 130, "benchmark_id": "mmlu", "model_id": "gpt-4-turbo-2024-04-09", "score": 0.865, "normalized_score": 0.865, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/", "verified_by_llmstats": false, "analysis_method": "Multiple-choice questions in 57 subjects (professional & academic)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.337995+00:00", "updated_at": "2025-07-19T19:56:11.337995+00:00", "benchmark_name": "MMLU" } ] ================================================ FILE: data/organizations/openai/models/gpt-4-turbo-2024-04-09/model.json ================================================ { "model_id": "gpt-4-turbo-2024-04-09", "name": "GPT-4 Turbo", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "The latest GPT-4 model with improved performance, updated knowledge, and enhanced capabilities. 
It offers faster response times and more affordable pricing compared to previous versions.", "release_date": "2024-04-09", "announcement_date": "2024-04-09", "license_id": "proprietary", "multimodal": false, "knowledge_cutoff": "2023-12-31", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4", "source_playground": "https://platform.openai.com/playground", "source_paper": null, "source_scorecard_blog_link": "https://openai.com/index/new-models-and-developer-products-announced-at-devday/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.872559+00:00", "updated_at": "2025-07-19T19:49:05.872559+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/gpt-4.1-2025-04-14/benchmarks.json ================================================ [ { "model_benchmark_id": 671, "benchmark_id": "aider-polyglot", "model_id": "gpt-4.1-2025-04-14", "score": 0.516, "normalized_score": 0.516, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.389292+00:00", "updated_at": "2025-07-19T19:56:12.389292+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 1335, "benchmark_id": "aider-polyglot-edit", "model_id": "gpt-4.1-2025-04-14", "score": 0.529, "normalized_score": 0.529, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, 
"created_at": "2025-07-19T19:56:13.808732+00:00", "updated_at": "2025-07-19T19:56:13.808732+00:00", "benchmark_name": "Aider-Polyglot Edit" }, { "model_benchmark_id": 486, "benchmark_id": "aime-2024", "model_id": "gpt-4.1-2025-04-14", "score": 0.481, "normalized_score": 0.481, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.019979+00:00", "updated_at": "2025-07-19T19:56:12.019979+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 1889, "benchmark_id": "charxiv-d", "model_id": "gpt-4.1-2025-04-14", "score": 0.879, "normalized_score": 0.879, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.330689+00:00", "updated_at": "2025-07-19T19:56:15.330689+00:00", "benchmark_name": "CharXiv-D" }, { "model_benchmark_id": 1837, "benchmark_id": "charxiv-r", "model_id": "gpt-4.1-2025-04-14", "score": 0.567, "normalized_score": 0.567, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.201588+00:00", "updated_at": "2025-07-19T19:56:15.201588+00:00", "benchmark_name": "CharXiv-R" }, { "model_benchmark_id": 1860, "benchmark_id": "collie", "model_id": "gpt-4.1-2025-04-14", "score": 0.658, 
"normalized_score": 0.658, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.261360+00:00", "updated_at": "2025-07-19T19:56:15.261360+00:00", "benchmark_name": "COLLIE" }, { "model_benchmark_id": 1895, "benchmark_id": "complexfuncbench", "model_id": "gpt-4.1-2025-04-14", "score": 0.655, "normalized_score": 0.655, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.348011+00:00", "updated_at": "2025-07-19T19:56:15.348011+00:00", "benchmark_name": "ComplexFuncBench" }, { "model_benchmark_id": 353, "benchmark_id": "gpqa", "model_id": "gpt-4.1-2025-04-14", "score": 0.663, "normalized_score": 0.663, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.761405+00:00", "updated_at": "2025-07-19T19:56:11.761405+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1874, "benchmark_id": "graphwalks-bfs-<128k", "model_id": "gpt-4.1-2025-04-14", "score": 0.617, "normalized_score": 0.617, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.294683+00:00", "updated_at": "2025-07-19T19:56:15.294683+00:00", "benchmark_name": "Graphwalks BFS <128k" }, { "model_benchmark_id": 1877, "benchmark_id": "graphwalks-bfs->128k", "model_id": "gpt-4.1-2025-04-14", "score": 0.19, "normalized_score": 0.19, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.302353+00:00", "updated_at": "2025-07-19T19:56:15.302353+00:00", "benchmark_name": "Graphwalks BFS >128k" }, { "model_benchmark_id": 1881, "benchmark_id": "graphwalks-parents-<128k", "model_id": "gpt-4.1-2025-04-14", "score": 0.58, "normalized_score": 0.58, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.312231+00:00", "updated_at": "2025-07-19T19:56:15.312231+00:00", "benchmark_name": "Graphwalks parents <128k" }, { "model_benchmark_id": 1886, "benchmark_id": "graphwalks-parents->128k", "model_id": "gpt-4.1-2025-04-14", "score": 0.25, "normalized_score": 0.25, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.324002+00:00", "updated_at": "2025-07-19T19:56:15.324002+00:00", 
"benchmark_name": "Graphwalks parents >128k" }, { "model_benchmark_id": 635, "benchmark_id": "ifeval", "model_id": "gpt-4.1-2025-04-14", "score": 0.874, "normalized_score": 0.874, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.304284+00:00", "updated_at": "2025-07-19T19:56:12.304284+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1848, "benchmark_id": "internal-api-instruction-following-(hard)", "model_id": "gpt-4.1-2025-04-14", "score": 0.491, "normalized_score": 0.491, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.230360+00:00", "updated_at": "2025-07-19T19:56:15.230360+00:00", "benchmark_name": "Internal API instruction following (hard)" }, { "model_benchmark_id": 543, "benchmark_id": "mathvista", "model_id": "gpt-4.1-2025-04-14", "score": 0.722, "normalized_score": 0.722, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.121168+00:00", "updated_at": "2025-07-19T19:56:12.121168+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 121, "benchmark_id": "mmlu", "model_id": "gpt-4.1-2025-04-14", "score": 0.902, "normalized_score": 0.902, "is_self_reported": true, 
"self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.323612+00:00", "updated_at": "2025-07-19T19:56:11.323612+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1483, "benchmark_id": "mmmlu", "model_id": "gpt-4.1-2025-04-14", "score": 0.873, "normalized_score": 0.873, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.161058+00:00", "updated_at": "2025-07-19T19:56:14.161058+00:00", "benchmark_name": "MMMLU" }, { "model_benchmark_id": 593, "benchmark_id": "mmmu", "model_id": "gpt-4.1-2025-04-14", "score": 0.748, "normalized_score": 0.748, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.222754+00:00", "updated_at": "2025-07-19T19:56:12.222754+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 743, "benchmark_id": "multichallenge", "model_id": "gpt-4.1-2025-04-14", "score": 0.383, "normalized_score": 0.383, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark (GPT-4o grader)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:12.561934+00:00", "updated_at": "2025-07-19T19:56:12.561934+00:00", "benchmark_name": "MultiChallenge" }, { "model_benchmark_id": 1854, "benchmark_id": "multichallenge-(o3-mini-grader)", "model_id": "gpt-4.1-2025-04-14", "score": 0.462, "normalized_score": 0.462, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark (o3-mini grader, see footnote [3])", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.244951+00:00", "updated_at": "2025-07-19T19:56:15.244951+00:00", "benchmark_name": "MultiChallenge (o3-mini grader)" }, { "model_benchmark_id": 1653, "benchmark_id": "multi-if", "model_id": "gpt-4.1-2025-04-14", "score": 0.708, "normalized_score": 0.708, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.648170+00:00", "updated_at": "2025-07-19T19:56:14.648170+00:00", "benchmark_name": "Multi-IF" }, { "model_benchmark_id": 1866, "benchmark_id": "openai-mrcr:-2-needle-128k", "model_id": "gpt-4.1-2025-04-14", "score": 0.572, "normalized_score": 0.572, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.275855+00:00", "updated_at": "2025-07-19T19:56:15.275855+00:00", "benchmark_name": "OpenAI-MRCR: 2 
needle 128k" }, { "model_benchmark_id": 1871, "benchmark_id": "openai-mrcr:-2-needle-1m", "model_id": "gpt-4.1-2025-04-14", "score": 0.463, "normalized_score": 0.463, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.286394+00:00", "updated_at": "2025-07-19T19:56:15.286394+00:00", "benchmark_name": "OpenAI-MRCR: 2 needle 1M" }, { "model_benchmark_id": 1358, "benchmark_id": "swe-bench-verified", "model_id": "gpt-4.1-2025-04-14", "score": 0.546, "normalized_score": 0.546, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Internal methodology, see source footnote [2]", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.858938+00:00", "updated_at": "2025-07-19T19:56:13.858938+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1780, "benchmark_id": "tau-bench-airline", "model_id": "gpt-4.1-2025-04-14", "score": 0.494, "normalized_score": 0.494, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Avg 5 runs, no custom tools/prompting (footnote [4])", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.015514+00:00", "updated_at": "2025-07-19T19:56:15.015514+00:00", "benchmark_name": "TAU-bench Airline" }, { "model_benchmark_id": 1766, "benchmark_id": "tau-bench-retail", "model_id": "gpt-4.1-2025-04-14", "score": 0.68, 
"normalized_score": 0.68, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Avg 5 runs, no custom tools/prompting (footnote [4], GPT-4o user model)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.986496+00:00", "updated_at": "2025-07-19T19:56:14.986496+00:00", "benchmark_name": "TAU-bench Retail" }, { "model_benchmark_id": 1907, "benchmark_id": "video-mme-(long,-no-subtitles)", "model_id": "gpt-4.1-2025-04-14", "score": 0.72, "normalized_score": 0.72, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.377204+00:00", "updated_at": "2025-07-19T19:56:15.377204+00:00", "benchmark_name": "Video-MME (long, no subtitles)" }, { "model_benchmark_id": 10011, "benchmark_id": "aime-2025", "model_id": "gpt-4.1-2025-04-14", "score": 0.464, "normalized_score": 0.464, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "GPT-4.1 with no tools - Competition mathematics (AIME 2025).", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 10012, "benchmark_id": "humanity's-last-exam", "model_id": "gpt-4.1-2025-04-14", "score": 0.054, "normalized_score": 0.054, "is_self_reported": true, "self_reported_source_link": 
"https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "GPT-4.1 with no tools - Expert-level questions across subjects.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 10013, "benchmark_id": "hmmt-2025", "model_id": "gpt-4.1-2025-04-14", "score": 0.289, "normalized_score": 0.289, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "GPT-4.1 with no tools - Harvard-MIT Mathematics Tournament.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "HMMT 2025" } ] ================================================ FILE: data/organizations/openai/models/gpt-4.1-2025-04-14/model.json ================================================ { "model_id": "gpt-4.1-2025-04-14", "name": "GPT-4.1", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "GPT-4.1 is OpenAI's latest and most advanced flagship model, significantly improving upon GPT-4 Turbo in performance across benchmarks, speed, and cost-effectiveness.", "release_date": "2025-04-14", "announcement_date": "2025-04-14", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-06-01", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/models/gpt-4.1", "source_playground": "https://platform.openai.com/playground?mode=chat&model=gpt-4.1", "source_paper": null, "source_scorecard_blog_link": 
"https://openai.com/index/gpt-4-1/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.841143+00:00", "updated_at": "2025-07-19T19:49:05.841143+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/gpt-4.1-mini-2025-04-14/benchmarks.json ================================================ [ { "model_benchmark_id": 667, "benchmark_id": "aider-polyglot", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.347, "normalized_score": 0.347, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.382631+00:00", "updated_at": "2025-07-19T19:56:12.382631+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 1331, "benchmark_id": "aider-polyglot-edit", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.316, "normalized_score": 0.316, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.801113+00:00", "updated_at": "2025-07-19T19:56:13.801113+00:00", "benchmark_name": "Aider-Polyglot Edit" }, { "model_benchmark_id": 482, "benchmark_id": "aime-2024", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.496, "normalized_score": 0.496, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.013761+00:00", "updated_at": "2025-07-19T19:56:12.013761+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 1887, "benchmark_id": "charxiv-d", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.884, "normalized_score": 0.884, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.327509+00:00", "updated_at": "2025-07-19T19:56:15.327509+00:00", "benchmark_name": "CharXiv-D" }, { "model_benchmark_id": 1834, "benchmark_id": "charxiv-r", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.568, "normalized_score": 0.568, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.195563+00:00", "updated_at": "2025-07-19T19:56:15.195563+00:00", "benchmark_name": "CharXiv-R" }, { "model_benchmark_id": 1857, "benchmark_id": "collie", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.546, "normalized_score": 0.546, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.255006+00:00", "updated_at": "2025-07-19T19:56:15.255006+00:00", "benchmark_name": "COLLIE" }, { "model_benchmark_id": 1892, "benchmark_id": "complexfuncbench", 
"model_id": "gpt-4.1-mini-2025-04-14", "score": 0.493, "normalized_score": 0.493, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.339307+00:00", "updated_at": "2025-07-19T19:56:15.339307+00:00", "benchmark_name": "ComplexFuncBench" }, { "model_benchmark_id": 348, "benchmark_id": "gpqa", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.65, "normalized_score": 0.65, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.752534+00:00", "updated_at": "2025-07-19T19:56:11.752534+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1872, "benchmark_id": "graphwalks-bfs-<128k", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.617, "normalized_score": 0.617, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.289789+00:00", "updated_at": "2025-07-19T19:56:15.289789+00:00", "benchmark_name": "Graphwalks BFS <128k" }, { "model_benchmark_id": 1875, "benchmark_id": "graphwalks-bfs->128k", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.15, "normalized_score": 0.15, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, 
"analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.298708+00:00", "updated_at": "2025-07-19T19:56:15.298708+00:00", "benchmark_name": "Graphwalks BFS >128k" }, { "model_benchmark_id": 1878, "benchmark_id": "graphwalks-parents-<128k", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.605, "normalized_score": 0.605, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.306151+00:00", "updated_at": "2025-07-19T19:56:15.306151+00:00", "benchmark_name": "Graphwalks parents <128k" }, { "model_benchmark_id": 1884, "benchmark_id": "graphwalks-parents->128k", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.11, "normalized_score": 0.11, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.319823+00:00", "updated_at": "2025-07-19T19:56:15.319823+00:00", "benchmark_name": "Graphwalks parents >128k" }, { "model_benchmark_id": 632, "benchmark_id": "ifeval", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.841, "normalized_score": 0.841, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:12.299050+00:00", "updated_at": "2025-07-19T19:56:12.299050+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1845, "benchmark_id": "internal-api-instruction-following-(hard)", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.451, "normalized_score": 0.451, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.225405+00:00", "updated_at": "2025-07-19T19:56:15.225405+00:00", "benchmark_name": "Internal API instruction following (hard)" }, { "model_benchmark_id": 539, "benchmark_id": "mathvista", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.731, "normalized_score": 0.731, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.114367+00:00", "updated_at": "2025-07-19T19:56:12.114367+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 117, "benchmark_id": "mmlu", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.875, "normalized_score": 0.875, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.317652+00:00", "updated_at": "2025-07-19T19:56:11.317652+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1481, "benchmark_id": "mmmlu", "model_id": 
"gpt-4.1-mini-2025-04-14", "score": 0.785, "normalized_score": 0.785, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.157799+00:00", "updated_at": "2025-07-19T19:56:14.157799+00:00", "benchmark_name": "MMMLU" }, { "model_benchmark_id": 590, "benchmark_id": "mmmu", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.727, "normalized_score": 0.727, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.217019+00:00", "updated_at": "2025-07-19T19:56:12.217019+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 740, "benchmark_id": "multichallenge", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.358, "normalized_score": 0.358, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark (GPT-4o grader)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.555824+00:00", "updated_at": "2025-07-19T19:56:12.555824+00:00", "benchmark_name": "MultiChallenge" }, { "model_benchmark_id": 1851, "benchmark_id": "multichallenge-(o3-mini-grader)", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.422, "normalized_score": 0.422, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, 
"analysis_method": "Standard benchmark (o3-mini grader, see footnote [3])", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.239021+00:00", "updated_at": "2025-07-19T19:56:15.239021+00:00", "benchmark_name": "MultiChallenge (o3-mini grader)" }, { "model_benchmark_id": 1650, "benchmark_id": "multi-if", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.67, "normalized_score": 0.67, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.643303+00:00", "updated_at": "2025-07-19T19:56:14.643303+00:00", "benchmark_name": "Multi-IF" }, { "model_benchmark_id": 1863, "benchmark_id": "openai-mrcr:-2-needle-128k", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.472, "normalized_score": 0.472, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.270008+00:00", "updated_at": "2025-07-19T19:56:15.270008+00:00", "benchmark_name": "OpenAI-MRCR: 2 needle 128k" }, { "model_benchmark_id": 1869, "benchmark_id": "openai-mrcr:-2-needle-1m", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.333, "normalized_score": 0.333, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:15.282718+00:00", "updated_at": "2025-07-19T19:56:15.282718+00:00", "benchmark_name": "OpenAI-MRCR: 2 needle 1M" }, { "model_benchmark_id": 1355, "benchmark_id": "swe-bench-verified", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.236, "normalized_score": 0.236, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Internal methodology, see source footnote [2]", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.852737+00:00", "updated_at": "2025-07-19T19:56:13.852737+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1776, "benchmark_id": "tau-bench-airline", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.36, "normalized_score": 0.36, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Avg 5 runs, no custom tools/prompting (footnote [4])", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.007636+00:00", "updated_at": "2025-07-19T19:56:15.007636+00:00", "benchmark_name": "TAU-bench Airline" }, { "model_benchmark_id": 1762, "benchmark_id": "tau-bench-retail", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.558, "normalized_score": 0.558, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Avg 5 runs, no custom tools/prompting (footnote [4], GPT-4o user model)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.978528+00:00", 
"updated_at": "2025-07-19T19:56:14.978528+00:00", "benchmark_name": "TAU-bench Retail" }, { "model_benchmark_id": 10014, "benchmark_id": "aime-2025", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.402, "normalized_score": 0.402, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "GPT-4.1 mini with no tools - Competition mathematics (AIME 2025).", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 10015, "benchmark_id": "humanity's-last-exam", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.037, "normalized_score": 0.037, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "GPT-4.1 mini with no tools - Expert-level questions across subjects.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 10016, "benchmark_id": "hmmt-2025", "model_id": "gpt-4.1-mini-2025-04-14", "score": 0.35, "normalized_score": 0.35, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "GPT-4.1 mini with no tools - Harvard-MIT Mathematics Tournament.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "HMMT 
2025" } ] ================================================ FILE: data/organizations/openai/models/gpt-4.1-mini-2025-04-14/model.json ================================================ { "model_id": "gpt-4.1-mini-2025-04-14", "name": "GPT-4.1 mini", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "GPT-4.1 mini provides a balance between intelligence, speed, and cost. It's a significant leap in small model performance, even beating GPT-4o in many benchmarks while reducing latency and cost.", "release_date": "2025-04-14", "announcement_date": "2025-04-14", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-05-31", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/models/gpt-4.1-mini", "source_playground": "https://platform.openai.com/playground?mode=chat&model=gpt-4.1-mini", "source_paper": null, "source_scorecard_blog_link": "https://openai.com/index/gpt-4-1/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.821382+00:00", "updated_at": "2025-07-19T19:49:05.821382+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/gpt-4.1-nano-2025-04-14/benchmarks.json ================================================ [ { "model_benchmark_id": 669, "benchmark_id": "aider-polyglot", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.098, "normalized_score": 0.098, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.385924+00:00", "updated_at": "2025-07-19T19:56:12.385924+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 1333, "benchmark_id": "aider-polyglot-edit", 
"model_id": "gpt-4.1-nano-2025-04-14", "score": 0.062, "normalized_score": 0.062, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.804864+00:00", "updated_at": "2025-07-19T19:56:13.804864+00:00", "benchmark_name": "Aider-Polyglot Edit" }, { "model_benchmark_id": 484, "benchmark_id": "aime-2024", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.294, "normalized_score": 0.294, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.016856+00:00", "updated_at": "2025-07-19T19:56:12.016856+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 1888, "benchmark_id": "charxiv-d", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.739, "normalized_score": 0.739, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.329021+00:00", "updated_at": "2025-07-19T19:56:15.329021+00:00", "benchmark_name": "CharXiv-D" }, { "model_benchmark_id": 1836, "benchmark_id": "charxiv-r", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.405, "normalized_score": 0.405, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.199274+00:00", "updated_at": "2025-07-19T19:56:15.199274+00:00", "benchmark_name": "CharXiv-R" }, { "model_benchmark_id": 1858, "benchmark_id": "collie", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.425, "normalized_score": 0.425, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.257208+00:00", "updated_at": "2025-07-19T19:56:15.257208+00:00", "benchmark_name": "COLLIE" }, { "model_benchmark_id": 1893, "benchmark_id": "complexfuncbench", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.057, "normalized_score": 0.057, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.341699+00:00", "updated_at": "2025-07-19T19:56:15.341699+00:00", "benchmark_name": "ComplexFuncBench" }, { "model_benchmark_id": 350, "benchmark_id": "gpqa", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.503, "normalized_score": 0.503, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.756178+00:00", "updated_at": "2025-07-19T19:56:11.756178+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1873, "benchmark_id": "graphwalks-bfs-<128k", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.25, "normalized_score": 0.25, 
"is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.291775+00:00", "updated_at": "2025-07-19T19:56:15.291775+00:00", "benchmark_name": "Graphwalks BFS <128k" }, { "model_benchmark_id": 1876, "benchmark_id": "graphwalks-bfs->128k", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.029, "normalized_score": 0.029, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.300453+00:00", "updated_at": "2025-07-19T19:56:15.300453+00:00", "benchmark_name": "Graphwalks BFS >128k" }, { "model_benchmark_id": 1879, "benchmark_id": "graphwalks-parents-<128k", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.094, "normalized_score": 0.094, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.308330+00:00", "updated_at": "2025-07-19T19:56:15.308330+00:00", "benchmark_name": "Graphwalks parents <128k" }, { "model_benchmark_id": 1885, "benchmark_id": "graphwalks-parents->128k", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.056, "normalized_score": 0.056, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.322097+00:00", "updated_at": "2025-07-19T19:56:15.322097+00:00", "benchmark_name": "Graphwalks parents >128k" }, { "model_benchmark_id": 633, "benchmark_id": "ifeval", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.745, "normalized_score": 0.745, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.300562+00:00", "updated_at": "2025-07-19T19:56:12.300562+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1846, "benchmark_id": "internal-api-instruction-following-(hard)", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.316, "normalized_score": 0.316, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.227248+00:00", "updated_at": "2025-07-19T19:56:15.227248+00:00", "benchmark_name": "Internal API instruction following (hard)" }, { "model_benchmark_id": 541, "benchmark_id": "mathvista", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.562, "normalized_score": 0.562, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.117553+00:00", "updated_at": "2025-07-19T19:56:12.117553+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 118, "benchmark_id": "mmlu", "model_id": 
"gpt-4.1-nano-2025-04-14", "score": 0.801, "normalized_score": 0.801, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.319012+00:00", "updated_at": "2025-07-19T19:56:11.319012+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1482, "benchmark_id": "mmmlu", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.669, "normalized_score": 0.669, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.159419+00:00", "updated_at": "2025-07-19T19:56:14.159419+00:00", "benchmark_name": "MMMLU" }, { "model_benchmark_id": 592, "benchmark_id": "mmmu", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.554, "normalized_score": 0.554, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.220951+00:00", "updated_at": "2025-07-19T19:56:12.220951+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 741, "benchmark_id": "multichallenge", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.15, "normalized_score": 0.15, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark (GPT-4o grader)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:12.557571+00:00", "updated_at": "2025-07-19T19:56:12.557571+00:00", "benchmark_name": "MultiChallenge" }, { "model_benchmark_id": 1852, "benchmark_id": "multichallenge-(o3-mini-grader)", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.311, "normalized_score": 0.311, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark (o3-mini grader, see footnote [3])", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.241054+00:00", "updated_at": "2025-07-19T19:56:15.241054+00:00", "benchmark_name": "MultiChallenge (o3-mini grader)" }, { "model_benchmark_id": 1651, "benchmark_id": "multi-if", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.572, "normalized_score": 0.572, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.645047+00:00", "updated_at": "2025-07-19T19:56:14.645047+00:00", "benchmark_name": "Multi-IF" }, { "model_benchmark_id": 1864, "benchmark_id": "openai-mrcr:-2-needle-128k", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.366, "normalized_score": 0.366, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.272341+00:00", "updated_at": "2025-07-19T19:56:15.272341+00:00", "benchmark_name": "OpenAI-MRCR: 2 needle 128k" }, { "model_benchmark_id": 1870, "benchmark_id": 
"openai-mrcr:-2-needle-1m", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.12, "normalized_score": 0.12, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Internal benchmark", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.284545+00:00", "updated_at": "2025-07-19T19:56:15.284545+00:00", "benchmark_name": "OpenAI-MRCR: 2 needle 1M" }, { "model_benchmark_id": 1778, "benchmark_id": "tau-bench-airline", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.14, "normalized_score": 0.14, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Avg 5 runs, no custom tools/prompting (footnote [4])", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.011934+00:00", "updated_at": "2025-07-19T19:56:15.011934+00:00", "benchmark_name": "TAU-bench Airline" }, { "model_benchmark_id": 1764, "benchmark_id": "tau-bench-retail", "model_id": "gpt-4.1-nano-2025-04-14", "score": 0.226, "normalized_score": 0.226, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Avg 5 runs, no custom tools/prompting (footnote [4], GPT-4o user model)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.982239+00:00", "updated_at": "2025-07-19T19:56:14.982239+00:00", "benchmark_name": "TAU-bench Retail" } ] ================================================ FILE: data/organizations/openai/models/gpt-4.1-nano-2025-04-14/model.json ================================================ { "model_id": "gpt-4.1-nano-2025-04-14", "name": 
"GPT-4.1 nano", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "GPT-4.1 nano is OpenAI's fastest and cheapest model available in the GPT-4.1 family. It delivers exceptional performance at a small size with its 1 million token context window. Ideal for tasks like classification or autocompletion.", "release_date": "2025-04-14", "announcement_date": "2025-04-14", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-05-31", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/models/gpt-4.1-nano", "source_playground": "https://platform.openai.com/playground?mode=chat&model=gpt-4.1-nano", "source_paper": null, "source_scorecard_blog_link": "https://openai.com/index/gpt-4-1/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.827978+00:00", "updated_at": "2025-07-19T19:49:05.827978+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/gpt-4.5/benchmarks.json ================================================ [ { "model_benchmark_id": 1337, "benchmark_id": "aider-polyglot-edit", "model_id": "gpt-4.5", "score": 0.449, "normalized_score": 0.449, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.811839+00:00", "updated_at": "2025-07-19T19:56:13.811839+00:00", "benchmark_name": "Aider-Polyglot Edit" }, { "model_benchmark_id": 489, "benchmark_id": "aime-2024", "model_id": "gpt-4.5", "score": 0.367, "normalized_score": 0.367, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.024273+00:00", "updated_at": "2025-07-19T19:56:12.024273+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 1891, "benchmark_id": "charxiv-d", "model_id": "gpt-4.5", "score": 0.9, "normalized_score": 0.9, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.335527+00:00", "updated_at": "2025-07-19T19:56:15.335527+00:00", "benchmark_name": "CharXiv-D" }, { "model_benchmark_id": 1839, "benchmark_id": "charxiv-r", "model_id": "gpt-4.5", "score": 0.554, "normalized_score": 0.554, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.204875+00:00", "updated_at": "2025-07-19T19:56:15.204875+00:00", "benchmark_name": "CharXiv-R" }, { "model_benchmark_id": 1862, "benchmark_id": "collie", "model_id": "gpt-4.5", "score": 0.723, "normalized_score": 0.723, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.265565+00:00", "updated_at": "2025-07-19T19:56:15.265565+00:00", "benchmark_name": "COLLIE" }, { "model_benchmark_id": 1897, "benchmark_id": "complexfuncbench", "model_id": "gpt-4.5", "score": 0.63, "normalized_score": 0.63, "is_self_reported": true, 
"self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.351430+00:00", "updated_at": "2025-07-19T19:56:15.351430+00:00", "benchmark_name": "ComplexFuncBench" }, { "model_benchmark_id": 357, "benchmark_id": "gpqa", "model_id": "gpt-4.5", "score": 0.695, "normalized_score": 0.695, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy (Diamond)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.767414+00:00", "updated_at": "2025-07-19T19:56:11.767414+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1906, "benchmark_id": "graphwalks-bfs-<128k", "model_id": "gpt-4.5", "score": 0.723, "normalized_score": 0.723, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.372855+00:00", "updated_at": "2025-07-19T19:56:15.372855+00:00", "benchmark_name": "Graphwalks BFS <128k" }, { "model_benchmark_id": 1883, "benchmark_id": "graphwalks-parents-<128k", "model_id": "gpt-4.5", "score": 0.726, "normalized_score": 0.726, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.315697+00:00", "updated_at": "2025-07-19T19:56:15.315697+00:00", 
"benchmark_name": "Graphwalks parents <128k" }, { "model_benchmark_id": 1015, "benchmark_id": "gsm8k", "model_id": "gpt-4.5", "score": 0.97, "normalized_score": 0.97, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-4-5/", "verified_by_llmstats": false, "analysis_method": "Answer accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.114869+00:00", "updated_at": "2025-07-19T19:56:13.114869+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 813, "benchmark_id": "humaneval", "model_id": "gpt-4.5", "score": 0.88, "normalized_score": 0.88, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-4-5/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.694244+00:00", "updated_at": "2025-07-19T19:56:12.694244+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 637, "benchmark_id": "ifeval", "model_id": "gpt-4.5", "score": 0.882, "normalized_score": 0.882, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.307682+00:00", "updated_at": "2025-07-19T19:56:12.307682+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1850, "benchmark_id": "internal-api-instruction-following-(hard)", "model_id": "gpt-4.5", "score": 0.54, "normalized_score": 0.54, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.234022+00:00", "updated_at": "2025-07-19T19:56:15.234022+00:00", "benchmark_name": "Internal API instruction following (hard)" }, { "model_benchmark_id": 545, "benchmark_id": "mathvista", "model_id": "gpt-4.5", "score": 0.723, "normalized_score": 0.723, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.124115+00:00", "updated_at": "2025-07-19T19:56:12.124115+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 124, "benchmark_id": "mmlu", "model_id": "gpt-4.5", "score": 0.908, "normalized_score": 0.908, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Multiple-choice accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.328688+00:00", "updated_at": "2025-07-19T19:56:11.328688+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1485, "benchmark_id": "mmmlu", "model_id": "gpt-4.5", "score": 0.851, "normalized_score": 0.851, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.164320+00:00", "updated_at": "2025-07-19T19:56:14.164320+00:00", "benchmark_name": "MMMLU" }, { "model_benchmark_id": 595, "benchmark_id": "mmmu", "model_id": "gpt-4.5", "score": 0.752, "normalized_score": 0.752, "is_self_reported": true, 
"self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.226731+00:00", "updated_at": "2025-07-19T19:56:12.226731+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 744, "benchmark_id": "multichallenge", "model_id": "gpt-4.5", "score": 0.438, "normalized_score": 0.438, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.563438+00:00", "updated_at": "2025-07-19T19:56:12.563438+00:00", "benchmark_name": "MultiChallenge" }, { "model_benchmark_id": 1856, "benchmark_id": "multichallenge-(o3-mini-grader)", "model_id": "gpt-4.5", "score": 0.501, "normalized_score": 0.501, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.249385+00:00", "updated_at": "2025-07-19T19:56:15.249385+00:00", "benchmark_name": "MultiChallenge (o3-mini grader)" }, { "model_benchmark_id": 1655, "benchmark_id": "multi-if", "model_id": "gpt-4.5", "score": 0.708, "normalized_score": 0.708, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.652033+00:00", "updated_at": "2025-07-19T19:56:14.652033+00:00", 
"benchmark_name": "Multi-IF" }, { "model_benchmark_id": 1868, "benchmark_id": "openai-mrcr:-2-needle-128k", "model_id": "gpt-4.5", "score": 0.385, "normalized_score": 0.385, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.279311+00:00", "updated_at": "2025-07-19T19:56:15.279311+00:00", "benchmark_name": "OpenAI-MRCR: 2 needle 128k" }, { "model_benchmark_id": 240, "benchmark_id": "simpleqa", "model_id": "gpt-4.5", "score": 0.625, "normalized_score": 0.625, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-4-5/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.559622+00:00", "updated_at": "2025-07-19T19:56:11.559622+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 1360, "benchmark_id": "swe-bench-verified", "model_id": "gpt-4.5", "score": 0.38, "normalized_score": 0.38, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Success rate", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.863719+00:00", "updated_at": "2025-07-19T19:56:13.863719+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1900, "benchmark_id": "swe-lancer", "model_id": "gpt-4.5", "score": 0.373, "normalized_score": 0.373, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Success rate ($186K equivalent)", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.358579+00:00", "updated_at": "2025-07-19T19:56:15.358579+00:00", "benchmark_name": "SWE-Lancer" }, { "model_benchmark_id": 1903, "benchmark_id": "swe-lancer-(ic-diamond-subset)", "model_id": "gpt-4.5", "score": 0.174, "normalized_score": 0.174, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Success rate ($41K equivalent)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.365353+00:00", "updated_at": "2025-07-19T19:56:15.365353+00:00", "benchmark_name": "SWE-Lancer (IC-Diamond subset)" }, { "model_benchmark_id": 1782, "benchmark_id": "tau-bench-airline", "model_id": "gpt-4.5", "score": 0.5, "normalized_score": 0.5, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.020093+00:00", "updated_at": "2025-07-19T19:56:15.020093+00:00", "benchmark_name": "TAU-bench Airline" }, { "model_benchmark_id": 1768, "benchmark_id": "tau-bench-retail", "model_id": "gpt-4.5", "score": 0.684, "normalized_score": 0.684, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.989887+00:00", "updated_at": "2025-07-19T19:56:14.989887+00:00", "benchmark_name": "TAU-bench Retail" } ] ================================================ FILE: 
data/organizations/openai/models/gpt-4.5/model.json ================================================ { "model_id": "gpt-4.5", "name": "GPT-4.5", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "GPT-4.5 is OpenAI's most advanced model, offering improved reasoning, coding, and creative capabilities with faster performance and longer context handling than GPT-4. It features enhanced instruction following, reduced hallucinations, and better factual accuracy.", "release_date": "2025-02-27", "announcement_date": "2025-02-27", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/models/gpt-4-5#gpt-4-5", "source_playground": "https://platform.openai.com/playground", "source_paper": null, "source_scorecard_blog_link": "https://openai.com/index/introducing-gpt-4-5/", "source_repo_link": "https://github.com/openai", "source_weights_link": null, "created_at": "2025-07-19T19:49:05.852855+00:00", "updated_at": "2025-07-19T19:49:05.852855+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/gpt-4o-2024-05-13/benchmarks.json ================================================ [ { "model_benchmark_id": 962, "benchmark_id": "drop", "model_id": "gpt-4o-2024-05-13", "score": 0.834, "normalized_score": 0.834, "is_self_reported": true, "self_reported_source_link": "https://openai.com/blog/gpt-4o", "verified_by_llmstats": false, "analysis_method": "F1 Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.023727+00:00", "updated_at": "2025-07-19T19:56:13.023727+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 352, "benchmark_id": "gpqa", "model_id": "gpt-4o-2024-05-13", "score": 0.536, "normalized_score": 0.536, 
"is_self_reported": true, "self_reported_source_link": "https://openai.com/blog/gpt-4o", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.759539+00:00", "updated_at": "2025-07-19T19:56:11.759539+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 811, "benchmark_id": "humaneval", "model_id": "gpt-4o-2024-05-13", "score": 0.902, "normalized_score": 0.902, "is_self_reported": true, "self_reported_source_link": "https://openai.com/blog/gpt-4o", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.689969+00:00", "updated_at": "2025-07-19T19:56:12.689969+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 427, "benchmark_id": "math", "model_id": "gpt-4o-2024-05-13", "score": 0.766, "normalized_score": 0.766, "is_self_reported": true, "self_reported_source_link": "https://openai.com/blog/gpt-4o", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.903446+00:00", "updated_at": "2025-07-19T19:56:11.903446+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 542, "benchmark_id": "mathvista", "model_id": "gpt-4o-2024-05-13", "score": 0.638, "normalized_score": 0.638, "is_self_reported": true, "self_reported_source_link": "https://openai.com/blog/gpt-4o", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.119289+00:00", "updated_at": "2025-07-19T19:56:12.119289+00:00", "benchmark_name": "MathVista" 
}, { "model_benchmark_id": 1297, "benchmark_id": "mgsm", "model_id": "gpt-4o-2024-05-13", "score": 0.905, "normalized_score": 0.905, "is_self_reported": true, "self_reported_source_link": "https://openai.com/blog/gpt-4o", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.714155+00:00", "updated_at": "2025-07-19T19:56:13.714155+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 120, "benchmark_id": "mmlu", "model_id": "gpt-4o-2024-05-13", "score": 0.887, "normalized_score": 0.887, "is_self_reported": true, "self_reported_source_link": "https://openai.com/blog/gpt-4o", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.322163+00:00", "updated_at": "2025-07-19T19:56:11.322163+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 219, "benchmark_id": "mmlu-pro", "model_id": "gpt-4o-2024-05-13", "score": 0.726, "normalized_score": 0.726, "is_self_reported": true, "self_reported_source_link": "https://openai.com/blog/gpt-4o", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.515262+00:00", "updated_at": "2025-07-19T19:56:11.515262+00:00", "benchmark_name": "MMLU-Pro" } ] ================================================ FILE: data/organizations/openai/models/gpt-4o-2024-05-13/model.json ================================================ { "model_id": "gpt-4o-2024-05-13", "name": "GPT-4o", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "GPT-4o ('o' for 'omni') is a multimodal AI model that accepts text, audio, image, and video inputs, 
and generates text, audio, and image outputs. It matches GPT-4 Turbo performance on text and code, with improvements in non-English languages, vision, and audio understanding.", "release_date": "2024-05-13", "announcement_date": "2024-05-13", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/api-reference", "source_playground": "https://chat.openai.com/", "source_paper": null, "source_scorecard_blog_link": "https://openai.com/index/hello-gpt-4o/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.838358+00:00", "updated_at": "2025-07-19T19:49:05.838358+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/gpt-4o-2024-08-06/benchmarks.json ================================================ [ { "model_benchmark_id": 1908, "benchmark_id": "activitynet", "model_id": "gpt-4o-2024-08-06", "score": 0.619, "normalized_score": 0.619, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/", "verified_by_llmstats": false, "analysis_method": "test set evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.381219+00:00", "updated_at": "2025-07-19T19:56:15.381219+00:00", "benchmark_name": "ActivityNet" }, { "model_benchmark_id": 1262, "benchmark_id": "ai2d", "model_id": "gpt-4o-2024-08-06", "score": 0.942, "normalized_score": 0.942, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/", "verified_by_llmstats": false, "analysis_method": "test set evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.646808+00:00", 
"updated_at": "2025-07-19T19:56:13.646808+00:00", "benchmark_name": "AI2D" }, { "model_benchmark_id": 672, "benchmark_id": "aider-polyglot", "model_id": "gpt-4o-2024-08-06", "score": 0.307, "normalized_score": 0.307, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.391433+00:00", "updated_at": "2025-07-19T19:56:12.391433+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 1336, "benchmark_id": "aider-polyglot-edit", "model_id": "gpt-4o-2024-08-06", "score": 0.182, "normalized_score": 0.182, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.810263+00:00", "updated_at": "2025-07-19T19:56:13.810263+00:00", "benchmark_name": "Aider-Polyglot Edit" }, { "model_benchmark_id": 488, "benchmark_id": "aime-2024", "model_id": "gpt-4o-2024-08-06", "score": 0.131, "normalized_score": 0.131, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.022775+00:00", "updated_at": "2025-07-19T19:56:12.022775+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 875, "benchmark_id": "chartqa", "model_id": "gpt-4o-2024-08-06", "score": 0.857, "normalized_score": 0.857, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/", "verified_by_llmstats": false, 
"analysis_method": "test set evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.824155+00:00", "updated_at": "2025-07-19T19:56:12.824155+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 1890, "benchmark_id": "charxiv-d", "model_id": "gpt-4o-2024-08-06", "score": 0.853, "normalized_score": 0.853, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.333294+00:00", "updated_at": "2025-07-19T19:56:15.333294+00:00", "benchmark_name": "CharXiv-D" }, { "model_benchmark_id": 1838, "benchmark_id": "charxiv-r", "model_id": "gpt-4o-2024-08-06", "score": 0.588, "normalized_score": 0.588, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-4o without thinking mode - Scientific figure reasoning and interpretation.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.203285+00:00", "updated_at": "2025-07-19T19:56:15.203285+00:00", "benchmark_name": "CharXiv-R" }, { "model_benchmark_id": 1861, "benchmark_id": "collie", "model_id": "gpt-4o-2024-08-06", "score": 0.61, "normalized_score": 0.61, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-4o without thinking mode - Instruction-following in freeform writing.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.262884+00:00", "updated_at": 
"2025-07-19T19:56:15.262884+00:00", "benchmark_name": "COLLIE" }, { "model_benchmark_id": 1867, "benchmark_id": "tau2-airline", "model_id": "gpt-4o-2024-08-06", "score": 0.455, "normalized_score": 0.455, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-4o without thinking mode - Function calling benchmark (airline domain).", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Tau2 airline" }, { "model_benchmark_id": 1868, "benchmark_id": "tau2-retail", "model_id": "gpt-4o-2024-08-06", "score": 0.634, "normalized_score": 0.634, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-4o without thinking mode - Function calling benchmark (retail domain).", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Tau2 retail" }, { "model_benchmark_id": 1869, "benchmark_id": "tau2-telecom", "model_id": "gpt-4o-2024-08-06", "score": 0.235, "normalized_score": 0.235, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-4o without thinking mode - Function calling benchmark (telecom domain).", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Tau2 telecom" }, { "model_benchmark_id": 1870, "benchmark_id": "mmmu-pro", "model_id": "gpt-4o-2024-08-06", 
"score": 0.599, "normalized_score": 0.599, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-4o without thinking mode - Graduate-level visual problem-solving with advanced multimodal reasoning.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "MMMU-Pro" }, { "model_benchmark_id": 1871, "benchmark_id": "videommmu", "model_id": "gpt-4o-2024-08-06", "score": 0.612, "normalized_score": 0.612, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-4o without thinking mode - Video-based multimodal reasoning (max frame 256).", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "VideoMMMU" }, { "model_benchmark_id": 1872, "benchmark_id": "erqa", "model_id": "gpt-4o-2024-08-06", "score": 0.352, "normalized_score": 0.352, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-4o without thinking mode - Multimodal spatial reasoning.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "ERQA" }, { "model_benchmark_id": 1896, "benchmark_id": "complexfuncbench", "model_id": "gpt-4o-2024-08-06", "score": 0.665, "normalized_score": 0.665, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": 
false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.349679+00:00", "updated_at": "2025-07-19T19:56:15.349679+00:00", "benchmark_name": "ComplexFuncBench" }, { "model_benchmark_id": 900, "benchmark_id": "docvqa", "model_id": "gpt-4o-2024-08-06", "score": 0.928, "normalized_score": 0.928, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/", "verified_by_llmstats": false, "analysis_method": "test set evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.873722+00:00", "updated_at": "2025-07-19T19:56:12.873722+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 926, "benchmark_id": "egoschema", "model_id": "gpt-4o-2024-08-06", "score": 0.722, "normalized_score": 0.722, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/", "verified_by_llmstats": false, "analysis_method": "test set evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.935728+00:00", "updated_at": "2025-07-19T19:56:12.935728+00:00", "benchmark_name": "EgoSchema" }, { "model_benchmark_id": 355, "benchmark_id": "gpqa", "model_id": "gpt-4o-2024-08-06", "score": 0.701, "normalized_score": 0.701, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-4o - Diamond no thinking no tools", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.764329+00:00", "updated_at": "2025-07-19T19:56:11.764329+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1905, 
"benchmark_id": "graphwalks-bfs-<128k", "model_id": "gpt-4o-2024-08-06", "score": 0.417, "normalized_score": 0.417, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.370259+00:00", "updated_at": "2025-07-19T19:56:15.370259+00:00", "benchmark_name": "Graphwalks BFS <128k" }, { "model_benchmark_id": 1882, "benchmark_id": "graphwalks-parents-<128k", "model_id": "gpt-4o-2024-08-06", "score": 0.354, "normalized_score": 0.354, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.314044+00:00", "updated_at": "2025-07-19T19:56:15.314044+00:00", "benchmark_name": "Graphwalks parents <128k" }, { "model_benchmark_id": 636, "benchmark_id": "ifeval", "model_id": "gpt-4o-2024-08-06", "score": 0.81, "normalized_score": 0.81, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.306083+00:00", "updated_at": "2025-07-19T19:56:12.306083+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1849, "benchmark_id": "internal-api-instruction-following-(hard)", "model_id": "gpt-4o-2024-08-06", "score": 0.292, "normalized_score": 0.292, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.232334+00:00", "updated_at": "2025-07-19T19:56:15.232334+00:00", "benchmark_name": "Internal API instruction following (hard)" }, { "model_benchmark_id": 544, "benchmark_id": "mathvista", "model_id": "gpt-4o-2024-08-06", "score": 0.614, "normalized_score": 0.614, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.122558+00:00", "updated_at": "2025-07-19T19:56:12.122558+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 122, "benchmark_id": "mmlu", "model_id": "gpt-4o-2024-08-06", "score": 0.857, "normalized_score": 0.857, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.325082+00:00", "updated_at": "2025-07-19T19:56:11.325082+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 220, "benchmark_id": "mmlu-pro", "model_id": "gpt-4o-2024-08-06", "score": 0.747, "normalized_score": 0.747, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", "verified_by_llmstats": false, "analysis_method": "0-shot CoT", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.517058+00:00", "updated_at": "2025-07-19T19:56:11.517058+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1484, "benchmark_id": "mmmlu", "model_id": "gpt-4o-2024-08-06", "score": 0.814, 
"normalized_score": 0.814, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.162717+00:00", "updated_at": "2025-07-19T19:56:14.162717+00:00", "benchmark_name": "MMMLU" }, { "model_benchmark_id": 594, "benchmark_id": "mmmu", "model_id": "gpt-4o-2024-08-06", "score": 0.722, "normalized_score": 0.722, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-4o without thinking mode - College-level visual problem-solving with multimodal reasoning.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.224513+00:00", "updated_at": "2025-07-19T19:56:12.224513+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1855, "benchmark_id": "multichallenge-(o3-mini-grader)", "model_id": "gpt-4o-2024-08-06", "score": 0.399, "normalized_score": 0.399, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.246431+00:00", "updated_at": "2025-07-19T19:56:15.246431+00:00", "benchmark_name": "MultiChallenge (o3-mini grader)" }, { "model_benchmark_id": 1654, "benchmark_id": "multi-if", "model_id": "gpt-4o-2024-08-06", "score": 0.609, "normalized_score": 0.609, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.650416+00:00", "updated_at": "2025-07-19T19:56:14.650416+00:00", "benchmark_name": "Multi-IF" }, { "model_benchmark_id": 1867, "benchmark_id": "openai-mrcr:-2-needle-128k", "model_id": "gpt-4o-2024-08-06", "score": 0.319, "normalized_score": 0.319, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.277538+00:00", "updated_at": "2025-07-19T19:56:15.277538+00:00", "benchmark_name": "OpenAI-MRCR: 2 needle 128k" }, { "model_benchmark_id": 239, "benchmark_id": "simpleqa", "model_id": "gpt-4o-2024-08-06", "score": 0.382, "normalized_score": 0.382, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-4-5/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.557852+00:00", "updated_at": "2025-07-19T19:56:11.557852+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 1359, "benchmark_id": "swe-bench-verified", "model_id": "gpt-4o-2024-08-06", "score": 0.332, "normalized_score": 0.332, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.861280+00:00", "updated_at": "2025-07-19T19:56:13.861280+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1899, "benchmark_id": "swe-lancer", "model_id": "gpt-4o-2024-08-06", "score": 0.326, 
"normalized_score": 0.326, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "percentage score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.356738+00:00", "updated_at": "2025-07-19T19:56:15.356738+00:00", "benchmark_name": "SWE-Lancer" }, { "model_benchmark_id": 1902, "benchmark_id": "swe-lancer-(ic-diamond-subset)", "model_id": "gpt-4o-2024-08-06", "score": 0.124, "normalized_score": 0.124, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "percentage score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.363614+00:00", "updated_at": "2025-07-19T19:56:15.363614+00:00", "benchmark_name": "SWE-Lancer (IC-Diamond subset)" }, { "model_benchmark_id": 1781, "benchmark_id": "tau-bench-airline", "model_id": "gpt-4o-2024-08-06", "score": 0.428, "normalized_score": 0.428, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.017725+00:00", "updated_at": "2025-07-19T19:56:15.017725+00:00", "benchmark_name": "TAU-bench Airline" }, { "model_benchmark_id": 1767, "benchmark_id": "tau-bench-retail", "model_id": "gpt-4o-2024-08-06", "score": 0.603, "normalized_score": 0.603, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:14.988086+00:00", "updated_at": "2025-07-19T19:56:14.988086+00:00", "benchmark_name": "TAU-bench Retail" }, { "model_benchmark_id": 2003, "benchmark_id": "humanity's-last-exam", "model_id": "gpt-4o-2024-08-06", "score": 0.053, "normalized_score": 0.053, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-4o without thinking mode (no tools) - Full set of expert-level questions across subjects.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 2005, "benchmark_id": "scale-multichallenge", "model_id": "gpt-4o-2024-08-06", "score": 0.403, "normalized_score": 0.403, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-4o without thinking mode - Multi-turn instruction following benchmark.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Scale MultiChallenge" } ] ================================================ FILE: data/organizations/openai/models/gpt-4o-2024-08-06/model.json ================================================ { "model_id": "gpt-4o-2024-08-06", "name": "GPT-4o", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "GPT-4o ('o' for 'omni') is a multimodal AI model that accepts text, audio, image, and video inputs, and generates text, audio, and image outputs. 
It matches GPT-4 Turbo performance on text and code, with improvements in non-English languages, vision, and audio understanding.", "release_date": "2024-08-06", "announcement_date": "2024-08-06", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/api-reference", "source_playground": "https://chat.openai.com/", "source_paper": null, "source_scorecard_blog_link": "https://openai.com/index/hello-gpt-4o/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.847621+00:00", "updated_at": "2025-07-19T19:49:05.847621+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/gpt-4o-mini-2024-07-18/benchmarks.json ================================================ [ { "model_benchmark_id": 964, "benchmark_id": "drop", "model_id": "gpt-4o-mini-2024-07-18", "score": 0.797, "normalized_score": 0.797, "is_self_reported": true, "self_reported_source_link": "https://openai.com/blog/gpt-4o-mini-announcement", "verified_by_llmstats": false, "analysis_method": "F1 Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.026741+00:00", "updated_at": "2025-07-19T19:56:13.026741+00:00", "benchmark_name": "DROP" }, { "model_benchmark_id": 361, "benchmark_id": "gpqa", "model_id": "gpt-4o-mini-2024-07-18", "score": 0.402, "normalized_score": 0.402, "is_self_reported": true, "self_reported_source_link": "https://openai.com/blog/gpt-4o-mini-announcement", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.774361+00:00", "updated_at": "2025-07-19T19:56:11.774361+00:00", 
"benchmark_name": "GPQA" }, { "model_benchmark_id": 816, "benchmark_id": "humaneval", "model_id": "gpt-4o-mini-2024-07-18", "score": 0.872, "normalized_score": 0.872, "is_self_reported": true, "self_reported_source_link": "https://openai.com/blog/gpt-4o-mini-announcement", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.700095+00:00", "updated_at": "2025-07-19T19:56:12.700095+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 431, "benchmark_id": "math", "model_id": "gpt-4o-mini-2024-07-18", "score": 0.702, "normalized_score": 0.702, "is_self_reported": true, "self_reported_source_link": "https://openai.com/blog/gpt-4o-mini-announcement", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.911917+00:00", "updated_at": "2025-07-19T19:56:11.911917+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 548, "benchmark_id": "mathvista", "model_id": "gpt-4o-mini-2024-07-18", "score": 0.567, "normalized_score": 0.567, "is_self_reported": true, "self_reported_source_link": "https://openai.com/blog/gpt-4o-mini-announcement", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.128984+00:00", "updated_at": "2025-07-19T19:56:12.128984+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1301, "benchmark_id": "mgsm", "model_id": "gpt-4o-mini-2024-07-18", "score": 0.87, "normalized_score": 0.87, "is_self_reported": true, "self_reported_source_link": "https://openai.com/blog/gpt-4o-mini-announcement", "verified_by_llmstats": false, "analysis_method": 
"Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.720445+00:00", "updated_at": "2025-07-19T19:56:13.720445+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 128, "benchmark_id": "mmlu", "model_id": "gpt-4o-mini-2024-07-18", "score": 0.82, "normalized_score": 0.82, "is_self_reported": true, "self_reported_source_link": "https://openai.com/blog/gpt-4o-mini-announcement", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.335061+00:00", "updated_at": "2025-07-19T19:56:11.335061+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 598, "benchmark_id": "mmmu", "model_id": "gpt-4o-mini-2024-07-18", "score": 0.594, "normalized_score": 0.594, "is_self_reported": true, "self_reported_source_link": "https://openai.com/blog/gpt-4o-mini-announcement", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.232157+00:00", "updated_at": "2025-07-19T19:56:12.232157+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1363, "benchmark_id": "swe-bench-verified", "model_id": "gpt-4o-mini-2024-07-18", "score": 0.087, "normalized_score": 0.087, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "Pass Rate", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.870038+00:00", "updated_at": "2025-07-19T19:56:13.870038+00:00", "benchmark_name": "SWE-Bench Verified" } ] ================================================ FILE: 
data/organizations/openai/models/gpt-4o-mini-2024-07-18/model.json ================================================ { "model_id": "gpt-4o-mini-2024-07-18", "name": "GPT-4o mini", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "GPT-4o mini is OpenAI's latest cost-efficient small model, designed to make AI intelligence more accessible and affordable. It excels in textual intelligence and multimodal reasoning, outperforming previous models like GPT-3.5 Turbo. With a context window of 128K tokens and support for text and vision, it offers low-cost, real-time applications such as customer support chatbots. Priced at 15 cents per million input tokens and 60 cents per million output tokens, it is significantly cheaper than its predecessors. Safety is prioritized with built-in measures and improved resistance to security threats.", "release_date": "2024-07-18", "announcement_date": "2024-07-18", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2023-10-01", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/api-reference", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.866393+00:00", "updated_at": "2025-07-19T19:49:05.866393+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/gpt-5-2025-08-07/benchmarks.json ================================================ [ { "model_benchmark_id": 9002, "benchmark_id": "swe-bench-verified", "model_id": "gpt-5-2025-08-07", "score": 0.749, "normalized_score": 0.749, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "Thinking mode enabled (up to 128K tokens) 
with enhanced reasoning capabilities and iterative problem-solving approach.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 9004, "benchmark_id": "aider-polyglot", "model_id": "gpt-5-2025-08-07", "score": 0.88, "normalized_score": 0.88, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "Thinking mode enabled (up to 128K tokens) with step-by-step reasoning and multi-language code understanding.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 10027, "benchmark_id": "swe-lancer-(ic-diamond-subset)", "model_id": "gpt-5-2025-08-07", "score": 1.0, "normalized_score": 1.0, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "GPT-5 - IC SWE Diamond Freelance Coding Tasks (earnings-based evaluation).", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "SWE-Lancer (IC-Diamond subset)" }, { "model_benchmark_id": 9020, "benchmark_id": "aime-2025", "model_id": "gpt-5-2025-08-07", "score": 0.946, "normalized_score": 0.946, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 standard with thinking mode enabled (no tools) - 
competition mathematics.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 9009, "benchmark_id": "mmmu", "model_id": "gpt-5-2025-08-07", "score": 0.842, "normalized_score": 0.842, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 with thinking mode - College-level visual problem-solving with multimodal reasoning.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 9006, "benchmark_id": "mmlu", "model_id": "gpt-5-2025-08-07", "score": 0.925, "normalized_score": 0.925, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "Standard benchmark across multiple academic subjects with comprehensive knowledge evaluation.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 9007, "benchmark_id": "humaneval", "model_id": "gpt-5-2025-08-07", "score": 0.934, "normalized_score": 0.934, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "Code generation benchmark with function completion tasks in Python.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 9008, "benchmark_id": "math", "model_id": "gpt-5-2025-08-07", "score": 0.847, "normalized_score": 0.847, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "Thinking mode enabled with step-by-step mathematical problem solving and verification.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 9013, "benchmark_id": "healthbench-hard", "model_id": "gpt-5-2025-08-07", "score": 0.016, "normalized_score": 0.016, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "Thinking mode enabled for medical hallucination detection. 
Measured inaccuracies on challenging healthcare conversations.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "HealthBench Hard" }, { "model_benchmark_id": 9024, "benchmark_id": "frontiermath", "model_id": "gpt-5-2025-08-07", "score": 0.263, "normalized_score": 0.263, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 standard with thinking mode enabled (with python tool only) - FrontierMath Tier 1-3 expert-level mathematics.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "FrontierMath" }, { "model_benchmark_id": 9028, "benchmark_id": "hmmt-2025", "model_id": "gpt-5-2025-08-07", "score": 0.933, "normalized_score": 0.933, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 standard with thinking mode enabled (no tools) - Harvard-MIT Mathematics Tournament.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "HMMT 2025" }, { "model_benchmark_id": 9032, "benchmark_id": "gpqa", "model_id": "gpt-5-2025-08-07", "score": 0.857, "normalized_score": 0.857, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 - Diamond thinking no tools", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 9037, "benchmark_id": "humanity's-last-exam", "model_id": "gpt-5-2025-08-07", "score": 0.248, "normalized_score": 0.248, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 standard with thinking mode (no tools) - Full set of expert-level questions across subjects.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 9041, "benchmark_id": "scale-multichallenge", "model_id": "gpt-5-2025-08-07", "score": 0.696, "normalized_score": 0.696, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 with thinking mode enabled - Multi-turn instruction following benchmark.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Scale MultiChallenge" }, { "model_benchmark_id": 9043, "benchmark_id": "browsecomp", "model_id": "gpt-5-2025-08-07", "score": 0.549, "normalized_score": 0.549, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 with thinking mode enabled - Agentic search & browsing benchmark.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": 
"2025-07-24T12:00:00.000000+00:00", "benchmark_name": "BrowseComp" }, { "model_benchmark_id": 9045, "benchmark_id": "collie", "model_id": "gpt-5-2025-08-07", "score": 0.99, "normalized_score": 0.99, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 with thinking mode enabled - Instruction-following in freeform writing.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "COLLIE" }, { "model_benchmark_id": 10034, "benchmark_id": "multichallenge-(o3-mini-grader)", "model_id": "gpt-5-2025-08-07", "score": 0.696, "normalized_score": 0.696, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "GPT-5 with o3-mini grader - Multi-turn instruction following benchmark with improved grading accuracy.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "MultiChallenge (o3-mini grader)" }, { "model_benchmark_id": 10035, "benchmark_id": "internal-api-instruction-following-(hard)", "model_id": "gpt-5-2025-08-07", "score": 0.64, "normalized_score": 0.64, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "GPT-5 - Internal API instruction following evaluation (hard difficulty).", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": 
"2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Internal API instruction following (hard)" }, { "model_benchmark_id": 9047, "benchmark_id": "tau2-airline", "model_id": "gpt-5-2025-08-07", "score": 0.626, "normalized_score": 0.626, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "GPT-5 - Function calling benchmark (airline domain).", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Tau2 airline" }, { "model_benchmark_id": 9049, "benchmark_id": "tau2-retail", "model_id": "gpt-5-2025-08-07", "score": 0.811, "normalized_score": 0.811, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "GPT-5 with thinking mode - Function calling benchmark (retail domain).", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Tau2 retail" }, { "model_benchmark_id": 9051, "benchmark_id": "tau2-telecom", "model_id": "gpt-5-2025-08-07", "score": 0.967, "normalized_score": 0.967, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "GPT-5 with thinking mode - Function calling benchmark (telecom domain).", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Tau2 telecom" }, { 
"model_benchmark_id": 9053, "benchmark_id": "mmmu-pro", "model_id": "gpt-5-2025-08-07", "score": 0.784, "normalized_score": 0.784, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 with thinking mode - Graduate-level visual problem-solving with advanced multimodal reasoning.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "MMMU-Pro" }, { "model_benchmark_id": 9055, "benchmark_id": "videommmu", "model_id": "gpt-5-2025-08-07", "score": 0.846, "normalized_score": 0.846, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 with thinking mode - Video-based multimodal reasoning (max frame 256).", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "VideoMMMU" }, { "model_benchmark_id": 9057, "benchmark_id": "charxiv-r", "model_id": "gpt-5-2025-08-07", "score": 0.811, "normalized_score": 0.811, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 with thinking mode - Scientific figure reasoning and interpretation.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "CharXiv-R" }, { "model_benchmark_id": 9059, "benchmark_id": "erqa", "model_id": "gpt-5-2025-08-07", "score": 0.657, "normalized_score": 0.657, "is_self_reported": true, 
"self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 with thinking mode - Multimodal spatial reasoning.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "ERQA" }, { "model_benchmark_id": 10048, "benchmark_id": "openai-mrcr:-2-needle-128k", "model_id": "gpt-5-2025-08-07", "score": 0.952, "normalized_score": 0.952, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "OpenAI-MRCR 2-needle retrieval at 128k tokens.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "OpenAI-MRCR: 2 needle 128k" }, { "model_benchmark_id": 10049, "benchmark_id": "openai-mrcr:-2-needle-256k", "model_id": "gpt-5-2025-08-07", "score": 0.868, "normalized_score": 0.868, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "OpenAI-MRCR 2-needle retrieval at 256k tokens.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "OpenAI-MRCR: 2 needle 256k" }, { "model_benchmark_id": 10050, "benchmark_id": "graphwalks-bfs-<128k", "model_id": "gpt-5-2025-08-07", "score": 0.783, "normalized_score": 0.783, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, 
"analysis_method": "Graphwalks BFS (<128k) long-context reasoning.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Graphwalks BFS <128k" }, { "model_benchmark_id": 10051, "benchmark_id": "graphwalks-parents-<128k", "model_id": "gpt-5-2025-08-07", "score": 0.733, "normalized_score": 0.733, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Graphwalks parents (<128k) long-context reasoning.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Graphwalks parents <128k" }, { "model_benchmark_id": 10052, "benchmark_id": "browsecomp-long-128k", "model_id": "gpt-5-2025-08-07", "score": 0.9, "normalized_score": 0.9, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "BrowseComp long-context 128k variant.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "BrowseComp Long Context 128k" }, { "model_benchmark_id": 10053, "benchmark_id": "browsecomp-long-256k", "model_id": "gpt-5-2025-08-07", "score": 0.888, "normalized_score": 0.888, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "BrowseComp long-context 256k variant.", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "BrowseComp Long Context 256k" }, { "model_benchmark_id": 10054, "benchmark_id": "videomme-w-sub.", "model_id": "gpt-5-2025-08-07", "score": 0.867, "normalized_score": 0.867, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "VideoMME (long) with subtitles category.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "VideoMME w sub." }, { "model_benchmark_id": 10069, "benchmark_id": "longfact-concepts", "model_id": "gpt-5-2025-08-07", "score": 0.007, "normalized_score": 0.007, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Thinking mode enabled for hallucination detection. Measured on open-source prompts for concept-based factual queries.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "LongFact-Concepts" }, { "model_benchmark_id": 10070, "benchmark_id": "longfact-objects", "model_id": "gpt-5-2025-08-07", "score": 0.008, "normalized_score": 0.008, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Thinking mode enabled for hallucination detection. 
Measured on open-source prompts for object-based factual queries.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "LongFact-Objects" }, { "model_benchmark_id": 10071, "benchmark_id": "factscore", "model_id": "gpt-5-2025-08-07", "score": 0.01, "normalized_score": 0.01, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/", "verified_by_llmstats": false, "analysis_method": "Thinking mode enabled for factual accuracy assessment. Measured hallucination rate on open-source prompts.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "FactScore" } ] ================================================ FILE: data/organizations/openai/models/gpt-5-2025-08-07/model.json ================================================ { "model_id": "gpt-5-2025-08-07", "name": "GPT-5", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "GPT-5 is our flagship model for coding, reasoning, and agentic tasks across domains. 
The best model for coding and agentic tasks with higher reasoning capabilities and medium speed.", "release_date": "2025-08-07", "announcement_date": "2025-08-07", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-09-30", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/models/gpt-5", "source_playground": "https://platform.openai.com/playground?mode=chat&model=gpt-5", "source_paper": "https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf", "source_scorecard_blog_link": "https://openai.com/index/gpt-5/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/gpt-5-codex-2025-09-15/benchmarks.json ================================================ [ { "model_benchmark_id": 10100, "benchmark_id": "swe-bench-verified", "model_id": "gpt-5-codex-2025-09-15", "score": 0.745, "normalized_score": 0.745, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-upgrades-to-codex/", "verified_by_llmstats": false, "analysis_method": "GPT-5 Codex specialized for code review and critical flaw detection with enhanced agentic coding capabilities.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-09-18T00:00:00.000000+00:00", "updated_at": "2025-09-18T00:00:00.000000+00:00", "benchmark_name": "SWE-Bench Verified" } ] ================================================ FILE: data/organizations/openai/models/gpt-5-codex-2025-09-15/model.json ================================================ { "model_id": "gpt-5-codex-2025-09-15", "name": "GPT-5 Codex", "organization_id": "openai", "fine_tuned_from_model_id": null, 
"description": "GPT-5 Codex has been trained specifically for conducting code reviews and finding critical flaws. When reviewing, it navigates your codebase and analyzes code patterns to identify potential security vulnerabilities, performance issues, and bugs.", "release_date": "2025-09-15", "announcement_date": "2025-09-15", "license_id": "proprietary", "multimodal": false, "knowledge_cutoff": "2024-09-30", "param_count": null, "training_tokens": null, "available_in_zeroeval": false, "source_api_ref": "https://platform.openai.com/docs/models/gpt-5-codex", "source_playground": "https://platform.openai.com/playground?mode=chat&model=gpt-5-codex", "source_paper": null, "source_scorecard_blog_link": "https://openai.com/index/introducing-upgrades-to-codex/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-09-18T00:00:00.000000+00:00", "updated_at": "2025-09-18T00:00:00.000000+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/gpt-5-mini-2025-08-07/benchmarks.json ================================================ [ { "model_benchmark_id": 9021, "benchmark_id": "aime-2025", "model_id": "gpt-5-mini-2025-08-07", "score": 0.911, "normalized_score": 0.911, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 mini with thinking mode enabled (no tools) - competition mathematics.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 9025, "benchmark_id": "frontiermath", "model_id": "gpt-5-mini-2025-08-07", "score": 0.221, "normalized_score": 0.221, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, 
"analysis_method": "GPT-5 mini with thinking mode enabled (with python tool only) - FrontierMath Tier 1-3 expert-level mathematics.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "FrontierMath" }, { "model_benchmark_id": 9033, "benchmark_id": "gpqa", "model_id": "gpt-5-mini-2025-08-07", "score": 0.823, "normalized_score": 0.823, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 mini - Diamond thinking no tools", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 9038, "benchmark_id": "humanity's-last-exam", "model_id": "gpt-5-mini-2025-08-07", "score": 0.167, "normalized_score": 0.167, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 mini with thinking mode (no tools) - Full set of expert-level questions across subjects.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 9029, "benchmark_id": "hmmt-2025", "model_id": "gpt-5-mini-2025-08-07", "score": 0.878, "normalized_score": 0.878, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 mini with thinking mode enabled (no tools) - Harvard-MIT Mathematics Tournament.", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "HMMT 2025" } ] ================================================ FILE: data/organizations/openai/models/gpt-5-mini-2025-08-07/model.json ================================================ { "model_id": "gpt-5-mini-2025-08-07", "name": "GPT-5 mini", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "A faster, more cost-efficient version of GPT-5 for well-defined tasks. Great for well-defined tasks and precise prompts with high reasoning capabilities at reduced cost.", "release_date": "2025-08-07", "announcement_date": "2025-08-07", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-05-30", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/models/gpt-5-mini", "source_playground": "https://platform.openai.com/playground?mode=chat&model=gpt-5-mini", "source_paper": null, "source_scorecard_blog_link": "https://openai.com/index/gpt-5/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/gpt-5-nano-2025-08-07/benchmarks.json ================================================ [ { "model_benchmark_id": 9022, "benchmark_id": "aime-2025", "model_id": "gpt-5-nano-2025-08-07", "score": 0.852, "normalized_score": 0.852, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 nano with thinking mode enabled (no tools) - competition mathematics.", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 9026, "benchmark_id": "frontiermath", "model_id": "gpt-5-nano-2025-08-07", "score": 0.096, "normalized_score": 0.096, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 nano with thinking mode enabled (with python tool only) - FrontierMath Tier 1-3 expert-level mathematics.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "FrontierMath" }, { "model_benchmark_id": 9034, "benchmark_id": "gpqa", "model_id": "gpt-5-nano-2025-08-07", "score": 0.712, "normalized_score": 0.712, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 nano - Diamond thinking no tools", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 9039, "benchmark_id": "humanity's-last-exam", "model_id": "gpt-5-nano-2025-08-07", "score": 0.087, "normalized_score": 0.087, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 nano with thinking mode (no tools) - Full set of expert-level questions across subjects.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": 
"2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 9030, "benchmark_id": "hmmt-2025", "model_id": "gpt-5-nano-2025-08-07", "score": 0.756, "normalized_score": 0.756, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "GPT-5 nano with thinking mode enabled (no tools) - Harvard-MIT Mathematics Tournament.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "HMMT 2025" } ] ================================================ FILE: data/organizations/openai/models/gpt-5-nano-2025-08-07/model.json ================================================ { "model_id": "gpt-5-nano-2025-08-07", "name": "GPT-5 nano", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "GPT-5 nano is our fastest, cheapest version of GPT-5. 
It's great for summarization and classification tasks with average reasoning capabilities and very fast speed.", "release_date": "2025-08-07", "announcement_date": "2025-08-07", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-05-30", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/models/gpt-5-nano", "source_playground": "https://platform.openai.com/playground?mode=chat&model=gpt-5-nano", "source_paper": null, "source_scorecard_blog_link": "https://openai.com/index/gpt-5/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/gpt-oss-120b/benchmarks.json ================================================ [ { "model_benchmark_id": 224, "benchmark_id": "codeforces", "model_id": "gpt-oss-120b", "score": 0.874, "normalized_score": 0.874, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Elo (with tools)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "Codeforces Competition code" }, { "model_benchmark_id": 224, "benchmark_id": "codeforces", "model_id": "gpt-oss-120b", "score": 0.821, "normalized_score": 0.821, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Elo (without tools)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", 
"updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "Codeforces Competition code" }, { "model_benchmark_id": 224, "benchmark_id": "humanity's-last-exam", "model_id": "gpt-oss-120b", "score": 0.19, "normalized_score": 0.19, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Accuracy (with tools)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 224, "benchmark_id": "humanity's-last-exam", "model_id": "gpt-oss-120b", "score": 0.149, "normalized_score": 0.149, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Accuracy (without tools)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 224, "benchmark_id": "healthbench", "model_id": "gpt-oss-120b", "score": 0.576, "normalized_score": 0.576, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "HealthBench - Realistic health conversations" }, { "model_benchmark_id": 225, "benchmark_id": "healthbench-hard", "model_id": "gpt-oss-120b", "score": 0.3, "normalized_score": 0.3, 
"is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "HealthBench Hard - Challenging health conversations" }, { "model_benchmark_id": 2226, "benchmark_id": "gpqa", "model_id": "gpt-oss-120b", "score": 0.801, "normalized_score": 0.801, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Without tools", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 22226, "benchmark_id": "mmlu", "model_id": "gpt-oss-120b", "score": 0.9, "normalized_score": 0.9, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Without tools", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "MMLU benchmark" }, { "model_benchmark_id": 22226, "benchmark_id": "tau-bench-retail", "model_id": "gpt-oss-120b", "score": 0.678, "normalized_score": 0.678, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Function calling", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "TAU-bench Retail benchmark" } ] ================================================ FILE: data/organizations/openai/models/gpt-oss-120b/model.json ================================================ { "model_id": "gpt-oss-120b", "name": "GPT OSS 120B", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "GPT-OSS-120B is an open-weight, 116.8B-parameter Mixture-of-Experts (MoE) language model from OpenAI designed for high-reasoning, agentic, and general-purpose production use cases. It activates 5.1B parameters per forward pass and is optimized to run on a single H100 GPU with native MXFP4 quantization. The model supports configurable reasoning depth, full chain-of-thought access, and native tool use, including function calling, browsing, and structured output generation. It achieves near-parity with OpenAI o4-mini on core reasoning benchmarks. Note: While referred to as '120b' for simplicity, it technically has 116.8B parameters.", "release_date": "2025-08-05", "announcement_date": "2025-08-05", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 116800000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://gpt-oss.com/", "source_paper": "https://cdn.openai.com/pdf/419b6906-9da6-406c-a19d-1bb078ac7637/oai_gpt-oss_model_card.pdf", "source_scorecard_blog_link": "https://openai.com/index/gpt-oss-model-card/", "source_repo_link": "https://github.com/openai/gpt-oss", "source_weights_link": "https://huggingface.co/openai/gpt-oss-120b", "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/gpt-oss-20b/benchmarks.json ================================================ [ { "model_benchmark_id": 
224, "benchmark_id": "codeforces", "model_id": "gpt-oss-20b", "score": 0.8387, "normalized_score": 0.8387, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Elo (with tools)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "Codeforces Competition code" }, { "model_benchmark_id": 224, "benchmark_id": "codeforces", "model_id": "gpt-oss-20b", "score": 0.7433, "normalized_score": 0.7433, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Elo (without tools)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "Codeforces Competition code" }, { "model_benchmark_id": 224, "benchmark_id": "humanity's-last-exam", "model_id": "gpt-oss-20b", "score": 0.173, "normalized_score": 0.173, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Accuracy (with tools)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 224, "benchmark_id": "humanity's-last-exam", "model_id": "gpt-oss-20b", "score": 0.109, "normalized_score": 0.109, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, 
"analysis_method": "Accuracy (without tools)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 224, "benchmark_id": "healthbench", "model_id": "gpt-oss-20b", "score": 0.425, "normalized_score": 0.425, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "HealthBench - Realistic health conversations" }, { "model_benchmark_id": 225, "benchmark_id": "healthbench-hard", "model_id": "gpt-oss-20b", "score": 0.108, "normalized_score": 0.108, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "HealthBench Hard - Challenging health conversations" }, { "model_benchmark_id": 2226, "benchmark_id": "gpqa", "model_id": "gpt-oss-20b", "score": 0.715, "normalized_score": 0.715, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Diamond (without tools)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": 
"2025-08-05T19:49:05.852855+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 22226, "benchmark_id": "mmlu", "model_id": "gpt-oss-20b", "score": 0.853, "normalized_score": 0.853, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Without tools", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "MMLU benchmark" }, { "model_benchmark_id": 22226, "benchmark_id": "tau-bench-retail", "model_id": "gpt-oss-20b", "score": 0.548, "normalized_score": 0.548, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/", "verified_by_llmstats": false, "analysis_method": "Function calling", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "benchmark_name": "TAU-bench Retail benchmark" } ] ================================================ FILE: data/organizations/openai/models/gpt-oss-20b/model.json ================================================ { "model_id": "gpt-oss-20b", "name": "GPT OSS 20B", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "The gpt-oss-120b model achieves near-parity with OpenAI o4-mini on core reasoning benchmarks, while running efficiently on a single 80 GB GPU. The gpt-oss-20b model (technically 20.9B parameters) delivers similar results to OpenAI o3‑mini on common benchmarks and can run on edge devices with just 16 GB of memory, making it ideal for on-device use cases, local inference, or rapid iteration without costly infrastructure.
Both models also perform strongly on tool use, few-shot function calling, CoT reasoning (as seen in results on the Tau-Bench agentic evaluation suite) and HealthBench (even outperforming proprietary models like OpenAI o1 and GPT‑4o). Note: While referred to as '20b' for simplicity, it technically has 20.9B parameters.", "release_date": "2025-08-05", "announcement_date": "2025-08-05", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 20900000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://gpt-oss.com/", "source_paper": "https://cdn.openai.com/pdf/419b6906-9da6-406c-a19d-1bb078ac7637/oai_gpt-oss_model_card.pdf", "source_scorecard_blog_link": "https://openai.com/index/gpt-oss-model-card/", "source_repo_link": "https://github.com/openai/gpt-oss", "source_weights_link": "https://huggingface.co/openai/gpt-oss-20b", "created_at": "2025-08-05T19:49:05.852855+00:00", "updated_at": "2025-08-05T19:49:05.852855+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/o1-2024-12-17/benchmarks.json ================================================ [ { "model_benchmark_id": 490, "benchmark_id": "aime-2024", "model_id": "o1-2024-12-17", "score": 0.743, "normalized_score": 0.743, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.025628+00:00", "updated_at": "2025-07-19T19:56:12.025628+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 1831, "benchmark_id": "frontiermath", "model_id": "o1-2024-12-17", "score": 0.055, "normalized_score": 0.055, "is_self_reported": true, "self_reported_source_link": 
"https://openai.com/index/o1-and-new-tools-for-developers/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.186673+00:00", "updated_at": "2025-07-19T19:56:15.186673+00:00", "benchmark_name": "FrontierMath" }, { "model_benchmark_id": 358, "benchmark_id": "gpqa", "model_id": "o1-2024-12-17", "score": 0.78, "normalized_score": 0.78, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/openai-o3-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.768954+00:00", "updated_at": "2025-07-19T19:56:11.768954+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1911, "benchmark_id": "gpqa-biology", "model_id": "o1-2024-12-17", "score": 0.692, "normalized_score": 0.692, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.394088+00:00", "updated_at": "2025-07-19T19:56:15.394088+00:00", "benchmark_name": "GPQA Biology" }, { "model_benchmark_id": 1912, "benchmark_id": "gpqa-chemistry", "model_id": "o1-2024-12-17", "score": 0.647, "normalized_score": 0.647, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.399030+00:00", "updated_at": 
"2025-07-19T19:56:15.399030+00:00", "benchmark_name": "GPQA Chemistry" }, { "model_benchmark_id": 1913, "benchmark_id": "gpqa-physics", "model_id": "o1-2024-12-17", "score": 0.928, "normalized_score": 0.928, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.403790+00:00", "updated_at": "2025-07-19T19:56:15.403790+00:00", "benchmark_name": "GPQA Physics" }, { "model_benchmark_id": 1016, "benchmark_id": "gsm8k", "model_id": "o1-2024-12-17", "score": 0.971, "normalized_score": 0.971, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.116437+00:00", "updated_at": "2025-07-19T19:56:13.116437+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 814, "benchmark_id": "humaneval", "model_id": "o1-2024-12-17", "score": 0.881, "normalized_score": 0.881, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.696047+00:00", "updated_at": "2025-07-19T19:56:12.696047+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 755, "benchmark_id": "livebench", "model_id": "o1-2024-12-17", "score": 0.67, "normalized_score": 0.67, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/openai-o3-mini/", "verified_by_llmstats":
false, "analysis_method": "coding", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.587814+00:00", "updated_at": "2025-07-19T19:56:12.587814+00:00", "benchmark_name": "LiveBench" }, { "model_benchmark_id": 428, "benchmark_id": "math", "model_id": "o1-2024-12-17", "score": 0.964, "normalized_score": 0.964, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.905279+00:00", "updated_at": "2025-07-19T19:56:11.905279+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 546, "benchmark_id": "mathvista", "model_id": "o1-2024-12-17", "score": 0.718, "normalized_score": 0.718, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.126058+00:00", "updated_at": "2025-07-19T19:56:12.126058+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1298, "benchmark_id": "mgsm", "model_id": "o1-2024-12-17", "score": 0.893, "normalized_score": 0.893, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/o1-and-new-tools-for-developers/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.715686+00:00", "updated_at": "2025-07-19T19:56:13.715686+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 125, "benchmark_id": "mmlu", "model_id": 
"o1-2024-12-17", "score": 0.918, "normalized_score": 0.918, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.330211+00:00", "updated_at": "2025-07-19T19:56:11.330211+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1486, "benchmark_id": "mmmlu", "model_id": "o1-2024-12-17", "score": 0.877, "normalized_score": 0.877, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.165932+00:00", "updated_at": "2025-07-19T19:56:14.165932+00:00", "benchmark_name": "MMMLU" }, { "model_benchmark_id": 596, "benchmark_id": "mmmu", "model_id": "o1-2024-12-17", "score": 0.776, "normalized_score": 0.776, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.228467+00:00", "updated_at": "2025-07-19T19:56:12.228467+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 241, "benchmark_id": "simpleqa", "model_id": "o1-2024-12-17", "score": 0.47, "normalized_score": 0.47, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-4-5/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.561209+00:00", 
"updated_at": "2025-07-19T19:56:11.561209+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 1361, "benchmark_id": "swe-bench-verified", "model_id": "o1-2024-12-17", "score": 0.41, "normalized_score": 0.41, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "verified", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.865799+00:00", "updated_at": "2025-07-19T19:56:13.865799+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1783, "benchmark_id": "tau-bench-airline", "model_id": "o1-2024-12-17", "score": 0.5, "normalized_score": 0.5, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "agents", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.021642+00:00", "updated_at": "2025-07-19T19:56:15.021642+00:00", "benchmark_name": "TAU-bench Airline" }, { "model_benchmark_id": 1769, "benchmark_id": "tau-bench-retail", "model_id": "o1-2024-12-17", "score": 0.708, "normalized_score": 0.708, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "agents", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.992114+00:00", "updated_at": "2025-07-19T19:56:14.992114+00:00", "benchmark_name": "TAU-bench Retail" } ] ================================================ FILE: data/organizations/openai/models/o1-2024-12-17/model.json ================================================ { "model_id": "o1-2024-12-17", "name": "o1", "organization_id": "openai", "fine_tuned_from_model_id": 
null, "description": "A research preview model focused on mathematical and logical reasoning capabilities, demonstrating improved performance on tasks requiring step-by-step reasoning, mathematical problem-solving, and code generation. The model shows enhanced capabilities in formal reasoning while maintaining strong general capabilities.", "release_date": "2024-12-17", "announcement_date": "2024-12-17", "license_id": "proprietary", "multimodal": false, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/models", "source_playground": null, "source_paper": "https://cdn.openai.com/o1-system-card-20240917.pdf", "source_scorecard_blog_link": "https://openai.com/index/learning-to-reason-with-llms", "source_repo_link": "https://openai.com/index/o1-and-new-tools-for-developers/", "source_weights_link": null, "created_at": "2025-07-19T19:49:05.855348+00:00", "updated_at": "2025-07-19T19:49:05.855348+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/o1-mini/benchmarks.json ================================================ [ { "model_benchmark_id": 1910, "benchmark_id": "cybersecurity-ctfs", "model_id": "o1-mini", "score": 0.287, "normalized_score": 0.287, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/", "verified_by_llmstats": false, "analysis_method": "Pass@12 accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.390045+00:00", "updated_at": "2025-07-19T19:56:15.390045+00:00", "benchmark_name": "Cybersecurity CTFs" }, { "model_benchmark_id": 356, "benchmark_id": "gpqa", "model_id": "o1-mini", "score": 0.6, "normalized_score": 0.6, "is_self_reported": true, "self_reported_source_link": 
"https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/", "verified_by_llmstats": false, "analysis_method": "Diamond, 0-shot Chain of Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.765864+00:00", "updated_at": "2025-07-19T19:56:11.765864+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 812, "benchmark_id": "humaneval", "model_id": "o1-mini", "score": 0.924, "normalized_score": 0.924, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/", "verified_by_llmstats": false, "analysis_method": "Pass@1 accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.692107+00:00", "updated_at": "2025-07-19T19:56:12.692107+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 513, "benchmark_id": "math-500", "model_id": "o1-mini", "score": 0.9, "normalized_score": 0.9, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/", "verified_by_llmstats": false, "analysis_method": "0-shot Chain of Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.065288+00:00", "updated_at": "2025-07-19T19:56:12.065288+00:00", "benchmark_name": "MATH-500" }, { "model_benchmark_id": 123, "benchmark_id": "mmlu", "model_id": "o1-mini", "score": 0.852, "normalized_score": 0.852, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/", "verified_by_llmstats": false, "analysis_method": "0-shot Chain of Thought", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:11.327239+00:00", "updated_at": "2025-07-19T19:56:11.327239+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1909, "benchmark_id": "superglue", "model_id": "o1-mini", "score": 0.75, "normalized_score": 0.75, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/", "verified_by_llmstats": false, "analysis_method": "Evaluation on validation set", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.385801+00:00", "updated_at": "2025-07-19T19:56:15.385801+00:00", "benchmark_name": "SuperGLUE" } ] ================================================ FILE: data/organizations/openai/models/o1-mini/model.json ================================================ { "model_id": "o1-mini", "name": "o1-mini", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "o1-mini is a cost-efficient language model developed by OpenAI, designed for advanced reasoning tasks while minimizing computational resources.", "release_date": "2024-09-12", "announcement_date": "2024-09-12", "license_id": "proprietary", "multimodal": false, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://openai.com/api/o1-mini", "source_playground": "https://platform.openai.com/playground", "source_paper": "https://cdn.openai.com/o1-system-card-20240917.pdf", "source_scorecard_blog_link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.850010+00:00", "updated_at": "2025-07-19T19:49:05.850010+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/o1-preview/benchmarks.json 
================================================ [ { "model_benchmark_id": 491, "benchmark_id": "aime-2024", "model_id": "o1-preview", "score": 0.42, "normalized_score": 0.42, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.027037+00:00", "updated_at": "2025-07-19T19:56:12.027037+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 360, "benchmark_id": "gpqa", "model_id": "o1-preview", "score": 0.733, "normalized_score": 0.733, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.772534+00:00", "updated_at": "2025-07-19T19:56:11.772534+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 756, "benchmark_id": "livebench", "model_id": "o1-preview", "score": 0.523, "normalized_score": 0.523, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/", "verified_by_llmstats": false, "analysis_method": "Coding", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.589687+00:00", "updated_at": "2025-07-19T19:56:12.589687+00:00", "benchmark_name": "LiveBench" }, { "model_benchmark_id": 430, "benchmark_id": "math", "model_id": "o1-preview", "score": 0.855, "normalized_score": 0.855, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms", "verified_by_llmstats": false, "analysis_method": "pass@1", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.910412+00:00", "updated_at": "2025-07-19T19:56:11.910412+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1300, "benchmark_id": "mgsm", "model_id": "o1-preview", "score": 0.908, "normalized_score": 0.908, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.718867+00:00", "updated_at": "2025-07-19T19:56:13.718867+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 127, "benchmark_id": "mmlu", "model_id": "o1-preview", "score": 0.908, "normalized_score": 0.908, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.333269+00:00", "updated_at": "2025-07-19T19:56:11.333269+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 242, "benchmark_id": "simpleqa", "model_id": "o1-preview", "score": 0.424, "normalized_score": 0.424, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/", "verified_by_llmstats": false, "analysis_method": "Factuality", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.562695+00:00", "updated_at": "2025-07-19T19:56:11.562695+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 1362, "benchmark_id": "swe-bench-verified", "model_id": "o1-preview", "score": 0.413, 
"normalized_score": 0.413, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/", "verified_by_llmstats": false, "analysis_method": "Verified", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.867753+00:00", "updated_at": "2025-07-19T19:56:13.867753+00:00", "benchmark_name": "SWE-Bench Verified" } ] ================================================ FILE: data/organizations/openai/models/o1-preview/model.json ================================================ { "model_id": "o1-preview", "name": "o1-preview", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "A research preview model focused on mathematical and logical reasoning capabilities, demonstrating improved performance on tasks requiring step-by-step reasoning, mathematical problem-solving, and code generation. The model shows enhanced capabilities in formal reasoning while maintaining strong general capabilities.", "release_date": "2024-09-12", "announcement_date": "2024-09-12", "license_id": "proprietary", "multimodal": false, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/models", "source_playground": null, "source_paper": "https://cdn.openai.com/o1-system-card-20240917.pdf", "source_scorecard_blog_link": "https://openai.com/index/learning-to-reason-with-llms", "source_repo_link": "https://github.com/openai", "source_weights_link": null, "created_at": "2025-07-19T19:49:05.862671+00:00", "updated_at": "2025-07-19T19:49:05.862671+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/o1-pro/benchmarks.json ================================================ [ { "model_benchmark_id": 487, "benchmark_id": "aime-2024", "model_id": "o1-pro", "score": 0.86, 
"normalized_score": 0.86, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-chatgpt-pro/", "verified_by_llmstats": false, "analysis_method": "Pass@1 accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.021363+00:00", "updated_at": "2025-07-19T19:56:12.021363+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 354, "benchmark_id": "gpqa", "model_id": "o1-pro", "score": 0.79, "normalized_score": 0.79, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-chatgpt-pro/", "verified_by_llmstats": false, "analysis_method": "Diamond, Pass@1 accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.762804+00:00", "updated_at": "2025-07-19T19:56:11.762804+00:00", "benchmark_name": "GPQA" } ] ================================================ FILE: data/organizations/openai/models/o1-pro/model.json ================================================ { "model_id": "o1-pro", "name": "o1-pro", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "o1-pro is OpenAI's advanced language model optimized for complex reasoning and specialized professional tasks, offering enhanced capabilities while maintaining high efficiency.", "release_date": "2024-12-17", "announcement_date": "2024-12-17", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2023-09-30", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://openai.com/api", "source_playground": "https://platform.openai.com/playground", "source_paper": "https://cdn.openai.com/o1-system-card-20240917.pdf", "source_scorecard_blog_link": "https://openai.com/index/introducing-chatgpt-pro/", "source_repo_link": null, "source_weights_link": 
null, "created_at": "2025-07-19T19:49:05.844613+00:00", "updated_at": "2025-07-19T19:49:05.844613+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/o3-2025-04-16/benchmarks.json ================================================ [ { "model_benchmark_id": 666, "benchmark_id": "aider-polyglot", "model_id": "o3-2025-04-16", "score": 0.813, "normalized_score": 0.813, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy (whole)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.380617+00:00", "updated_at": "2025-07-19T19:56:12.380617+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 481, "benchmark_id": "aime-2024", "model_id": "o3-2025-04-16", "score": 0.916, "normalized_score": 0.916, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy (no tools)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.012342+00:00", "updated_at": "2025-07-19T19:56:12.012342+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 705, "benchmark_id": "aime-2025", "model_id": "o3-2025-04-16", "score": 0.864, "normalized_score": 0.864, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "pass@1 (no tools)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.475926+00:00", "updated_at": "2025-07-19T19:56:12.475926+00:00", "benchmark_name": "AIME 
2025" }, { "model_benchmark_id": 1832, "benchmark_id": "arc-agi", "model_id": "o3-2025-04-16", "score": 0.88, "normalized_score": 0.88, "is_self_reported": true, "self_reported_source_link": "https://www.youtube.com/live/SKBG1sqdyIU?si=lWccKHt8bnttuYta", "verified_by_llmstats": false, "analysis_method": "test set evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.190370+00:00", "updated_at": "2025-07-19T19:56:15.190370+00:00", "benchmark_name": "ARC-AGI" }, { "model_benchmark_id": 1389, "benchmark_id": "arc-agi-v2", "model_id": "o3-2025-04-16", "score": 0.065, "normalized_score": 0.065, "is_self_reported": false, "self_reported_source_link": "https://x.com/xai/status/1943158495588815072", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.925569+00:00", "updated_at": "2025-07-19T19:56:13.925569+00:00", "benchmark_name": "ARC-AGI v2" }, { "model_benchmark_id": 1842, "benchmark_id": "browsecomp", "model_id": "o3-2025-04-16", "score": 0.497, "normalized_score": 0.497, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy (with python + browsing)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.215315+00:00", "updated_at": "2025-07-19T19:56:15.215315+00:00", "benchmark_name": "BrowseComp" }, { "model_benchmark_id": 1833, "benchmark_id": "charxiv-r", "model_id": "o3-2025-04-16", "score": 0.786, "normalized_score": 0.786, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "OpenAI o3 
with thinking mode - Scientific figure reasoning and interpretation.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.193874+00:00", "updated_at": "2025-07-19T19:56:15.193874+00:00", "benchmark_name": "CharXiv-R" }, { "model_benchmark_id": 1829, "benchmark_id": "frontiermath", "model_id": "o3-2025-04-16", "score": 0.158, "normalized_score": 0.158, "is_self_reported": true, "self_reported_source_link": "https://www.youtube.com/live/SKBG1sqdyIU?si=lWccKHt8bnttuYta", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.181554+00:00", "updated_at": "2025-07-19T19:56:15.181554+00:00", "benchmark_name": "FrontierMath" }, { "model_benchmark_id": 347, "benchmark_id": "gpqa", "model_id": "o3-2025-04-16", "score": 0.833, "normalized_score": 0.833, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "OpenAI o3 - Diamond thinking no tools", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.750986+00:00", "updated_at": "2025-07-19T19:56:11.750986+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 725, "benchmark_id": "humanity's-last-exam", "model_id": "o3-2025-04-16", "score": 0.202, "normalized_score": 0.202, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy (no tools)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.526631+00:00", "updated_at": 
"2025-07-19T19:56:12.526631+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 2001, "benchmark_id": "humanity's-last-exam", "model_id": "o3-2025-04-16", "score": 0.243, "normalized_score": 0.243, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "OpenAI o3 with thinking mode enabled (Python + browser tools) - Full set of expert-level questions across subjects.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 2002, "benchmark_id": "humanity's-last-exam", "model_id": "o3-2025-04-16", "score": 0.147, "normalized_score": 0.147, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "OpenAI o3 with thinking mode enabled (no tools) - Full set of expert-level questions across subjects.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 538, "benchmark_id": "mathvista", "model_id": "o3-2025-04-16", "score": 0.868, "normalized_score": 0.868, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.112692+00:00", "updated_at": "2025-07-19T19:56:12.112692+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 589, "benchmark_id": 
"mmmu", "model_id": "o3-2025-04-16", "score": 0.829, "normalized_score": 0.829, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "OpenAI o3 with thinking mode - College-level visual problem-solving with multimodal reasoning.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.211231+00:00", "updated_at": "2025-07-19T19:56:12.211231+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1840, "benchmark_id": "scale-multichallenge", "model_id": "o3-2025-04-16", "score": 0.565, "normalized_score": 0.565, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.208929+00:00", "updated_at": "2025-07-19T19:56:15.208929+00:00", "benchmark_name": "Scale MultiChallenge" }, { "model_benchmark_id": 2004, "benchmark_id": "scale-multichallenge", "model_id": "o3-2025-04-16", "score": 0.604, "normalized_score": 0.604, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "OpenAI o3 with thinking mode enabled - Multi-turn instruction following benchmark.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Scale MultiChallenge" }, { "model_benchmark_id": 2006, "benchmark_id": "collie", "model_id": "o3-2025-04-16", "score": 0.984, "normalized_score": 0.984, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", 
"verified_by_llmstats": false, "analysis_method": "OpenAI o3 with thinking mode enabled - Instruction-following in freeform writing.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "COLLIE" }, { "model_benchmark_id": 2007, "benchmark_id": "tau2-airline", "model_id": "o3-2025-04-16", "score": 0.648, "normalized_score": 0.648, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "OpenAI o3 with thinking mode - Function calling benchmark (airline domain).", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Tau2 airline" }, { "model_benchmark_id": 2008, "benchmark_id": "tau2-retail", "model_id": "o3-2025-04-16", "score": 0.802, "normalized_score": 0.802, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "OpenAI o3 with thinking mode - Function calling benchmark (retail domain).", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Tau2 retail" }, { "model_benchmark_id": 2009, "benchmark_id": "tau2-telecom", "model_id": "o3-2025-04-16", "score": 0.582, "normalized_score": 0.582, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "OpenAI o3 with thinking mode - Function calling benchmark (telecom domain).", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "Tau2 telecom" }, { "model_benchmark_id": 2010, "benchmark_id": "mmmu-pro", "model_id": "o3-2025-04-16", "score": 0.764, "normalized_score": 0.764, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "OpenAI o3 with thinking mode - Graduate-level visual problem-solving with advanced multimodal reasoning.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "MMMU-Pro" }, { "model_benchmark_id": 2011, "benchmark_id": "videommmu", "model_id": "o3-2025-04-16", "score": 0.833, "normalized_score": 0.833, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "OpenAI o3 with thinking mode - Video-based multimodal reasoning (max frame 256).", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "benchmark_name": "VideoMMMU" }, { "model_benchmark_id": 2012, "benchmark_id": "erqa", "model_id": "o3-2025-04-16", "score": 0.64, "normalized_score": 0.64, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-5/", "verified_by_llmstats": false, "analysis_method": "OpenAI o3 with thinking mode - Multimodal spatial reasoning.", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": 
"2025-07-24T12:00:00.000000+00:00", "benchmark_name": "ERQA" }, { "model_benchmark_id": 1354, "benchmark_id": "swe-bench-verified", "model_id": "o3-2025-04-16", "score": 0.691, "normalized_score": 0.691, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.851256+00:00", "updated_at": "2025-07-19T19:56:13.851256+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1844, "benchmark_id": "tau-bench", "model_id": "o3-2025-04-16", "score": 0.63, "normalized_score": 0.63, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy (avg Airline/Retail)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.221470+00:00", "updated_at": "2025-07-19T19:56:15.221470+00:00", "benchmark_name": "Tau-bench" } ] ================================================ FILE: data/organizations/openai/models/o3-2025-04-16/model.json ================================================ { "model_id": "o3-2025-04-16", "name": "o3", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "OpenAI's most powerful reasoning model. o3 is a well-rounded and powerful model across domains. It sets a new standard for math, science, coding, and visual reasoning tasks. It also excels at technical writing and instruction-following. 
Use it to think through multi-step problems that involve analysis across text, code, and images.", "release_date": "2025-04-16", "announcement_date": "2025-04-16", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-05-31", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/models/o3", "source_playground": null, "source_paper": "https://cdn.openai.com/pdf/2221c875-02dc-4789-800b-e7758f3722c1/o3-and-o4-mini-system-card.pdf", "source_scorecard_blog_link": null, "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.818000+00:00", "updated_at": "2025-07-19T19:49:05.818000+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/o3-mini/benchmarks.json ================================================ [ { "model_benchmark_id": 670, "benchmark_id": "aider-polyglot", "model_id": "o3-mini", "score": 0.667, "normalized_score": 0.667, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "benchmark score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.387419+00:00", "updated_at": "2025-07-19T19:56:12.387419+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 1334, "benchmark_id": "aider-polyglot-edit", "model_id": "o3-mini", "score": 0.604, "normalized_score": 0.604, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "benchmark score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.806560+00:00", "updated_at": "2025-07-19T19:56:13.806560+00:00", "benchmark_name": 
"Aider-Polyglot Edit" }, { "model_benchmark_id": 485, "benchmark_id": "aime-2024", "model_id": "o3-mini", "score": 0.873, "normalized_score": 0.873, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/openai-o3-mini/", "verified_by_llmstats": false, "analysis_method": "test set evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.018382+00:00", "updated_at": "2025-07-19T19:56:12.018382+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 1859, "benchmark_id": "collie", "model_id": "o3-mini", "score": 0.987, "normalized_score": 0.987, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "benchmark score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.259314+00:00", "updated_at": "2025-07-19T19:56:15.259314+00:00", "benchmark_name": "COLLIE" }, { "model_benchmark_id": 1894, "benchmark_id": "complexfuncbench", "model_id": "o3-mini", "score": 0.176, "normalized_score": 0.176, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "benchmark score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.344047+00:00", "updated_at": "2025-07-19T19:56:15.344047+00:00", "benchmark_name": "ComplexFuncBench" }, { "model_benchmark_id": 1830, "benchmark_id": "frontiermath", "model_id": "o3-mini", "score": 0.092, "normalized_score": 0.092, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/openai-o3-mini/", "verified_by_llmstats": false, "analysis_method": "pass @ 1", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.183728+00:00", "updated_at": "2025-07-19T19:56:15.183728+00:00", "benchmark_name": "FrontierMath" }, { "model_benchmark_id": 351, "benchmark_id": "gpqa", "model_id": "o3-mini", "score": 0.772, "normalized_score": 0.772, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "diamond", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.758026+00:00", "updated_at": "2025-07-19T19:56:11.758026+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1904, "benchmark_id": "graphwalks-bfs-<128k", "model_id": "o3-mini", "score": 0.51, "normalized_score": 0.51, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "benchmark score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.368369+00:00", "updated_at": "2025-07-19T19:56:15.368369+00:00", "benchmark_name": "Graphwalks BFS <128k" }, { "model_benchmark_id": 1880, "benchmark_id": "graphwalks-parents-<128k", "model_id": "o3-mini", "score": 0.583, "normalized_score": 0.583, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "benchmark score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.310391+00:00", "updated_at": "2025-07-19T19:56:15.310391+00:00", "benchmark_name": "Graphwalks parents <128k" }, { "model_benchmark_id": 634, "benchmark_id": "ifeval", "model_id": "o3-mini", "score": 0.939, "normalized_score": 0.939, 
"is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "benchmark score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.302770+00:00", "updated_at": "2025-07-19T19:56:12.302770+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 1847, "benchmark_id": "internal-api-instruction-following-(hard)", "model_id": "o3-mini", "score": 0.5, "normalized_score": 0.5, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "benchmark score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.228737+00:00", "updated_at": "2025-07-19T19:56:15.228737+00:00", "benchmark_name": "Internal API instruction following (hard)" }, { "model_benchmark_id": 754, "benchmark_id": "livebench", "model_id": "o3-mini", "score": 0.846, "normalized_score": 0.846, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/openai-o3-mini/", "verified_by_llmstats": false, "analysis_method": "o3-mini high", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.585789+00:00", "updated_at": "2025-07-19T19:56:12.585789+00:00", "benchmark_name": "LiveBench" }, { "model_benchmark_id": 426, "benchmark_id": "math", "model_id": "o3-mini", "score": 0.979, "normalized_score": 0.979, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/openai-o3-mini/", "verified_by_llmstats": false, "analysis_method": "o3-mini high", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:11.901889+00:00", "updated_at": "2025-07-19T19:56:11.901889+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1296, "benchmark_id": "mgsm", "model_id": "o3-mini", "score": 0.92, "normalized_score": 0.92, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/openai-o3-mini/", "verified_by_llmstats": false, "analysis_method": "o3-mini high", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.712633+00:00", "updated_at": "2025-07-19T19:56:13.712633+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 119, "benchmark_id": "mmlu", "model_id": "o3-mini", "score": 0.869, "normalized_score": 0.869, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/openai-o3-mini/", "verified_by_llmstats": false, "analysis_method": "o3-mini high", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.320589+00:00", "updated_at": "2025-07-19T19:56:11.320589+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 742, "benchmark_id": "multichallenge", "model_id": "o3-mini", "score": 0.399, "normalized_score": 0.399, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "benchmark score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.560158+00:00", "updated_at": "2025-07-19T19:56:12.560158+00:00", "benchmark_name": "MultiChallenge" }, { "model_benchmark_id": 1853, "benchmark_id": "multichallenge-(o3-mini-grader)", "model_id": "o3-mini", "score": 0.502, "normalized_score": 0.502, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, 
"analysis_method": "benchmark score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.243415+00:00", "updated_at": "2025-07-19T19:56:15.243415+00:00", "benchmark_name": "MultiChallenge (o3-mini grader)" }, { "model_benchmark_id": 1652, "benchmark_id": "multi-if", "model_id": "o3-mini", "score": 0.795, "normalized_score": 0.795, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "benchmark score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.646496+00:00", "updated_at": "2025-07-19T19:56:14.646496+00:00", "benchmark_name": "Multi-IF" }, { "model_benchmark_id": 1474, "benchmark_id": "multilingual-mmlu", "model_id": "o3-mini", "score": 0.807, "normalized_score": 0.807, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "benchmark score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.143822+00:00", "updated_at": "2025-07-19T19:56:14.143822+00:00", "benchmark_name": "Multilingual MMLU" }, { "model_benchmark_id": 1865, "benchmark_id": "openai-mrcr:-2-needle-128k", "model_id": "o3-mini", "score": 0.187, "normalized_score": 0.187, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "benchmark score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.274261+00:00", "updated_at": "2025-07-19T19:56:15.274261+00:00", "benchmark_name": "OpenAI-MRCR: 2 needle 128k" }, { 
"model_benchmark_id": 238, "benchmark_id": "simpleqa", "model_id": "o3-mini", "score": 0.15, "normalized_score": 0.15, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-gpt-4-5/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.554563+00:00", "updated_at": "2025-07-19T19:56:11.554563+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 1357, "benchmark_id": "swe-bench-verified", "model_id": "o3-mini", "score": 0.493, "normalized_score": 0.493, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/openai-o3-mini/", "verified_by_llmstats": false, "analysis_method": "verified", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.856039+00:00", "updated_at": "2025-07-19T19:56:13.856039+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1898, "benchmark_id": "swe-lancer", "model_id": "o3-mini", "score": 0.18, "normalized_score": 0.18, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "percentage score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.355089+00:00", "updated_at": "2025-07-19T19:56:15.355089+00:00", "benchmark_name": "SWE-Lancer" }, { "model_benchmark_id": 1901, "benchmark_id": "swe-lancer-(ic-diamond-subset)", "model_id": "o3-mini", "score": 0.074, "normalized_score": 0.074, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "percentage score", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.362026+00:00", "updated_at": "2025-07-19T19:56:15.362026+00:00", "benchmark_name": "SWE-Lancer (IC-Diamond subset)" }, { "model_benchmark_id": 1779, "benchmark_id": "tau-bench-airline", "model_id": "o3-mini", "score": 0.324, "normalized_score": 0.324, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "benchmark score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.013372+00:00", "updated_at": "2025-07-19T19:56:15.013372+00:00", "benchmark_name": "TAU-bench Airline" }, { "model_benchmark_id": 1765, "benchmark_id": "tau-bench-retail", "model_id": "o3-mini", "score": 0.576, "normalized_score": 0.576, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/gpt-4-1/", "verified_by_llmstats": false, "analysis_method": "benchmark score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.984653+00:00", "updated_at": "2025-07-19T19:56:14.984653+00:00", "benchmark_name": "TAU-bench Retail" } ] ================================================ FILE: data/organizations/openai/models/o3-mini/model.json ================================================ { "model_id": "o3-mini", "name": "o3-mini", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "A smaller variant of O3, expected to offer enhanced multimodal capabilities, improved reasoning, and more efficient resource utilization compared to previous models while maintaining strong performance on core tasks.", "release_date": "2025-01-30", "announcement_date": "2025-01-30", "license_id": "proprietary", "multimodal": false, "knowledge_cutoff": "2023-09-30", 
"param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/models", "source_playground": null, "source_paper": "https://cdn.openai.com/o3-mini-system-card.pdf", "source_scorecard_blog_link": "https://openai.com/index/openai-o3-mini/", "source_repo_link": "https://github.com/openai", "source_weights_link": null, "created_at": "2025-07-19T19:49:05.835007+00:00", "updated_at": "2025-07-19T19:49:05.835007+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/o3-pro-2025-06-10/model.json ================================================ { "model_id": "o3-pro-2025-06-10", "name": "o3-pro", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "Version of o3 with more compute for better responses. The o3-pro model uses more compute to think harder and provide consistently better answers. Designed to tackle tough problems with advanced reasoning capabilities.", "release_date": "2025-06-10", "announcement_date": "2025-06-10", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-05-31", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/models/o3-pro", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": null, "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.832229+00:00", "updated_at": "2025-07-19T19:49:05.832229+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/models/o4-mini/benchmarks.json ================================================ [ { "model_benchmark_id": 668, "benchmark_id": "aider-polyglot", "model_id": "o4-mini", "score": 0.689, "normalized_score": 0.689, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", 
"verified_by_llmstats": false, "analysis_method": "accuracy (whole, o4-mini-high)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.384371+00:00", "updated_at": "2025-07-19T19:56:12.384371+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 1332, "benchmark_id": "aider-polyglot-edit", "model_id": "o4-mini", "score": 0.582, "normalized_score": 0.582, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy (diff, o4-mini-high)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.803065+00:00", "updated_at": "2025-07-19T19:56:13.803065+00:00", "benchmark_name": "Aider-Polyglot Edit" }, { "model_benchmark_id": 483, "benchmark_id": "aime-2024", "model_id": "o4-mini", "score": 0.934, "normalized_score": 0.934, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy (no tools)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.015345+00:00", "updated_at": "2025-07-19T19:56:12.015345+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 706, "benchmark_id": "aime-2025", "model_id": "o4-mini", "score": 0.927, "normalized_score": 0.927, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy (no tools)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.477657+00:00", "updated_at": 
"2025-07-19T19:56:12.477657+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1843, "benchmark_id": "browsecomp", "model_id": "o4-mini", "score": 0.515, "normalized_score": 0.515, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy (with python + browsing)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.217475+00:00", "updated_at": "2025-07-19T19:56:15.217475+00:00", "benchmark_name": "BrowseComp" }, { "model_benchmark_id": 1835, "benchmark_id": "charxiv-r", "model_id": "o4-mini", "score": 0.72, "normalized_score": 0.72, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.197036+00:00", "updated_at": "2025-07-19T19:56:15.197036+00:00", "benchmark_name": "CharXiv-R" }, { "model_benchmark_id": 349, "benchmark_id": "gpqa", "model_id": "o4-mini", "score": 0.814, "normalized_score": 0.814, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "diamond accuracy (no tools)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.754610+00:00", "updated_at": "2025-07-19T19:56:11.754610+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 726, "benchmark_id": "humanity's-last-exam", "model_id": "o4-mini", "score": 0.147, "normalized_score": 0.147, "is_self_reported": true, "self_reported_source_link": 
"https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy (no tools)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.528160+00:00", "updated_at": "2025-07-19T19:56:12.528160+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 540, "benchmark_id": "mathvista", "model_id": "o4-mini", "score": 0.843, "normalized_score": 0.843, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.115868+00:00", "updated_at": "2025-07-19T19:56:12.115868+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 591, "benchmark_id": "mmmu", "model_id": "o4-mini", "score": 0.816, "normalized_score": 0.816, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.218993+00:00", "updated_at": "2025-07-19T19:56:12.218993+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1841, "benchmark_id": "scale-multichallenge", "model_id": "o4-mini", "score": 0.43, "normalized_score": 0.43, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.211372+00:00", "updated_at": 
"2025-07-19T19:56:15.211372+00:00", "benchmark_name": "Scale MultiChallenge" }, { "model_benchmark_id": 1356, "benchmark_id": "swe-bench-verified", "model_id": "o4-mini", "score": 0.681, "normalized_score": 0.681, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.854236+00:00", "updated_at": "2025-07-19T19:56:13.854236+00:00", "benchmark_name": "SWE-Bench Verified" }, { "model_benchmark_id": 1777, "benchmark_id": "tau-bench-airline", "model_id": "o4-mini", "score": 0.492, "normalized_score": 0.492, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy (o4-mini-high)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.009611+00:00", "updated_at": "2025-07-19T19:56:15.009611+00:00", "benchmark_name": "TAU-bench Airline" }, { "model_benchmark_id": 1763, "benchmark_id": "tau-bench-retail", "model_id": "o4-mini", "score": 0.718, "normalized_score": 0.718, "is_self_reported": true, "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/", "verified_by_llmstats": false, "analysis_method": "accuracy (o4-mini-high)", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.980200+00:00", "updated_at": "2025-07-19T19:56:14.980200+00:00", "benchmark_name": "TAU-bench Retail" } ] ================================================ FILE: data/organizations/openai/models/o4-mini/model.json ================================================ { "model_id": "o4-mini", 
"name": "o4-mini", "organization_id": "openai", "fine_tuned_from_model_id": null, "description": "o4-mini is OpenAI's latest small o-series model, optimized for fast, effective reasoning with exceptionally efficient performance in coding and visual tasks. It is faster and more affordable than o3.", "release_date": "2025-04-16", "announcement_date": "2025-04-16", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-05-31", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://platform.openai.com/docs/models/o4-mini", "source_playground": null, "source_paper": "https://cdn.openai.com/pdf/2221c875-02dc-4789-800b-e7758f3722c1/o3-and-o4-mini-system-card.pdf", "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/openai", "source_weights_link": null, "created_at": "2025-07-19T19:49:05.824485+00:00", "updated_at": "2025-07-19T19:49:05.824485+00:00", "model_family_id": null } ================================================ FILE: data/organizations/openai/organization.json ================================================ { "organization_id": "openai", "name": "OpenAI", "website": "https://openai.com", "description": "Leading AI research company", "country": "US", "created_at": "2025-07-19T19:49:05.815252+00:00", "updated_at": "2025-07-19T19:49:05.815252+00:00" } ================================================ FILE: data/organizations/qwen/models/qvq-72b-preview/benchmarks.json ================================================ [ { "model_benchmark_id": 1675, "benchmark_id": "mathvision", "model_id": "qvq-72b-preview", "score": 0.359, "normalized_score": 0.359, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/QVQ-72B-Preview", "verified_by_llmstats": false, "analysis_method": "full", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:14.700746+00:00", "updated_at": "2025-07-19T19:56:14.700746+00:00", "benchmark_name": "MathVision" }, { "model_benchmark_id": 526, "benchmark_id": "mathvista", "model_id": "qvq-72b-preview", "score": 0.714, "normalized_score": 0.714, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/QVQ-72B-Preview", "verified_by_llmstats": false, "analysis_method": "mini", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.092107+00:00", "updated_at": "2025-07-19T19:56:12.092107+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 570, "benchmark_id": "mmmu", "model_id": "qvq-72b-preview", "score": 0.703, "normalized_score": 0.703, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/QVQ-72B-Preview", "verified_by_llmstats": false, "analysis_method": "val", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.173084+00:00", "updated_at": "2025-07-19T19:56:12.173084+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1716, "benchmark_id": "olympiadbench", "model_id": "qvq-72b-preview", "score": 0.204, "normalized_score": 0.204, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/QVQ-72B-Preview", "verified_by_llmstats": false, "analysis_method": "full", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.824642+00:00", "updated_at": "2025-07-19T19:56:14.824642+00:00", "benchmark_name": "OlympiadBench" } ] ================================================ FILE: data/organizations/qwen/models/qvq-72b-preview/model.json ================================================ { "model_id": "qvq-72b-preview", "name": "QvQ-72B-Preview", "organization_id": 
"qwen", "fine_tuned_from_model_id": "qwen2-vl-72b", "description": "An experimental research model focusing on advanced visual reasoning and step-by-step cognitive capabilities. Achieves strong performance on multi-modal science and mathematics tasks, though exhibits some limitations such as potential language mixing and recursive reasoning loops.", "release_date": "2024-12-25", "announcement_date": "2024-12-25", "license_id": "qwen", "multimodal": true, "knowledge_cutoff": null, "param_count": 73400000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/Qwen/QVQ-72B-Preview", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qvq-72b-preview/", "source_repo_link": "https://github.com/QwenLM/Qwen2", "source_weights_link": "https://huggingface.co/Qwen/QVQ-72B-Preview", "created_at": "2025-07-19T19:49:05.895366+00:00", "updated_at": "2025-07-19T19:49:05.895366+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen-2.5-14b-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 21, "benchmark_id": "arc-c", "model_id": "qwen-2.5-14b-instruct", "score": 0.673, "normalized_score": 0.673, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "ARC-C benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.127541+00:00", "updated_at": "2025-07-19T19:56:11.127541+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 971, "benchmark_id": "bbh", "model_id": "qwen-2.5-14b-instruct", "score": 0.782, "normalized_score": 0.782, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", 
"verified_by_llmstats": false, "analysis_method": "BBH benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.042167+00:00", "updated_at": "2025-07-19T19:56:13.042167+00:00", "benchmark_name": "BBH" }, { "model_benchmark_id": 301, "benchmark_id": "gpqa", "model_id": "qwen-2.5-14b-instruct", "score": 0.455, "normalized_score": 0.455, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "GPQA benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.677954+00:00", "updated_at": "2025-07-19T19:56:11.677954+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 994, "benchmark_id": "gsm8k", "model_id": "qwen-2.5-14b-instruct", "score": 0.948, "normalized_score": 0.948, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "GSM8K benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.082212+00:00", "updated_at": "2025-07-19T19:56:13.082212+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 786, "benchmark_id": "humaneval", "model_id": "qwen-2.5-14b-instruct", "score": 0.835, "normalized_score": 0.835, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "HumanEval benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.646500+00:00", "updated_at": "2025-07-19T19:56:12.646500+00:00", 
"benchmark_name": "HumanEval" }, { "model_benchmark_id": 1441, "benchmark_id": "humaneval+", "model_id": "qwen-2.5-14b-instruct", "score": 0.512, "normalized_score": 0.512, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "HumanEval+ benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.071967+00:00", "updated_at": "2025-07-19T19:56:14.071967+00:00", "benchmark_name": "HumanEval+" }, { "model_benchmark_id": 404, "benchmark_id": "math", "model_id": "qwen-2.5-14b-instruct", "score": 0.8, "normalized_score": 0.8, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MATH benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.862254+00:00", "updated_at": "2025-07-19T19:56:11.862254+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1185, "benchmark_id": "mbpp", "model_id": "qwen-2.5-14b-instruct", "score": 0.82, "normalized_score": 0.82, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MBPP benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.497488+00:00", "updated_at": "2025-07-19T19:56:13.497488+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1602, "benchmark_id": "mbpp+", "model_id": "qwen-2.5-14b-instruct", "score": 0.632, "normalized_score": 0.632, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, 
"analysis_method": "MBPP+ benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.507421+00:00", "updated_at": "2025-07-19T19:56:14.507421+00:00", "benchmark_name": "MBPP+" }, { "model_benchmark_id": 89, "benchmark_id": "mmlu", "model_id": "qwen-2.5-14b-instruct", "score": 0.797, "normalized_score": 0.797, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MMLU benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.269091+00:00", "updated_at": "2025-07-19T19:56:11.269091+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 194, "benchmark_id": "mmlu-pro", "model_id": "qwen-2.5-14b-instruct", "score": 0.637, "normalized_score": 0.637, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MMLU-Pro benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.471047+00:00", "updated_at": "2025-07-19T19:56:11.471047+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 731, "benchmark_id": "mmlu-redux", "model_id": "qwen-2.5-14b-instruct", "score": 0.8, "normalized_score": 0.8, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MMLU-redux benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.538944+00:00", "updated_at": "2025-07-19T19:56:12.538944+00:00", "benchmark_name": 
"MMLU-Redux" }, { "model_benchmark_id": 1600, "benchmark_id": "mmlu-stem", "model_id": "qwen-2.5-14b-instruct", "score": 0.764, "normalized_score": 0.764, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MMLU-STEM benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.500528+00:00", "updated_at": "2025-07-19T19:56:14.500528+00:00", "benchmark_name": "MMLU-STEM" }, { "model_benchmark_id": 642, "benchmark_id": "multipl-e", "model_id": "qwen-2.5-14b-instruct", "score": 0.728, "normalized_score": 0.728, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MultiPL-E benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.319213+00:00", "updated_at": "2025-07-19T19:56:12.319213+00:00", "benchmark_name": "MultiPL-E" }, { "model_benchmark_id": 1597, "benchmark_id": "theoremqa", "model_id": "qwen-2.5-14b-instruct", "score": 0.43, "normalized_score": 0.43, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "TheoremQA benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.492163+00:00", "updated_at": "2025-07-19T19:56:14.492163+00:00", "benchmark_name": "TheoremQA" }, { "model_benchmark_id": 138, "benchmark_id": "truthfulqa", "model_id": "qwen-2.5-14b-instruct", "score": 0.584, "normalized_score": 0.584, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", 
"verified_by_llmstats": false, "analysis_method": "TruthfulQA benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.355004+00:00", "updated_at": "2025-07-19T19:56:11.355004+00:00", "benchmark_name": "TruthfulQA" } ] ================================================ FILE: data/organizations/qwen/models/qwen-2.5-14b-instruct/model.json ================================================ { "model_id": "qwen-2.5-14b-instruct", "name": "Qwen2.5 14B Instruct", "organization_id": "qwen", "fine_tuned_from_model_id": null, "description": "Qwen2.5-14B-Instruct is an instruction-tuned 14.7B parameter language model, part of the Qwen2.5 series. It features significant improvements in instruction following, long text generation (8K+ tokens), structured data understanding, and JSON output generation. The model supports a 128K token context length and multilingual capabilities across 29+ languages including Chinese, English, French, Spanish, and more.", "release_date": "2024-09-19", "announcement_date": "2024-09-19", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 14700000000, "training_tokens": 18000000000000, "available_in_zeroeval": true, "source_api_ref": "https://www.alibabacloud.com/help/en/model-studio/developer-reference/use-qwen-by-calling-api", "source_playground": null, "source_paper": "https://arxiv.org/abs/2407.10671", "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "source_repo_link": "https://github.com/QwenLM/Qwen2.5", "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-14B-Instruct", "created_at": "2025-07-19T19:49:05.615575+00:00", "updated_at": "2025-07-19T19:49:05.615575+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen-2.5-32b-instruct/benchmarks.json 
================================================ [ { "model_benchmark_id": 18, "benchmark_id": "arc-c", "model_id": "qwen-2.5-32b-instruct", "score": 0.704, "normalized_score": 0.704, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "ARC-C benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.121747+00:00", "updated_at": "2025-07-19T19:56:11.121747+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 970, "benchmark_id": "bbh", "model_id": "qwen-2.5-32b-instruct", "score": 0.845, "normalized_score": 0.845, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "BBH benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.040428+00:00", "updated_at": "2025-07-19T19:56:13.040428+00:00", "benchmark_name": "BBH" }, { "model_benchmark_id": 297, "benchmark_id": "gpqa", "model_id": "qwen-2.5-32b-instruct", "score": 0.495, "normalized_score": 0.495, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "GPQA benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.671178+00:00", "updated_at": "2025-07-19T19:56:11.671178+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 990, "benchmark_id": "gsm8k", "model_id": "qwen-2.5-32b-instruct", "score": 0.959, "normalized_score": 0.959, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, 
"analysis_method": "GSM8K benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.074870+00:00", "updated_at": "2025-07-19T19:56:13.074870+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 45, "benchmark_id": "hellaswag", "model_id": "qwen-2.5-32b-instruct", "score": 0.852, "normalized_score": 0.852, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "HellaSwag benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.178158+00:00", "updated_at": "2025-07-19T19:56:11.178158+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 782, "benchmark_id": "humaneval", "model_id": "qwen-2.5-32b-instruct", "score": 0.884, "normalized_score": 0.884, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "HumanEval benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.639922+00:00", "updated_at": "2025-07-19T19:56:12.639922+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 1440, "benchmark_id": "humaneval+", "model_id": "qwen-2.5-32b-instruct", "score": 0.524, "normalized_score": 0.524, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "HumanEval+ benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.070409+00:00", "updated_at": "2025-07-19T19:56:14.070409+00:00", 
"benchmark_name": "HumanEval+" }, { "model_benchmark_id": 400, "benchmark_id": "math", "model_id": "qwen-2.5-32b-instruct", "score": 0.831, "normalized_score": 0.831, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MATH benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.856115+00:00", "updated_at": "2025-07-19T19:56:11.856115+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1181, "benchmark_id": "mbpp", "model_id": "qwen-2.5-32b-instruct", "score": 0.84, "normalized_score": 0.84, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MBPP benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.489427+00:00", "updated_at": "2025-07-19T19:56:13.489427+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1601, "benchmark_id": "mbpp+", "model_id": "qwen-2.5-32b-instruct", "score": 0.672, "normalized_score": 0.672, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MBPP+ benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.504915+00:00", "updated_at": "2025-07-19T19:56:14.504915+00:00", "benchmark_name": "MBPP+" }, { "model_benchmark_id": 85, "benchmark_id": "mmlu", "model_id": "qwen-2.5-32b-instruct", "score": 0.833, "normalized_score": 0.833, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, 
"analysis_method": "MMLU benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.261705+00:00", "updated_at": "2025-07-19T19:56:11.261705+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 190, "benchmark_id": "mmlu-pro", "model_id": "qwen-2.5-32b-instruct", "score": 0.69, "normalized_score": 0.69, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MMLU-Pro benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.465052+00:00", "updated_at": "2025-07-19T19:56:11.465052+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 728, "benchmark_id": "mmlu-redux", "model_id": "qwen-2.5-32b-instruct", "score": 0.839, "normalized_score": 0.839, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MMLU-redux benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.533630+00:00", "updated_at": "2025-07-19T19:56:12.533630+00:00", "benchmark_name": "MMLU-Redux" }, { "model_benchmark_id": 1599, "benchmark_id": "mmlu-stem", "model_id": "qwen-2.5-32b-instruct", "score": 0.809, "normalized_score": 0.809, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MMLU-STEM benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.498255+00:00", "updated_at": "2025-07-19T19:56:14.498255+00:00", 
"benchmark_name": "MMLU-STEM" }, { "model_benchmark_id": 640, "benchmark_id": "multipl-e", "model_id": "qwen-2.5-32b-instruct", "score": 0.754, "normalized_score": 0.754, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MultiPL-E benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.316384+00:00", "updated_at": "2025-07-19T19:56:12.316384+00:00", "benchmark_name": "MultiPL-E" }, { "model_benchmark_id": 1593, "benchmark_id": "theoremqa", "model_id": "qwen-2.5-32b-instruct", "score": 0.441, "normalized_score": 0.441, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "TheoremQA benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.482526+00:00", "updated_at": "2025-07-19T19:56:14.482526+00:00", "benchmark_name": "TheoremQA" }, { "model_benchmark_id": 135, "benchmark_id": "truthfulqa", "model_id": "qwen-2.5-32b-instruct", "score": 0.578, "normalized_score": 0.578, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "TruthfulQA benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.349397+00:00", "updated_at": "2025-07-19T19:56:11.349397+00:00", "benchmark_name": "TruthfulQA" }, { "model_benchmark_id": 150, "benchmark_id": "winogrande", "model_id": "qwen-2.5-32b-instruct", "score": 0.82, "normalized_score": 0.82, "is_self_reported": true, "self_reported_source_link": 
"https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "Winogrande benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.384431+00:00", "updated_at": "2025-07-19T19:56:11.384431+00:00", "benchmark_name": "Winogrande" } ] ================================================ FILE: data/organizations/qwen/models/qwen-2.5-32b-instruct/model.json ================================================ { "model_id": "qwen-2.5-32b-instruct", "name": "Qwen2.5 32B Instruct", "organization_id": "qwen", "fine_tuned_from_model_id": null, "description": "Qwen2.5-32B-Instruct is an instruction-tuned 32 billion parameter language model, part of the Qwen2.5 series. It is designed to follow instructions, generate long texts (over 8K tokens), understand structured data (e.g., tables), and generate structured outputs, especially JSON. The model supports multilingual capabilities across over 29 languages.", "release_date": "2024-09-19", "announcement_date": "2024-09-19", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 32500000000, "training_tokens": 18000000000000, "available_in_zeroeval": true, "source_api_ref": "https://www.alibabacloud.com/help/en/model-studio/developer-reference/use-qwen-by-calling-api", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5/", "source_repo_link": "https://github.com/QwenLM/Qwen2.5", "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-32B-Instruct", "created_at": "2025-07-19T19:49:05.606261+00:00", "updated_at": "2025-07-19T19:49:05.606261+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen-2.5-72b-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 1617, 
"benchmark_id": "alignbench", "model_id": "qwen-2.5-72b-instruct", "score": 0.816, "normalized_score": 0.816, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/", "verified_by_llmstats": false, "analysis_method": "AlignBench v1.1 benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.546122+00:00", "updated_at": "2025-07-19T19:56:14.546122+00:00", "benchmark_name": "AlignBench" }, { "model_benchmark_id": 1453, "benchmark_id": "arena-hard", "model_id": "qwen-2.5-72b-instruct", "score": 0.812, "normalized_score": 0.812, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/", "verified_by_llmstats": false, "analysis_method": "Arena Hard benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.097075+00:00", "updated_at": "2025-07-19T19:56:14.097075+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 303, "benchmark_id": "gpqa", "model_id": "qwen-2.5-72b-instruct", "score": 0.49, "normalized_score": 0.49, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/", "verified_by_llmstats": false, "analysis_method": "GPQA benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.681073+00:00", "updated_at": "2025-07-19T19:56:11.681073+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 996, "benchmark_id": "gsm8k", "model_id": "qwen-2.5-72b-instruct", "score": 0.958, "normalized_score": 0.958, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/", "verified_by_llmstats": false, "analysis_method": "GSM8K benchmark evaluation", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.085236+00:00", "updated_at": "2025-07-19T19:56:13.085236+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 787, "benchmark_id": "humaneval", "model_id": "qwen-2.5-72b-instruct", "score": 0.866, "normalized_score": 0.866, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/", "verified_by_llmstats": false, "analysis_method": "HumanEval benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.648406+00:00", "updated_at": "2025-07-19T19:56:12.648406+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 620, "benchmark_id": "ifeval", "model_id": "qwen-2.5-72b-instruct", "score": 0.841, "normalized_score": 0.841, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/", "verified_by_llmstats": false, "analysis_method": "IFEval strict-prompt benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.277303+00:00", "updated_at": "2025-07-19T19:56:12.277303+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 750, "benchmark_id": "livebench", "model_id": "qwen-2.5-72b-instruct", "score": 0.523, "normalized_score": 0.523, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/", "verified_by_llmstats": false, "analysis_method": "LiveBench benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.577555+00:00", "updated_at": "2025-07-19T19:56:12.577555+00:00", "benchmark_name": "LiveBench" }, { "model_benchmark_id": 1124, 
"benchmark_id": "livecodebench", "model_id": "qwen-2.5-72b-instruct", "score": 0.555, "normalized_score": 0.555, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/", "verified_by_llmstats": false, "analysis_method": "LiveCodeBench benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.346315+00:00", "updated_at": "2025-07-19T19:56:13.346315+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 406, "benchmark_id": "math", "model_id": "qwen-2.5-72b-instruct", "score": 0.831, "normalized_score": 0.831, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/", "verified_by_llmstats": false, "analysis_method": "MATH benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.865721+00:00", "updated_at": "2025-07-19T19:56:11.865721+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1187, "benchmark_id": "mbpp", "model_id": "qwen-2.5-72b-instruct", "score": 0.882, "normalized_score": 0.882, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/", "verified_by_llmstats": false, "analysis_method": "MBPP benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.503069+00:00", "updated_at": "2025-07-19T19:56:13.503069+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 196, "benchmark_id": "mmlu-pro", "model_id": "qwen-2.5-72b-instruct", "score": 0.711, "normalized_score": 0.711, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MMLU-Pro benchmark evaluation", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.475182+00:00", "updated_at": "2025-07-19T19:56:11.475182+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 733, "benchmark_id": "mmlu-redux", "model_id": "qwen-2.5-72b-instruct", "score": 0.868, "normalized_score": 0.868, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/", "verified_by_llmstats": false, "analysis_method": "MMLU-redux benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.542364+00:00", "updated_at": "2025-07-19T19:56:12.542364+00:00", "benchmark_name": "MMLU-Redux" }, { "model_benchmark_id": 1606, "benchmark_id": "mt-bench", "model_id": "qwen-2.5-72b-instruct", "score": 0.935, "normalized_score": 0.935, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/", "verified_by_llmstats": false, "analysis_method": "MT-bench benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.521232+00:00", "updated_at": "2025-07-19T19:56:14.521232+00:00", "benchmark_name": "MT-Bench" }, { "model_benchmark_id": 644, "benchmark_id": "multipl-e", "model_id": "qwen-2.5-72b-instruct", "score": 0.751, "normalized_score": 0.751, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/", "verified_by_llmstats": false, "analysis_method": "MultiPL-E benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.322800+00:00", "updated_at": "2025-07-19T19:56:12.322800+00:00", "benchmark_name": "MultiPL-E" } ] 
================================================ FILE: data/organizations/qwen/models/qwen-2.5-72b-instruct/model.json ================================================ { "model_id": "qwen-2.5-72b-instruct", "name": "Qwen2.5 72B Instruct", "organization_id": "qwen", "fine_tuned_from_model_id": null, "description": "Qwen2.5-72B-Instruct is an instruction-tuned 72 billion parameter language model, part of the Qwen2.5 series. It is designed to follow instructions, generate long texts (over 8K tokens), understand structured data (e.g., tables), and generate structured outputs, especially JSON. The model supports multilingual capabilities across over 29 languages.", "release_date": "2024-09-19", "announcement_date": "2024-09-19", "license_id": "qwen", "multimodal": false, "knowledge_cutoff": null, "param_count": 72700000000, "training_tokens": 18000000000000, "available_in_zeroeval": true, "source_api_ref": "https://www.alibabacloud.com/help/en/model-studio/developer-reference/use-qwen-by-calling-api", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5/", "source_repo_link": "https://github.com/QwenLM/Qwen2.5", "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct", "created_at": "2025-07-19T19:49:05.627855+00:00", "updated_at": "2025-07-19T19:49:05.627855+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen-2.5-7b-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 1618, "benchmark_id": "alignbench", "model_id": "qwen-2.5-7b-instruct", "score": 0.733, "normalized_score": 0.733, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "AlignBench v1.1 benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:14.548680+00:00", "updated_at": "2025-07-19T19:56:14.548680+00:00", "benchmark_name": "AlignBench" }, { "model_benchmark_id": 1455, "benchmark_id": "arena-hard", "model_id": "qwen-2.5-7b-instruct", "score": 0.52, "normalized_score": 0.52, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "Arena Hard benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.100766+00:00", "updated_at": "2025-07-19T19:56:14.100766+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 306, "benchmark_id": "gpqa", "model_id": "qwen-2.5-7b-instruct", "score": 0.364, "normalized_score": 0.364, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "GPQA benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.685965+00:00", "updated_at": "2025-07-19T19:56:11.685965+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 998, "benchmark_id": "gsm8k", "model_id": "qwen-2.5-7b-instruct", "score": 0.916, "normalized_score": 0.916, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "GSM8K benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.088027+00:00", "updated_at": "2025-07-19T19:56:13.088027+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 789, "benchmark_id": "humaneval", "model_id": "qwen-2.5-7b-instruct", "score": 0.848, "normalized_score": 0.848, 
"is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "HumanEval benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.651744+00:00", "updated_at": "2025-07-19T19:56:12.651744+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 621, "benchmark_id": "ifeval", "model_id": "qwen-2.5-7b-instruct", "score": 0.712, "normalized_score": 0.712, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "IFEval strict-prompt benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.278867+00:00", "updated_at": "2025-07-19T19:56:12.278867+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 753, "benchmark_id": "livebench", "model_id": "qwen-2.5-7b-instruct", "score": 0.359, "normalized_score": 0.359, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "LiveBench 0831 benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.584018+00:00", "updated_at": "2025-07-19T19:56:12.584018+00:00", "benchmark_name": "LiveBench" }, { "model_benchmark_id": 1126, "benchmark_id": "livecodebench", "model_id": "qwen-2.5-7b-instruct", "score": 0.287, "normalized_score": 0.287, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "LiveCodeBench 2305-2409 benchmark evaluation", "verification_provider_id": null, "verification_hardware": 
null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.352497+00:00", "updated_at": "2025-07-19T19:56:13.352497+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 408, "benchmark_id": "math", "model_id": "qwen-2.5-7b-instruct", "score": 0.755, "normalized_score": 0.755, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MATH benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.869960+00:00", "updated_at": "2025-07-19T19:56:11.869960+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1189, "benchmark_id": "mbpp", "model_id": "qwen-2.5-7b-instruct", "score": 0.792, "normalized_score": 0.792, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MBPP benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.506947+00:00", "updated_at": "2025-07-19T19:56:13.506947+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 198, "benchmark_id": "mmlu-pro", "model_id": "qwen-2.5-7b-instruct", "score": 0.563, "normalized_score": 0.563, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MMLU-Pro benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.479104+00:00", "updated_at": "2025-07-19T19:56:11.479104+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 735, "benchmark_id": "mmlu-redux", "model_id": "qwen-2.5-7b-instruct", "score": 
0.754, "normalized_score": 0.754, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MMLU-redux benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.545338+00:00", "updated_at": "2025-07-19T19:56:12.545338+00:00", "benchmark_name": "MMLU-Redux" }, { "model_benchmark_id": 1607, "benchmark_id": "mt-bench", "model_id": "qwen-2.5-7b-instruct", "score": 0.875, "normalized_score": 0.875, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MT-bench benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.523567+00:00", "updated_at": "2025-07-19T19:56:14.523567+00:00", "benchmark_name": "MT-Bench" }, { "model_benchmark_id": 646, "benchmark_id": "multipl-e", "model_id": "qwen-2.5-7b-instruct", "score": 0.704, "normalized_score": 0.704, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "verified_by_llmstats": false, "analysis_method": "MultiPL-E benchmark evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.325846+00:00", "updated_at": "2025-07-19T19:56:12.325846+00:00", "benchmark_name": "MultiPL-E" } ] ================================================ FILE: data/organizations/qwen/models/qwen-2.5-7b-instruct/model.json ================================================ { "model_id": "qwen-2.5-7b-instruct", "name": "Qwen2.5 7B Instruct", "organization_id": "qwen", "fine_tuned_from_model_id": null, "description": "Qwen2.5-7B-Instruct is an instruction-tuned 7B parameter 
language model that excels at following instructions, generating long texts (over 8K tokens), understanding structured data, and generating structured outputs like JSON. The model features enhanced capabilities in mathematics, coding, and multilingual support across 29+ languages including Chinese, English, French, Spanish, and more.", "release_date": "2024-09-19", "announcement_date": "2024-09-19", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 7610000000, "training_tokens": 18000000000000, "available_in_zeroeval": true, "source_api_ref": "https://www.alibabacloud.com/help/en/model-studio/developer-reference/use-qwen-by-calling-api", "source_playground": null, "source_paper": "https://arxiv.org/abs/2407.10671", "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5-llm/", "source_repo_link": "https://github.com/QwenLM/Qwen2.5", "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct", "created_at": "2025-07-19T19:49:05.642960+00:00", "updated_at": "2025-07-19T19:49:05.642960+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen-2.5-coder-32b-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 19, "benchmark_id": "arc-c", "model_id": "qwen-2.5-coder-32b-instruct", "score": 0.705, "normalized_score": 0.705, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.123905+00:00", "updated_at": "2025-07-19T19:56:11.123905+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1603, "benchmark_id": "bigcodebench-full", "model_id": "qwen-2.5-coder-32b-instruct", "score": 0.496, "normalized_score": 0.496, "is_self_reported": true, 
"self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.511653+00:00", "updated_at": "2025-07-19T19:56:14.511653+00:00", "benchmark_name": "BigCodeBench-Full" }, { "model_benchmark_id": 1604, "benchmark_id": "bigcodebench-hard", "model_id": "qwen-2.5-coder-32b-instruct", "score": 0.27, "normalized_score": 0.27, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.515099+00:00", "updated_at": "2025-07-19T19:56:14.515099+00:00", "benchmark_name": "BigCodeBench-Hard" }, { "model_benchmark_id": 991, "benchmark_id": "gsm8k", "model_id": "qwen-2.5-coder-32b-instruct", "score": 0.911, "normalized_score": 0.911, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.076453+00:00", "updated_at": "2025-07-19T19:56:13.076453+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 46, "benchmark_id": "hellaswag", "model_id": "qwen-2.5-coder-32b-instruct", "score": 0.83, "normalized_score": 0.83, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.180700+00:00", "updated_at": 
"2025-07-19T19:56:11.180700+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 783, "benchmark_id": "humaneval", "model_id": "qwen-2.5-coder-32b-instruct", "score": 0.927, "normalized_score": 0.927, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.641672+00:00", "updated_at": "2025-07-19T19:56:12.641672+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 1117, "benchmark_id": "livecodebench", "model_id": "qwen-2.5-coder-32b-instruct", "score": 0.314, "normalized_score": 0.314, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.329968+00:00", "updated_at": "2025-07-19T19:56:13.329968+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 401, "benchmark_id": "math", "model_id": "qwen-2.5-coder-32b-instruct", "score": 0.572, "normalized_score": 0.572, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.857514+00:00", "updated_at": "2025-07-19T19:56:11.857514+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1182, "benchmark_id": "mbpp", "model_id": "qwen-2.5-coder-32b-instruct", "score": 0.902, "normalized_score": 0.902, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": 
"pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.491369+00:00", "updated_at": "2025-07-19T19:56:13.491369+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 86, "benchmark_id": "mmlu", "model_id": "qwen-2.5-coder-32b-instruct", "score": 0.751, "normalized_score": 0.751, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.263438+00:00", "updated_at": "2025-07-19T19:56:11.263438+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 191, "benchmark_id": "mmlu-pro", "model_id": "qwen-2.5-coder-32b-instruct", "score": 0.504, "normalized_score": 0.504, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.466410+00:00", "updated_at": "2025-07-19T19:56:11.466410+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 729, "benchmark_id": "mmlu-redux", "model_id": "qwen-2.5-coder-32b-instruct", "score": 0.775, "normalized_score": 0.775, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.535302+00:00", "updated_at": "2025-07-19T19:56:12.535302+00:00", "benchmark_name": "MMLU-Redux" }, { "model_benchmark_id": 1594, "benchmark_id": "theoremqa", "model_id": "qwen-2.5-coder-32b-instruct", 
"score": 0.431, "normalized_score": 0.431, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.485084+00:00", "updated_at": "2025-07-19T19:56:14.485084+00:00", "benchmark_name": "TheoremQA" }, { "model_benchmark_id": 136, "benchmark_id": "truthfulqa", "model_id": "qwen-2.5-coder-32b-instruct", "score": 0.542, "normalized_score": 0.542, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.351250+00:00", "updated_at": "2025-07-19T19:56:11.351250+00:00", "benchmark_name": "TruthfulQA" }, { "model_benchmark_id": 1064, "benchmark_id": "winogrande", "model_id": "qwen-2.5-coder-32b-instruct", "score": 0.808, "normalized_score": 0.808, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.219435+00:00", "updated_at": "2025-07-19T19:56:13.219435+00:00", "benchmark_name": "Winogrande" } ] ================================================ FILE: data/organizations/qwen/models/qwen-2.5-coder-32b-instruct/model.json ================================================ { "model_id": "qwen-2.5-coder-32b-instruct", "name": "Qwen2.5-Coder 32B Instruct", "organization_id": "qwen", "fine_tuned_from_model_id": "qwen-2.5-32b-instruct", "description": "Qwen2.5-Coder is a specialized coding model trained on 5.5 trillion tokens of code data, 
supporting 92 programming languages with a 128K context window. It excels in code generation, completion, repair, and multi-programming tasks while maintaining strong performance in mathematics and general capabilities.", "release_date": "2024-09-19", "announcement_date": "2024-09-19", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 32000000000, "training_tokens": 5500000000000, "available_in_zeroeval": true, "source_api_ref": "https://www.alibabacloud.com/help/en/model-studio/developer-reference/use-qwen-by-calling-api", "source_playground": null, "source_paper": "https://arxiv.org/abs/2409.12186", "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5-coder/", "source_repo_link": "https://github.com/QwenLM/Qwen2.5-Coder", "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-Coder-32B", "created_at": "2025-07-19T19:49:05.882455+00:00", "updated_at": "2025-07-19T19:49:05.882455+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen-2.5-coder-7b-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 1624, "benchmark_id": "aider", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.556, "normalized_score": 0.556, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.569369+00:00", "updated_at": "2025-07-19T19:56:14.569369+00:00", "benchmark_name": "Aider" }, { "model_benchmark_id": 20, "benchmark_id": "arc-c", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.609, "normalized_score": 0.609, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.126002+00:00", "updated_at": "2025-07-19T19:56:11.126002+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 1434, "benchmark_id": "bigcodebench", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.41, "normalized_score": 0.41, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.052666+00:00", "updated_at": "2025-07-19T19:56:14.052666+00:00", "benchmark_name": "BigCodeBench" }, { "model_benchmark_id": 1620, "benchmark_id": "cruxeval-input-cot", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.565, "normalized_score": 0.565, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.554528+00:00", "updated_at": "2025-07-19T19:56:14.554528+00:00", "benchmark_name": "CRUXEval-Input-CoT" }, { "model_benchmark_id": 1621, "benchmark_id": "cruxeval-output-cot", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.56, "normalized_score": 0.56, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.558251+00:00", "updated_at": "2025-07-19T19:56:14.558251+00:00", "benchmark_name": "CRUXEval-Output-CoT" }, { "model_benchmark_id": 993, "benchmark_id": "gsm8k", 
"model_id": "qwen-2.5-coder-7b-instruct", "score": 0.839, "normalized_score": 0.839, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.080381+00:00", "updated_at": "2025-07-19T19:56:13.080381+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 47, "benchmark_id": "hellaswag", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.768, "normalized_score": 0.768, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.182466+00:00", "updated_at": "2025-07-19T19:56:11.182466+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 785, "benchmark_id": "humaneval", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.884, "normalized_score": 0.884, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.644936+00:00", "updated_at": "2025-07-19T19:56:12.644936+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 1121, "benchmark_id": "livecodebench", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.182, "normalized_score": 0.182, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": 
null, "created_at": "2025-07-19T19:56:13.340042+00:00", "updated_at": "2025-07-19T19:56:13.340042+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 403, "benchmark_id": "math", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.466, "normalized_score": 0.466, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.860821+00:00", "updated_at": "2025-07-19T19:56:11.860821+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1184, "benchmark_id": "mbpp", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.835, "normalized_score": 0.835, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.495284+00:00", "updated_at": "2025-07-19T19:56:13.495284+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 88, "benchmark_id": "mmlu", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.676, "normalized_score": 0.676, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.267319+00:00", "updated_at": "2025-07-19T19:56:11.267319+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 1623, "benchmark_id": "mmlu-base", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.68, "normalized_score": 0.68, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", 
"verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.565292+00:00", "updated_at": "2025-07-19T19:56:14.565292+00:00", "benchmark_name": "MMLU-Base" }, { "model_benchmark_id": 193, "benchmark_id": "mmlu-pro", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.401, "normalized_score": 0.401, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.469384+00:00", "updated_at": "2025-07-19T19:56:11.469384+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 730, "benchmark_id": "mmlu-redux", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.666, "normalized_score": 0.666, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.537049+00:00", "updated_at": "2025-07-19T19:56:12.537049+00:00", "benchmark_name": "MMLU-Redux" }, { "model_benchmark_id": 1622, "benchmark_id": "stem", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.34, "normalized_score": 0.34, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.561469+00:00", "updated_at": "2025-07-19T19:56:14.561469+00:00", "benchmark_name": "STEM" }, { "model_benchmark_id": 1596, "benchmark_id": 
"theoremqa", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.34, "normalized_score": 0.34, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.489921+00:00", "updated_at": "2025-07-19T19:56:14.489921+00:00", "benchmark_name": "TheoremQA" }, { "model_benchmark_id": 137, "benchmark_id": "truthfulqa", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.506, "normalized_score": 0.506, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.353301+00:00", "updated_at": "2025-07-19T19:56:11.353301+00:00", "benchmark_name": "TruthfulQA" }, { "model_benchmark_id": 1065, "benchmark_id": "winogrande", "model_id": "qwen-2.5-coder-7b-instruct", "score": 0.729, "normalized_score": 0.729, "is_self_reported": true, "self_reported_source_link": "https://arxiv.org/abs/2409.12186", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.221874+00:00", "updated_at": "2025-07-19T19:56:13.221874+00:00", "benchmark_name": "Winogrande" } ] ================================================ FILE: data/organizations/qwen/models/qwen-2.5-coder-7b-instruct/model.json ================================================ { "model_id": "qwen-2.5-coder-7b-instruct", "name": "Qwen2.5-Coder 7B Instruct", "organization_id": "qwen", "fine_tuned_from_model_id": "qwen-2.5-7b-instruct", "description": "Qwen2.5-Coder is a specialized coding model 
trained on 5.5 trillion tokens of code data, supporting 92 programming languages with a 128K context window. It excels in code generation, completion, and repair while maintaining strong performance in math and general tasks. The model demonstrates exceptional capabilities in multi-programming language tasks and code reasoning.", "release_date": "2024-09-19", "announcement_date": "2024-09-19", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 7000000000, "training_tokens": 5500000000000, "available_in_zeroeval": true, "source_api_ref": "https://www.alibabacloud.com/help/en/model-studio/developer-reference/use-qwen-by-calling-api", "source_playground": null, "source_paper": "https://arxiv.org/abs/2409.12186", "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5-coder/", "source_repo_link": "https://github.com/QwenLM/Qwen2", "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-7B-Coder", "created_at": "2025-07-19T19:49:05.890300+00:00", "updated_at": "2025-07-19T19:49:05.890300+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen2-72b-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 22, "benchmark_id": "arc-c", "model_id": "qwen2-72b-instruct", "score": 0.689, "normalized_score": 0.689, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.129146+00:00", "updated_at": "2025-07-19T19:56:11.129146+00:00", "benchmark_name": "ARC-C" }, { "model_benchmark_id": 973, "benchmark_id": "bbh", "model_id": "qwen2-72b-instruct", "score": 0.824, "normalized_score": 0.824, "is_self_reported": true, "self_reported_source_link": 
"https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.045120+00:00", "updated_at": "2025-07-19T19:56:13.045120+00:00", "benchmark_name": "BBH" }, { "model_benchmark_id": 437, "benchmark_id": "c-eval", "model_id": "qwen2-72b-instruct", "score": 0.838, "normalized_score": 0.838, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.926225+00:00", "updated_at": "2025-07-19T19:56:11.926225+00:00", "benchmark_name": "C-Eval" }, { "model_benchmark_id": 1749, "benchmark_id": "cmmlu", "model_id": "qwen2-72b-instruct", "score": 0.901, "normalized_score": 0.901, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.943893+00:00", "updated_at": "2025-07-19T19:56:14.943893+00:00", "benchmark_name": "CMMLU" }, { "model_benchmark_id": 372, "benchmark_id": "evalplus", "model_id": "qwen2-72b-instruct", "score": 0.79, "normalized_score": 0.79, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.802955+00:00", "updated_at": "2025-07-19T19:56:11.802955+00:00", "benchmark_name": "EvalPlus" }, { "model_benchmark_id": 307, 
"benchmark_id": "gpqa", "model_id": "qwen2-72b-instruct", "score": 0.424, "normalized_score": 0.424, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.687633+00:00", "updated_at": "2025-07-19T19:56:11.687633+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 999, "benchmark_id": "gsm8k", "model_id": "qwen2-72b-instruct", "score": 0.911, "normalized_score": 0.911, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.089706+00:00", "updated_at": "2025-07-19T19:56:13.089706+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 48, "benchmark_id": "hellaswag", "model_id": "qwen2-72b-instruct", "score": 0.876, "normalized_score": 0.876, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.184833+00:00", "updated_at": "2025-07-19T19:56:11.184833+00:00", "benchmark_name": "HellaSwag" }, { "model_benchmark_id": 790, "benchmark_id": "humaneval", "model_id": "qwen2-72b-instruct", "score": 0.86, "normalized_score": 0.86, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, 
"created_at": "2025-07-19T19:56:12.653267+00:00", "updated_at": "2025-07-19T19:56:12.653267+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 409, "benchmark_id": "math", "model_id": "qwen2-72b-instruct", "score": 0.597, "normalized_score": 0.597, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.871582+00:00", "updated_at": "2025-07-19T19:56:11.871582+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1190, "benchmark_id": "mbpp", "model_id": "qwen2-72b-instruct", "score": 0.802, "normalized_score": 0.802, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.508406+00:00", "updated_at": "2025-07-19T19:56:13.508406+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 91, "benchmark_id": "mmlu", "model_id": "qwen2-72b-instruct", "score": 0.823, "normalized_score": 0.823, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.272629+00:00", "updated_at": "2025-07-19T19:56:11.272629+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 199, "benchmark_id": "mmlu-pro", "model_id": "qwen2-72b-instruct", "score": 0.644, "normalized_score": 0.644, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, 
"analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.480879+00:00", "updated_at": "2025-07-19T19:56:11.480879+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 647, "benchmark_id": "multipl-e", "model_id": "qwen2-72b-instruct", "score": 0.692, "normalized_score": 0.692, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.327331+00:00", "updated_at": "2025-07-19T19:56:12.327331+00:00", "benchmark_name": "MultiPL-E" }, { "model_benchmark_id": 1598, "benchmark_id": "theoremqa", "model_id": "qwen2-72b-instruct", "score": 0.444, "normalized_score": 0.444, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.494165+00:00", "updated_at": "2025-07-19T19:56:14.494165+00:00", "benchmark_name": "TheoremQA" }, { "model_benchmark_id": 139, "benchmark_id": "truthfulqa", "model_id": "qwen2-72b-instruct", "score": 0.548, "normalized_score": 0.548, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.356602+00:00", "updated_at": "2025-07-19T19:56:11.356602+00:00", "benchmark_name": "TruthfulQA" }, { "model_benchmark_id": 151, "benchmark_id": "winogrande", "model_id": 
"qwen2-72b-instruct", "score": 0.851, "normalized_score": 0.851, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.386216+00:00", "updated_at": "2025-07-19T19:56:11.386216+00:00", "benchmark_name": "Winogrande" } ] ================================================ FILE: data/organizations/qwen/models/qwen2-72b-instruct/model.json ================================================ { "model_id": "qwen2-72b-instruct", "name": "Qwen2 72B Instruct", "organization_id": "qwen", "fine_tuned_from_model_id": null, "description": "Qwen2-72B-Instruct is an instruction-tuned language model with 72 billion parameters, supporting a context length of up to 131,072 tokens. It's part of the new Qwen2 series, which has surpassed most open-source models and demonstrates competitiveness against proprietary models across various benchmarks.", "release_date": "2024-07-23", "announcement_date": "2024-07-23", "license_id": "tongyi_qianwen", "multimodal": false, "knowledge_cutoff": null, "param_count": 72000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/Qwen/Qwen2-72B", "source_playground": "https://huggingface.co/Qwen/Qwen2-72B", "source_paper": "https://arxiv.org/abs/2309.00071", "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2/", "source_repo_link": "https://huggingface.co/Qwen/Qwen2-72B", "source_weights_link": "https://huggingface.co/Qwen/Qwen2-72B", "created_at": "2025-07-19T19:49:05.650844+00:00", "updated_at": "2025-07-19T19:49:05.650844+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen2-7b-instruct/benchmarks.json ================================================ [ { 
"model_benchmark_id": 1616, "benchmark_id": "alignbench", "model_id": "qwen2-7b-instruct", "score": 0.721, "normalized_score": 0.721, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.544441+00:00", "updated_at": "2025-07-19T19:56:14.544441+00:00", "benchmark_name": "AlignBench" }, { "model_benchmark_id": 436, "benchmark_id": "c-eval", "model_id": "qwen2-7b-instruct", "score": 0.772, "normalized_score": 0.772, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.924104+00:00", "updated_at": "2025-07-19T19:56:11.924104+00:00", "benchmark_name": "C-Eval" }, { "model_benchmark_id": 370, "benchmark_id": "evalplus", "model_id": "qwen2-7b-instruct", "score": 0.703, "normalized_score": 0.703, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.799094+00:00", "updated_at": "2025-07-19T19:56:11.799094+00:00", "benchmark_name": "EvalPlus" }, { "model_benchmark_id": 299, "benchmark_id": "gpqa", "model_id": "qwen2-7b-instruct", "score": 0.253, "normalized_score": 0.253, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.674412+00:00", "updated_at": "2025-07-19T19:56:11.674412+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 992, "benchmark_id": "gsm8k", "model_id": "qwen2-7b-instruct", "score": 0.823, "normalized_score": 0.823, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.078833+00:00", "updated_at": "2025-07-19T19:56:13.078833+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 784, "benchmark_id": "humaneval", "model_id": "qwen2-7b-instruct", "score": 0.799, "normalized_score": 0.799, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.643272+00:00", "updated_at": "2025-07-19T19:56:12.643272+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 1119, "benchmark_id": "livecodebench", "model_id": "qwen2-7b-instruct", "score": 0.266, "normalized_score": 0.266, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.335377+00:00", "updated_at": "2025-07-19T19:56:13.335377+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 402, "benchmark_id": "math", "model_id": "qwen2-7b-instruct", "score": 0.496, "normalized_score": 0.496, 
"is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.859120+00:00", "updated_at": "2025-07-19T19:56:11.859120+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1183, "benchmark_id": "mbpp", "model_id": "qwen2-7b-instruct", "score": 0.672, "normalized_score": 0.672, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.493272+00:00", "updated_at": "2025-07-19T19:56:13.493272+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 87, "benchmark_id": "mmlu", "model_id": "qwen2-7b-instruct", "score": 0.705, "normalized_score": 0.705, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.265352+00:00", "updated_at": "2025-07-19T19:56:11.265352+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 192, "benchmark_id": "mmlu-pro", "model_id": "qwen2-7b-instruct", "score": 0.441, "normalized_score": 0.441, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.467957+00:00", "updated_at": 
"2025-07-19T19:56:11.467957+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 1605, "benchmark_id": "mt-bench", "model_id": "qwen2-7b-instruct", "score": 0.841, "normalized_score": 0.841, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.519120+00:00", "updated_at": "2025-07-19T19:56:14.519120+00:00", "benchmark_name": "MT-Bench" }, { "model_benchmark_id": 641, "benchmark_id": "multipl-e", "model_id": "qwen2-7b-instruct", "score": 0.591, "normalized_score": 0.591, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.317803+00:00", "updated_at": "2025-07-19T19:56:12.317803+00:00", "benchmark_name": "MultiPL-E" }, { "model_benchmark_id": 1595, "benchmark_id": "theoremqa", "model_id": "qwen2-7b-instruct", "score": 0.253, "normalized_score": 0.253, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.487702+00:00", "updated_at": "2025-07-19T19:56:14.487702+00:00", "benchmark_name": "TheoremQA" } ] ================================================ FILE: data/organizations/qwen/models/qwen2-7b-instruct/model.json ================================================ { "model_id": "qwen2-7b-instruct", "name": "Qwen2 7B Instruct", "organization_id": "qwen", 
"fine_tuned_from_model_id": null, "description": "Qwen2-7B-Instruct is an instruction-tuned language model with 7 billion parameters, supporting a context length of up to 131,072 tokens.", "release_date": "2024-07-23", "announcement_date": "2024-07-23", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 7620000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "source_playground": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "source_paper": "https://arxiv.org/abs/2309.00071", "source_scorecard_blog_link": null, "source_repo_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "source_weights_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct", "created_at": "2025-07-19T19:49:05.612662+00:00", "updated_at": "2025-07-19T19:49:05.612662+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen2-vl-72b/benchmarks.json ================================================ [ { "model_benchmark_id": 864, "benchmark_id": "chartqa", "model_id": "qwen2-vl-72b", "score": 0.883, "normalized_score": 0.883, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2", "verified_by_llmstats": false, "analysis_method": "score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.806635+00:00", "updated_at": "2025-07-19T19:56:12.806635+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 1629, "benchmark_id": "docvqatest", "model_id": "qwen2-vl-72b", "score": 0.965, "normalized_score": 0.965, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2", "verified_by_llmstats": false, "analysis_method": "score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, 
"created_at": "2025-07-19T19:56:14.582058+00:00", "updated_at": "2025-07-19T19:56:14.582058+00:00", "benchmark_name": "DocVQAtest" }, { "model_benchmark_id": 923, "benchmark_id": "egoschema", "model_id": "qwen2-vl-72b", "score": 0.779, "normalized_score": 0.779, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2", "verified_by_llmstats": false, "analysis_method": "score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.928297+00:00", "updated_at": "2025-07-19T19:56:12.928297+00:00", "benchmark_name": "EgoSchema" }, { "model_benchmark_id": 1630, "benchmark_id": "infovqatest", "model_id": "qwen2-vl-72b", "score": 0.845, "normalized_score": 0.845, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2", "verified_by_llmstats": false, "analysis_method": "score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.586477+00:00", "updated_at": "2025-07-19T19:56:14.586477+00:00", "benchmark_name": "InfoVQAtest" }, { "model_benchmark_id": 1269, "benchmark_id": "mathvista-mini", "model_id": "qwen2-vl-72b", "score": 0.705, "normalized_score": 0.705, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2", "verified_by_llmstats": false, "analysis_method": "score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.662750+00:00", "updated_at": "2025-07-19T19:56:13.662750+00:00", "benchmark_name": "MathVista-Mini" }, { "model_benchmark_id": 1639, "benchmark_id": "mmbench-test", "model_id": "qwen2-vl-72b", "score": 0.865, "normalized_score": 0.865, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2", "verified_by_llmstats": false, 
"analysis_method": "score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.610292+00:00", "updated_at": "2025-07-19T19:56:14.610292+00:00", "benchmark_name": "MMBench_test" }, { "model_benchmark_id": 1532, "benchmark_id": "mmmu-pro", "model_id": "qwen2-vl-72b", "score": 0.462, "normalized_score": 0.462, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2", "verified_by_llmstats": false, "analysis_method": "score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.292395+00:00", "updated_at": "2025-07-19T19:56:14.292395+00:00", "benchmark_name": "MMMU-Pro" }, { "model_benchmark_id": 1628, "benchmark_id": "mmmuval", "model_id": "qwen2-vl-72b", "score": 0.645, "normalized_score": 0.645, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2", "verified_by_llmstats": false, "analysis_method": "score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.578458+00:00", "updated_at": "2025-07-19T19:56:14.578458+00:00", "benchmark_name": "MMMUval" }, { "model_benchmark_id": 1640, "benchmark_id": "mmvetgpt4turbo", "model_id": "qwen2-vl-72b", "score": 0.74, "normalized_score": 0.74, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2", "verified_by_llmstats": false, "analysis_method": "score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.613913+00:00", "updated_at": "2025-07-19T19:56:14.613913+00:00", "benchmark_name": "MMVetGPT4Turbo" }, { "model_benchmark_id": 1631, "benchmark_id": "mtvqa", "model_id": "qwen2-vl-72b", "score": 0.309, 
"normalized_score": 0.309, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2", "verified_by_llmstats": false, "analysis_method": "score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.590936+00:00", "updated_at": "2025-07-19T19:56:14.590936+00:00", "benchmark_name": "MTVQA" }, { "model_benchmark_id": 1641, "benchmark_id": "mvbench", "model_id": "qwen2-vl-72b", "score": 0.736, "normalized_score": 0.736, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2", "verified_by_llmstats": false, "analysis_method": "score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.618622+00:00", "updated_at": "2025-07-19T19:56:14.618622+00:00", "benchmark_name": "MVBench" }, { "model_benchmark_id": 1539, "benchmark_id": "ocrbench", "model_id": "qwen2-vl-72b", "score": 0.877, "normalized_score": 0.877, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2", "verified_by_llmstats": false, "analysis_method": "score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.311748+00:00", "updated_at": "2025-07-19T19:56:14.311748+00:00", "benchmark_name": "OCRBench" }, { "model_benchmark_id": 1633, "benchmark_id": "realworldqa", "model_id": "qwen2-vl-72b", "score": 0.778, "normalized_score": 0.778, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2", "verified_by_llmstats": false, "analysis_method": "score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.597450+00:00", "updated_at": "2025-07-19T19:56:14.597450+00:00", 
"benchmark_name": "RealWorldQA" }, { "model_benchmark_id": 909, "benchmark_id": "textvqa", "model_id": "qwen2-vl-72b", "score": 0.855, "normalized_score": 0.855, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2", "verified_by_llmstats": false, "analysis_method": "score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.894922+00:00", "updated_at": "2025-07-19T19:56:12.894922+00:00", "benchmark_name": "TextVQA" }, { "model_benchmark_id": 1632, "benchmark_id": "vcr-en-easy", "model_id": "qwen2-vl-72b", "score": 0.9193, "normalized_score": 0.9193, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2", "verified_by_llmstats": false, "analysis_method": "score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.594379+00:00", "updated_at": "2025-07-19T19:56:14.594379+00:00", "benchmark_name": "VCR_en_easy" } ] ================================================ FILE: data/organizations/qwen/models/qwen2-vl-72b/model.json ================================================ { "model_id": "qwen2-vl-72b", "name": "Qwen2-VL-72B-Instruct", "organization_id": "qwen", "fine_tuned_from_model_id": null, "description": "An instruction-tuned, large multimodal model that excels at visual understanding and step-by-step reasoning. 
It supports image and video input, with dynamic resolution handling and improved positional embeddings (M-ROPE), enabling advanced capabilities such as complex problem solving, multilingual text recognition in images, and agent-like interactions in video contexts.", "release_date": "2024-08-29", "announcement_date": "2024-08-29", "license_id": "tongyi_qianwen", "multimodal": true, "knowledge_cutoff": "2023-06-30", "param_count": 73400000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct", "source_playground": null, "source_paper": "https://arxiv.org/abs/2409.12191", "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2-vl/", "source_repo_link": "https://github.com/QwenLM/Qwen2-VL", "source_weights_link": "https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct", "created_at": "2025-07-19T19:49:05.619575+00:00", "updated_at": "2025-07-19T19:49:05.619575+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen2.5-omni-7b/benchmarks.json ================================================ [ { "model_benchmark_id": 1254, "benchmark_id": "ai2d", "model_id": "qwen2.5-omni-7b", "score": 0.832, "normalized_score": 0.832, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.633399+00:00", "updated_at": "2025-07-19T19:56:13.633399+00:00", "benchmark_name": "AI2D" }, { "model_benchmark_id": 866, "benchmark_id": "chartqa", "model_id": "qwen2.5-omni-7b", "score": 0.853, "normalized_score": 0.853, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": 
null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.809953+00:00", "updated_at": "2025-07-19T19:56:12.809953+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 1718, "benchmark_id": "common-voice-15", "model_id": "qwen2.5-omni-7b", "score": 0.076, "normalized_score": 0.076, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "WER", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.833534+00:00", "updated_at": "2025-07-19T19:56:14.833534+00:00", "benchmark_name": "Common Voice 15" }, { "model_benchmark_id": 1717, "benchmark_id": "covost2-en-zh", "model_id": "qwen2.5-omni-7b", "score": 0.414, "normalized_score": 0.414, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "BLEU", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.828460+00:00", "updated_at": "2025-07-19T19:56:14.828460+00:00", "benchmark_name": "CoVoST2 en-zh" }, { "model_benchmark_id": 1719, "benchmark_id": "crperelation", "model_id": "qwen2.5-omni-7b", "score": 0.765, "normalized_score": 0.765, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.837425+00:00", "updated_at": "2025-07-19T19:56:14.837425+00:00", "benchmark_name": "CRPErelation" }, { "model_benchmark_id": 887, "benchmark_id": "docvqa", "model_id": "qwen2.5-omni-7b", "score": 0.952, "normalized_score": 
0.952, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.846061+00:00", "updated_at": "2025-07-19T19:56:12.846061+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 924, "benchmark_id": "egoschema", "model_id": "qwen2.5-omni-7b", "score": 0.686, "normalized_score": 0.686, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.931056+00:00", "updated_at": "2025-07-19T19:56:12.931056+00:00", "benchmark_name": "EgoSchema" }, { "model_benchmark_id": 1401, "benchmark_id": "fleurs", "model_id": "qwen2.5-omni-7b", "score": 0.041, "normalized_score": 0.041, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "WER", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.953081+00:00", "updated_at": "2025-07-19T19:56:13.953081+00:00", "benchmark_name": "FLEURS" }, { "model_benchmark_id": 1720, "benchmark_id": "giantsteps-tempo", "model_id": "qwen2.5-omni-7b", "score": 0.88, "normalized_score": 0.88, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.841583+00:00", "updated_at": 
"2025-07-19T19:56:14.841583+00:00", "benchmark_name": "GiantSteps Tempo" }, { "model_benchmark_id": 305, "benchmark_id": "gpqa", "model_id": "qwen2.5-omni-7b", "score": 0.308, "normalized_score": 0.308, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.684328+00:00", "updated_at": "2025-07-19T19:56:11.684328+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 997, "benchmark_id": "gsm8k", "model_id": "qwen2.5-omni-7b", "score": 0.887, "normalized_score": 0.887, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.086524+00:00", "updated_at": "2025-07-19T19:56:13.086524+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 788, "benchmark_id": "humaneval", "model_id": "qwen2.5-omni-7b", "score": 0.787, "normalized_score": 0.787, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.650243+00:00", "updated_at": "2025-07-19T19:56:12.650243+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 752, "benchmark_id": "livebench", "model_id": "qwen2.5-omni-7b", "score": 0.296, "normalized_score": 0.296, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": 
null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.581448+00:00", "updated_at": "2025-07-19T19:56:12.581448+00:00", "benchmark_name": "LiveBench" }, { "model_benchmark_id": 407, "benchmark_id": "math", "model_id": "qwen2.5-omni-7b", "score": 0.715, "normalized_score": 0.715, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.867189+00:00", "updated_at": "2025-07-19T19:56:11.867189+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1676, "benchmark_id": "mathvision", "model_id": "qwen2.5-omni-7b", "score": 0.25, "normalized_score": 0.25, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.702750+00:00", "updated_at": "2025-07-19T19:56:14.702750+00:00", "benchmark_name": "MathVision" }, { "model_benchmark_id": 527, "benchmark_id": "mathvista", "model_id": "qwen2.5-omni-7b", "score": 0.679, "normalized_score": 0.679, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.094090+00:00", "updated_at": "2025-07-19T19:56:12.094090+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 1188, "benchmark_id": "mbpp", "model_id": "qwen2.5-omni-7b", "score": 0.732, "normalized_score": 0.732, "is_self_reported": true, 
"self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.504920+00:00", "updated_at": "2025-07-19T19:56:13.504920+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1721, "benchmark_id": "meld", "model_id": "qwen2.5-omni-7b", "score": 0.57, "normalized_score": 0.57, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.845437+00:00", "updated_at": "2025-07-19T19:56:14.845437+00:00", "benchmark_name": "Meld" }, { "model_benchmark_id": 1722, "benchmark_id": "mmau", "model_id": "qwen2.5-omni-7b", "score": 0.656, "normalized_score": 0.656, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.849392+00:00", "updated_at": "2025-07-19T19:56:14.849392+00:00", "benchmark_name": "MMAU" }, { "model_benchmark_id": 1723, "benchmark_id": "mmau-music", "model_id": "qwen2.5-omni-7b", "score": 0.6916, "normalized_score": 0.6916, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.854098+00:00", "updated_at": "2025-07-19T19:56:14.854098+00:00", "benchmark_name": "MMAU Music" }, { 
"model_benchmark_id": 1724, "benchmark_id": "mmau-sound", "model_id": "qwen2.5-omni-7b", "score": 0.6787, "normalized_score": 0.6787, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.862523+00:00", "updated_at": "2025-07-19T19:56:14.862523+00:00", "benchmark_name": "MMAU Sound" }, { "model_benchmark_id": 1725, "benchmark_id": "mmau-speech", "model_id": "qwen2.5-omni-7b", "score": 0.5976, "normalized_score": 0.5976, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.867393+00:00", "updated_at": "2025-07-19T19:56:14.867393+00:00", "benchmark_name": "MMAU Speech" }, { "model_benchmark_id": 1726, "benchmark_id": "mmbench-v1.1", "model_id": "qwen2.5-omni-7b", "score": 0.818, "normalized_score": 0.818, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.871500+00:00", "updated_at": "2025-07-19T19:56:14.871500+00:00", "benchmark_name": "MMBench-V1.1" }, { "model_benchmark_id": 1730, "benchmark_id": "mme-realworld", "model_id": "qwen2.5-omni-7b", "score": 0.616, "normalized_score": 0.616, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.879804+00:00", "updated_at": "2025-07-19T19:56:14.879804+00:00", "benchmark_name": "MME-RealWorld" }, { "model_benchmark_id": 197, "benchmark_id": "mmlu-pro", "model_id": "qwen2.5-omni-7b", "score": 0.47, "normalized_score": 0.47, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.477278+00:00", "updated_at": "2025-07-19T19:56:11.477278+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 734, "benchmark_id": "mmlu-redux", "model_id": "qwen2.5-omni-7b", "score": 0.71, "normalized_score": 0.71, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.544013+00:00", "updated_at": "2025-07-19T19:56:12.544013+00:00", "benchmark_name": "MMLU-Redux" }, { "model_benchmark_id": 1731, "benchmark_id": "mm-mt-bench", "model_id": "qwen2.5-omni-7b", "score": 0.06, "normalized_score": 0.06, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.883880+00:00", "updated_at": "2025-07-19T19:56:14.883880+00:00", "benchmark_name": "MM-MT-Bench" }, { "model_benchmark_id": 571, "benchmark_id": "mmmu", "model_id": "qwen2.5-omni-7b", "score": 0.592, "normalized_score": 0.592, "is_self_reported": true, "self_reported_source_link": 
"https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.175251+00:00", "updated_at": "2025-07-19T19:56:12.175251+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1534, "benchmark_id": "mmmu-pro", "model_id": "qwen2.5-omni-7b", "score": 0.366, "normalized_score": 0.366, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.296124+00:00", "updated_at": "2025-07-19T19:56:14.296124+00:00", "benchmark_name": "MMMU-Pro" }, { "model_benchmark_id": 1660, "benchmark_id": "mmstar", "model_id": "qwen2.5-omni-7b", "score": 0.64, "normalized_score": 0.64, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.664551+00:00", "updated_at": "2025-07-19T19:56:14.664551+00:00", "benchmark_name": "MMStar" }, { "model_benchmark_id": 1734, "benchmark_id": "muirbench", "model_id": "qwen2.5-omni-7b", "score": 0.592, "normalized_score": 0.592, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.891075+00:00", "updated_at": "2025-07-19T19:56:14.891075+00:00", "benchmark_name": "MuirBench" }, { "model_benchmark_id": 645, 
"benchmark_id": "multipl-e", "model_id": "qwen2.5-omni-7b", "score": 0.658, "normalized_score": 0.658, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.324318+00:00", "updated_at": "2025-07-19T19:56:12.324318+00:00", "benchmark_name": "MultiPL-E" }, { "model_benchmark_id": 1735, "benchmark_id": "musiccaps", "model_id": "qwen2.5-omni-7b", "score": 0.328, "normalized_score": 0.328, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.894342+00:00", "updated_at": "2025-07-19T19:56:14.894342+00:00", "benchmark_name": "MusicCaps" }, { "model_benchmark_id": 1643, "benchmark_id": "mvbench", "model_id": "qwen2.5-omni-7b", "score": 0.703, "normalized_score": 0.703, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.621841+00:00", "updated_at": "2025-07-19T19:56:14.621841+00:00", "benchmark_name": "MVBench" }, { "model_benchmark_id": 1736, "benchmark_id": "nmos", "model_id": "qwen2.5-omni-7b", "score": 0.0451, "normalized_score": 0.0451, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-omni/", "verified_by_llmstats": false, "analysis_method": "NMOS", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": 
null, "created_at": "2025-07-19T19:56:14.897653+00:00", "updated_at": "2025-07-19T19:56:14.897653+00:00", "benchmark_name": "NMOS" }, { "model_benchmark_id": 1737, "benchmark_id": "ocrbench-v2", "model_id": "qwen2.5-omni-7b", "score": 0.578, "normalized_score": 0.578, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.901546+00:00", "updated_at": "2025-07-19T19:56:14.901546+00:00", "benchmark_name": "OCRBench_V2" }, { "model_benchmark_id": 1738, "benchmark_id": "odinw", "model_id": "qwen2.5-omni-7b", "score": 0.424, "normalized_score": 0.424, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.905294+00:00", "updated_at": "2025-07-19T19:56:14.905294+00:00", "benchmark_name": "ODinW" }, { "model_benchmark_id": 1739, "benchmark_id": "omnibench", "model_id": "qwen2.5-omni-7b", "score": 0.5613, "normalized_score": 0.5613, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.909979+00:00", "updated_at": "2025-07-19T19:56:14.909979+00:00", "benchmark_name": "OmniBench" }, { "model_benchmark_id": 1740, "benchmark_id": "omnibench-music", "model_id": "qwen2.5-omni-7b", "score": 0.5283, "normalized_score": 0.5283, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", 
"verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.913742+00:00", "updated_at": "2025-07-19T19:56:14.913742+00:00", "benchmark_name": "OmniBench Music" }, { "model_benchmark_id": 1741, "benchmark_id": "pointgrounding", "model_id": "qwen2.5-omni-7b", "score": 0.665, "normalized_score": 0.665, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.918183+00:00", "updated_at": "2025-07-19T19:56:14.918183+00:00", "benchmark_name": "PointGrounding" }, { "model_benchmark_id": 1634, "benchmark_id": "realworldqa", "model_id": "qwen2.5-omni-7b", "score": 0.703, "normalized_score": 0.703, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.599392+00:00", "updated_at": "2025-07-19T19:56:14.599392+00:00", "benchmark_name": "RealWorldQA" }, { "model_benchmark_id": 911, "benchmark_id": "textvqa", "model_id": "qwen2.5-omni-7b", "score": 0.844, "normalized_score": 0.844, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.899579+00:00", "updated_at": "2025-07-19T19:56:12.899579+00:00", "benchmark_name": "TextVQA" }, { "model_benchmark_id": 1685, "benchmark_id": 
"videomme-w-sub.", "model_id": "qwen2.5-omni-7b", "score": 0.724, "normalized_score": 0.724, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.727965+00:00", "updated_at": "2025-07-19T19:56:14.727965+00:00", "benchmark_name": "VideoMME w sub." }, { "model_benchmark_id": 1742, "benchmark_id": "vocalsound", "model_id": "qwen2.5-omni-7b", "score": 0.939, "normalized_score": 0.939, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.921505+00:00", "updated_at": "2025-07-19T19:56:14.921505+00:00", "benchmark_name": "VocalSound" }, { "model_benchmark_id": 1743, "benchmark_id": "voicebench-avg", "model_id": "qwen2.5-omni-7b", "score": 0.7412, "normalized_score": 0.7412, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.925208+00:00", "updated_at": "2025-07-19T19:56:14.925208+00:00", "benchmark_name": "VoiceBench Avg" } ] ================================================ FILE: data/organizations/qwen/models/qwen2.5-omni-7b/model.json ================================================ { "model_id": "qwen2.5-omni-7b", "name": "Qwen2.5-Omni-7B", "organization_id": "qwen", "fine_tuned_from_model_id": null, "description": "Qwen2.5-Omni is the flagship end-to-end multimodal model in the Qwen series. 
It processes diverse inputs including text, images, audio, and video, delivering real-time streaming responses through text generation and natural speech synthesis using a novel Thinker-Talker architecture.", "release_date": "2025-03-27", "announcement_date": "2025-03-27", "license_id": "apache_2_0", "multimodal": true, "knowledge_cutoff": null, "param_count": 7000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://chat.qwen.ai/", "source_paper": "https://arxiv.org/pdf/2503.20215", "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5-omni/", "source_repo_link": "https://github.com/QwenLM/Qwen2.5-Omni", "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-Omni-7B", "created_at": "2025-07-19T19:49:05.639433+00:00", "updated_at": "2025-07-19T19:49:05.639433+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen2.5-vl-32b/benchmarks.json ================================================ [ { "model_benchmark_id": 1704, "benchmark_id": "aitz-em", "model_id": "qwen2.5-vl-32b", "score": 0.831, "normalized_score": 0.831, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.791493+00:00", "updated_at": "2025-07-19T19:56:14.791493+00:00", "benchmark_name": "AITZ_EM" }, { "model_benchmark_id": 1707, "benchmark_id": "android-control-high-em", "model_id": "qwen2.5-vl-32b", "score": 0.696, "normalized_score": 0.696, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.798431+00:00", "updated_at": "2025-07-19T19:56:14.798431+00:00", "benchmark_name": "Android Control High_EM" }, { "model_benchmark_id": 1710, "benchmark_id": "android-control-low-em", "model_id": "qwen2.5-vl-32b", "score": 0.933, "normalized_score": 0.933, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.807428+00:00", "updated_at": "2025-07-19T19:56:14.807428+00:00", "benchmark_name": "Android Control Low_EM" }, { "model_benchmark_id": 1713, "benchmark_id": "androidworld-sr", "model_id": "qwen2.5-vl-32b", "score": 0.22, "normalized_score": 0.22, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "SR", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.815734+00:00", "updated_at": "2025-07-19T19:56:14.815734+00:00", "benchmark_name": "AndroidWorld_SR" }, { "model_benchmark_id": 1658, "benchmark_id": "cc-ocr", "model_id": "qwen2.5-vl-32b", "score": 0.771, "normalized_score": 0.771, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.659496+00:00", "updated_at": "2025-07-19T19:56:14.659496+00:00", "benchmark_name": "CC-OCR" }, { "model_benchmark_id": 1695, "benchmark_id": "charadessta", "model_id": "qwen2.5-vl-32b", "score": 0.542, 
"normalized_score": 0.542, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.765807+00:00", "updated_at": "2025-07-19T19:56:14.765807+00:00", "benchmark_name": "CharadesSTA" }, { "model_benchmark_id": 889, "benchmark_id": "docvqa", "model_id": "qwen2.5-vl-32b", "score": 0.948, "normalized_score": 0.948, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.850117+00:00", "updated_at": "2025-07-19T19:56:12.850117+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 1751, "benchmark_id": "gpqa", "model_id": "qwen2.5-vl-32b", "score": 0.46, "normalized_score": 0.46, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.953480+00:00", "updated_at": "2025-07-19T19:56:14.953480+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 791, "benchmark_id": "humaneval", "model_id": "qwen2.5-vl-32b", "score": 0.915, "normalized_score": 0.915, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:12.655022+00:00", "updated_at": "2025-07-19T19:56:12.655022+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 1243, "benchmark_id": "infovqa", "model_id": "qwen2.5-vl-32b", "score": 0.834, "normalized_score": 0.834, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.612560+00:00", "updated_at": "2025-07-19T19:56:13.612560+00:00", "benchmark_name": "InfoVQA" }, { "model_benchmark_id": 830, "benchmark_id": "lvbench", "model_id": "qwen2.5-vl-32b", "score": 0.49, "normalized_score": 0.49, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.733525+00:00", "updated_at": "2025-07-19T19:56:12.733525+00:00", "benchmark_name": "LVBench" }, { "model_benchmark_id": 410, "benchmark_id": "math", "model_id": "qwen2.5-vl-32b", "score": 0.822, "normalized_score": 0.822, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.873375+00:00", "updated_at": "2025-07-19T19:56:11.873375+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1678, "benchmark_id": "mathvision", "model_id": "qwen2.5-vl-32b", "score": 0.384, "normalized_score": 0.384, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-VL", 
"verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.707439+00:00", "updated_at": "2025-07-19T19:56:14.707439+00:00", "benchmark_name": "MathVision" }, { "model_benchmark_id": 1272, "benchmark_id": "mathvista-mini", "model_id": "qwen2.5-vl-32b", "score": 0.747, "normalized_score": 0.747, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.668155+00:00", "updated_at": "2025-07-19T19:56:13.668155+00:00", "benchmark_name": "MathVista-Mini" }, { "model_benchmark_id": 1191, "benchmark_id": "mbpp", "model_id": "qwen2.5-vl-32b", "score": 0.84, "normalized_score": 0.84, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.509907+00:00", "updated_at": "2025-07-19T19:56:13.509907+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1690, "benchmark_id": "mmbench-video", "model_id": "qwen2.5-vl-32b", "score": 0.0193, "normalized_score": 0.0193, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-VL", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.747059+00:00", "updated_at": "2025-07-19T19:56:14.747059+00:00", "benchmark_name": "MMBench-Video" }, { "model_benchmark_id": 92, 
"benchmark_id": "mmlu", "model_id": "qwen2.5-vl-32b", "score": 0.784, "normalized_score": 0.784, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.274441+00:00", "updated_at": "2025-07-19T19:56:11.274441+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 200, "benchmark_id": "mmlu-pro", "model_id": "qwen2.5-vl-32b", "score": 0.688, "normalized_score": 0.688, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.482355+00:00", "updated_at": "2025-07-19T19:56:11.482355+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 573, "benchmark_id": "mmmu", "model_id": "qwen2.5-vl-32b", "score": 0.7, "normalized_score": 0.7, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.179390+00:00", "updated_at": "2025-07-19T19:56:12.179390+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1536, "benchmark_id": "mmmu-pro", "model_id": "qwen2.5-vl-32b", "score": 0.495, "normalized_score": 0.495, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:14.299391+00:00", "updated_at": "2025-07-19T19:56:14.299391+00:00", "benchmark_name": "MMMU-Pro" }, { "model_benchmark_id": 1662, "benchmark_id": "mmstar", "model_id": "qwen2.5-vl-32b", "score": 0.695, "normalized_score": 0.695, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.668445+00:00", "updated_at": "2025-07-19T19:56:14.668445+00:00", "benchmark_name": "MMStar" }, { "model_benchmark_id": 1745, "benchmark_id": "ocrbench-v2-(en)", "model_id": "qwen2.5-vl-32b", "score": 0.572, "normalized_score": 0.572, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.930331+00:00", "updated_at": "2025-07-19T19:56:14.930331+00:00", "benchmark_name": "OCRBench-V2 (en)" }, { "model_benchmark_id": 1750, "benchmark_id": "ocrbench-v2-(zh)", "model_id": "qwen2.5-vl-32b", "score": 0.591, "normalized_score": 0.591, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.947420+00:00", "updated_at": "2025-07-19T19:56:14.947420+00:00", "benchmark_name": "OCRBench-V2 (zh)" }, { "model_benchmark_id": 1748, "benchmark_id": "osworld", "model_id": "qwen2.5-vl-32b", "score": 0.0592, "normalized_score": 0.0592, "is_self_reported": 
true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.939263+00:00", "updated_at": "2025-07-19T19:56:14.939263+00:00", "benchmark_name": "OSWorld" }, { "model_benchmark_id": 1698, "benchmark_id": "screenspot", "model_id": "qwen2.5-vl-32b", "score": 0.885, "normalized_score": 0.885, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.775538+00:00", "updated_at": "2025-07-19T19:56:14.775538+00:00", "benchmark_name": "ScreenSpot" }, { "model_benchmark_id": 1701, "benchmark_id": "screenspot-pro", "model_id": "qwen2.5-vl-32b", "score": 0.394, "normalized_score": 0.394, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.783897+00:00", "updated_at": "2025-07-19T19:56:14.783897+00:00", "benchmark_name": "ScreenSpot Pro" }, { "model_benchmark_id": 1683, "benchmark_id": "videomme-w-o-sub.", "model_id": "qwen2.5-vl-32b", "score": 0.705, "normalized_score": 0.705, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.722056+00:00", 
"updated_at": "2025-07-19T19:56:14.722056+00:00", "benchmark_name": "VideoMME w/o sub." }, { "model_benchmark_id": 1686, "benchmark_id": "videomme-w-sub.", "model_id": "qwen2.5-vl-32b", "score": 0.779, "normalized_score": 0.779, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.729388+00:00", "updated_at": "2025-07-19T19:56:14.729388+00:00", "benchmark_name": "VideoMME w sub." } ] ================================================ FILE: data/organizations/qwen/models/qwen2.5-vl-32b/model.json ================================================ { "model_id": "qwen2.5-vl-32b", "name": "Qwen2.5 VL 32B Instruct", "organization_id": "qwen", "fine_tuned_from_model_id": null, "description": "Qwen2.5-VL is a vision-language model from the Qwen family. 
Key enhancements include visual understanding (objects, text, charts, layouts), visual agent capabilities (tool use, computer/phone control), long video comprehension with event pinpointing, visual localization (bounding boxes/points), and structured output generation.", "release_date": "2025-02-28", "announcement_date": "2025-02-28", "license_id": "apache_2_0", "multimodal": true, "knowledge_cutoff": null, "param_count": 33500000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://chat.qwen.ai/", "source_paper": "https://arxiv.org/pdf/2502.13923", "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5-vl/", "source_repo_link": "https://github.com/QwenLM/Qwen2.5-VL", "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct", "created_at": "2025-07-19T19:49:05.653921+00:00", "updated_at": "2025-07-19T19:49:05.653921+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen2.5-vl-72b/benchmarks.json ================================================ [ { "model_benchmark_id": 1255, "benchmark_id": "ai2d", "model_id": "qwen2.5-vl-72b", "score": 0.884, "normalized_score": 0.884, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.635049+00:00", "updated_at": "2025-07-19T19:56:13.635049+00:00", "benchmark_name": "AI2D" }, { "model_benchmark_id": 1703, "benchmark_id": "aitz-em", "model_id": "qwen2.5-vl-72b", "score": 0.832, "normalized_score": 0.832, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.789425+00:00", "updated_at": "2025-07-19T19:56:14.789425+00:00", "benchmark_name": "AITZ_EM" }, { "model_benchmark_id": 1706, "benchmark_id": "android-control-high-em", "model_id": "qwen2.5-vl-72b", "score": 0.6736, "normalized_score": 0.6736, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.796411+00:00", "updated_at": "2025-07-19T19:56:14.796411+00:00", "benchmark_name": "Android Control High_EM" }, { "model_benchmark_id": 1709, "benchmark_id": "android-control-low-em", "model_id": "qwen2.5-vl-72b", "score": 0.937, "normalized_score": 0.937, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.805303+00:00", "updated_at": "2025-07-19T19:56:14.805303+00:00", "benchmark_name": "Android Control Low_EM" }, { "model_benchmark_id": 1712, "benchmark_id": "androidworld-sr", "model_id": "qwen2.5-vl-72b", "score": 0.35, "normalized_score": 0.35, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "SR", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.813492+00:00", "updated_at": "2025-07-19T19:56:14.813492+00:00", "benchmark_name": "AndroidWorld_SR" }, { "model_benchmark_id": 1657, "benchmark_id": "cc-ocr", 
"model_id": "qwen2.5-vl-72b", "score": 0.798, "normalized_score": 0.798, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.657333+00:00", "updated_at": "2025-07-19T19:56:14.657333+00:00", "benchmark_name": "CC-OCR" }, { "model_benchmark_id": 867, "benchmark_id": "chartqa", "model_id": "qwen2.5-vl-72b", "score": 0.895, "normalized_score": 0.895, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.811401+00:00", "updated_at": "2025-07-19T19:56:12.811401+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 888, "benchmark_id": "docvqa", "model_id": "qwen2.5-vl-72b", "score": 0.964, "normalized_score": 0.964, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.848273+00:00", "updated_at": "2025-07-19T19:56:12.848273+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 925, "benchmark_id": "egoschema", "model_id": "qwen2.5-vl-72b", "score": 0.762, "normalized_score": 0.762, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:12.933582+00:00", "updated_at": "2025-07-19T19:56:12.933582+00:00", "benchmark_name": "EgoSchema" }, { "model_benchmark_id": 1673, "benchmark_id": "hallusion-bench", "model_id": "qwen2.5-vl-72b", "score": 0.5516, "normalized_score": 0.5516, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.694733+00:00", "updated_at": "2025-07-19T19:56:14.694733+00:00", "benchmark_name": "Hallusion Bench" }, { "model_benchmark_id": 829, "benchmark_id": "lvbench", "model_id": "qwen2.5-vl-72b", "score": 0.473, "normalized_score": 0.473, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.731476+00:00", "updated_at": "2025-07-19T19:56:12.731476+00:00", "benchmark_name": "LVBench" }, { "model_benchmark_id": 1677, "benchmark_id": "mathvision", "model_id": "qwen2.5-vl-72b", "score": 0.381, "normalized_score": 0.381, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.705119+00:00", "updated_at": "2025-07-19T19:56:14.705119+00:00", "benchmark_name": "MathVision" }, { "model_benchmark_id": 1271, "benchmark_id": "mathvista-mini", "model_id": "qwen2.5-vl-72b", "score": 0.748, "normalized_score": 0.748, "is_self_reported": true, 
"self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.666379+00:00", "updated_at": "2025-07-19T19:56:13.666379+00:00", "benchmark_name": "MathVista-Mini" }, { "model_benchmark_id": 1746, "benchmark_id": "mlvu-m", "model_id": "qwen2.5-vl-72b", "score": 0.746, "normalized_score": 0.746, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.934328+00:00", "updated_at": "2025-07-19T19:56:14.934328+00:00", "benchmark_name": "MLVU-M" }, { "model_benchmark_id": 1512, "benchmark_id": "mmbench", "model_id": "qwen2.5-vl-72b", "score": 0.88, "normalized_score": 0.88, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.243543+00:00", "updated_at": "2025-07-19T19:56:14.243543+00:00", "benchmark_name": "MMBench" }, { "model_benchmark_id": 1689, "benchmark_id": "mmbench-video", "model_id": "qwen2.5-vl-72b", "score": 0.0202, "normalized_score": 0.0202, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-VL", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.744558+00:00", "updated_at": 
"2025-07-19T19:56:14.744558+00:00", "benchmark_name": "MMBench-Video" }, { "model_benchmark_id": 572, "benchmark_id": "mmmu", "model_id": "qwen2.5-vl-72b", "score": 0.702, "normalized_score": 0.702, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.177290+00:00", "updated_at": "2025-07-19T19:56:12.177290+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1535, "benchmark_id": "mmmu-pro", "model_id": "qwen2.5-vl-72b", "score": 0.511, "normalized_score": 0.511, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.297757+00:00", "updated_at": "2025-07-19T19:56:14.297757+00:00", "benchmark_name": "MMMU-Pro" }, { "model_benchmark_id": 1661, "benchmark_id": "mmstar", "model_id": "qwen2.5-vl-72b", "score": 0.708, "normalized_score": 0.708, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.666719+00:00", "updated_at": "2025-07-19T19:56:14.666719+00:00", "benchmark_name": "MMStar" }, { "model_benchmark_id": 1671, "benchmark_id": "mmvet", "model_id": "qwen2.5-vl-72b", "score": 0.7619, "normalized_score": 0.7619, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, 
"analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.688513+00:00", "updated_at": "2025-07-19T19:56:14.688513+00:00", "benchmark_name": "MMVet" }, { "model_benchmark_id": 1715, "benchmark_id": "mobileminiwob++-sr", "model_id": "qwen2.5-vl-72b", "score": 0.68, "normalized_score": 0.68, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "SR", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.820961+00:00", "updated_at": "2025-07-19T19:56:14.820961+00:00", "benchmark_name": "MobileMiniWob++_SR" }, { "model_benchmark_id": 1644, "benchmark_id": "mvbench", "model_id": "qwen2.5-vl-72b", "score": 0.704, "normalized_score": 0.704, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.623550+00:00", "updated_at": "2025-07-19T19:56:14.623550+00:00", "benchmark_name": "MVBench" }, { "model_benchmark_id": 1541, "benchmark_id": "ocrbench", "model_id": "qwen2.5-vl-72b", "score": 0.885, "normalized_score": 0.885, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.318110+00:00", "updated_at": "2025-07-19T19:56:14.318110+00:00", "benchmark_name": "OCRBench" }, { "model_benchmark_id": 1744, "benchmark_id": 
"ocrbench-v2-(en)", "model_id": "qwen2.5-vl-72b", "score": 0.615, "normalized_score": 0.615, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.928710+00:00", "updated_at": "2025-07-19T19:56:14.928710+00:00", "benchmark_name": "OCRBench-V2 (en)" }, { "model_benchmark_id": 1747, "benchmark_id": "osworld", "model_id": "qwen2.5-vl-72b", "score": 0.0883, "normalized_score": 0.0883, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.937610+00:00", "updated_at": "2025-07-19T19:56:14.937610+00:00", "benchmark_name": "OSWorld" }, { "model_benchmark_id": 1680, "benchmark_id": "perceptiontest", "model_id": "qwen2.5-vl-72b", "score": 0.732, "normalized_score": 0.732, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.713944+00:00", "updated_at": "2025-07-19T19:56:14.713944+00:00", "benchmark_name": "PerceptionTest" }, { "model_benchmark_id": 1697, "benchmark_id": "screenspot", "model_id": "qwen2.5-vl-72b", "score": 0.871, "normalized_score": 0.871, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": 
null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.773284+00:00", "updated_at": "2025-07-19T19:56:14.773284+00:00", "benchmark_name": "ScreenSpot" }, { "model_benchmark_id": 1700, "benchmark_id": "screenspot-pro", "model_id": "qwen2.5-vl-72b", "score": 0.436, "normalized_score": 0.436, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.780898+00:00", "updated_at": "2025-07-19T19:56:14.780898+00:00", "benchmark_name": "ScreenSpot Pro" }, { "model_benchmark_id": 1692, "benchmark_id": "tempcompass", "model_id": "qwen2.5-vl-72b", "score": 0.748, "normalized_score": 0.748, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.754032+00:00", "updated_at": "2025-07-19T19:56:14.754032+00:00", "benchmark_name": "TempCompass" }, { "model_benchmark_id": 1682, "benchmark_id": "videomme-w-o-sub.", "model_id": "qwen2.5-vl-72b", "score": 0.733, "normalized_score": 0.733, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.720259+00:00", "updated_at": "2025-07-19T19:56:14.720259+00:00", "benchmark_name": "VideoMME w/o sub." 
} ] ================================================ FILE: data/organizations/qwen/models/qwen2.5-vl-72b/model.json ================================================ { "model_id": "qwen2.5-vl-72b", "name": "Qwen2.5 VL 72B Instruct", "organization_id": "qwen", "fine_tuned_from_model_id": null, "description": "Qwen2.5-VL is the new flagship vision-language model of Qwen, significantly improved from Qwen2-VL. It excels at recognizing objects, analyzing text/charts/layouts in images, acting as a visual agent, understanding long videos (over 1 hour) with event pinpointing, performing visual localization (bounding boxes/points), and generating structured outputs from documents.", "release_date": "2025-01-26", "announcement_date": "2025-01-26", "license_id": "tongyi_qianwen", "multimodal": true, "knowledge_cutoff": null, "param_count": 72000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://chat.qwen.ai/", "source_paper": "https://arxiv.org/pdf/2502.13923", "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5-vl/", "source_repo_link": "https://github.com/QwenLM/Qwen2.5-VL", "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct", "created_at": "2025-07-19T19:49:05.647509+00:00", "updated_at": "2025-07-19T19:49:05.647509+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen2.5-vl-7b/benchmarks.json ================================================ [ { "model_benchmark_id": 1702, "benchmark_id": "aitz-em", "model_id": "qwen2.5-vl-7b", "score": 0.819, "normalized_score": 0.819, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:14.787781+00:00", "updated_at": "2025-07-19T19:56:14.787781+00:00", "benchmark_name": "AITZ_EM" }, { "model_benchmark_id": 1705, "benchmark_id": "android-control-high-em", "model_id": "qwen2.5-vl-7b", "score": 0.601, "normalized_score": 0.601, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.794879+00:00", "updated_at": "2025-07-19T19:56:14.794879+00:00", "benchmark_name": "Android Control High_EM" }, { "model_benchmark_id": 1708, "benchmark_id": "android-control-low-em", "model_id": "qwen2.5-vl-7b", "score": 0.914, "normalized_score": 0.914, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-VL", "verified_by_llmstats": false, "analysis_method": "EM", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.803305+00:00", "updated_at": "2025-07-19T19:56:14.803305+00:00", "benchmark_name": "Android Control Low_EM" }, { "model_benchmark_id": 1711, "benchmark_id": "androidworld-sr", "model_id": "qwen2.5-vl-7b", "score": 0.255, "normalized_score": 0.255, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "SR", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.811782+00:00", "updated_at": "2025-07-19T19:56:14.811782+00:00", "benchmark_name": "AndroidWorld_SR" }, { "model_benchmark_id": 1656, "benchmark_id": "cc-ocr", "model_id": "qwen2.5-vl-7b", "score": 0.778, "normalized_score": 0.778, "is_self_reported": true, "self_reported_source_link": 
"https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.655251+00:00", "updated_at": "2025-07-19T19:56:14.655251+00:00", "benchmark_name": "CC-OCR" }, { "model_benchmark_id": 1694, "benchmark_id": "charadessta", "model_id": "qwen2.5-vl-7b", "score": 0.436, "normalized_score": 0.436, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "mIoU", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.763802+00:00", "updated_at": "2025-07-19T19:56:14.763802+00:00", "benchmark_name": "CharadesSTA" }, { "model_benchmark_id": 865, "benchmark_id": "chartqa", "model_id": "qwen2.5-vl-7b", "score": 0.873, "normalized_score": 0.873, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.808329+00:00", "updated_at": "2025-07-19T19:56:12.808329+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 886, "benchmark_id": "docvqa", "model_id": "qwen2.5-vl-7b", "score": 0.957, "normalized_score": 0.957, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.844347+00:00", "updated_at": "2025-07-19T19:56:12.844347+00:00", "benchmark_name": 
"DocVQA" }, { "model_benchmark_id": 1672, "benchmark_id": "hallusion-bench", "model_id": "qwen2.5-vl-7b", "score": 0.529, "normalized_score": 0.529, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.693096+00:00", "updated_at": "2025-07-19T19:56:14.693096+00:00", "benchmark_name": "Hallusion Bench" }, { "model_benchmark_id": 1242, "benchmark_id": "infovqa", "model_id": "qwen2.5-vl-7b", "score": 0.826, "normalized_score": 0.826, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.610945+00:00", "updated_at": "2025-07-19T19:56:13.610945+00:00", "benchmark_name": "InfoVQA" }, { "model_benchmark_id": 1687, "benchmark_id": "longvideobench", "model_id": "qwen2.5-vl-7b", "score": 0.547, "normalized_score": 0.547, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.737450+00:00", "updated_at": "2025-07-19T19:56:14.737450+00:00", "benchmark_name": "LongVideoBench" }, { "model_benchmark_id": 828, "benchmark_id": "lvbench", "model_id": "qwen2.5-vl-7b", "score": 0.453, "normalized_score": 0.453, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.729778+00:00", "updated_at": "2025-07-19T19:56:12.729778+00:00", "benchmark_name": "LVBench" }, { "model_benchmark_id": 1674, "benchmark_id": "mathvision", "model_id": "qwen2.5-vl-7b", "score": 0.2507, "normalized_score": 0.2507, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.698748+00:00", "updated_at": "2025-07-19T19:56:14.698748+00:00", "benchmark_name": "MathVision" }, { "model_benchmark_id": 1270, "benchmark_id": "mathvista-mini", "model_id": "qwen2.5-vl-7b", "score": 0.682, "normalized_score": 0.682, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.664381+00:00", "updated_at": "2025-07-19T19:56:13.664381+00:00", "benchmark_name": "MathVista-Mini" }, { "model_benchmark_id": 1693, "benchmark_id": "mlvu", "model_id": "qwen2.5-vl-7b", "score": 0.702, "normalized_score": 0.702, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.758833+00:00", "updated_at": "2025-07-19T19:56:14.758833+00:00", "benchmark_name": "MLVU" }, { "model_benchmark_id": 1511, "benchmark_id": "mmbench", "model_id": "qwen2.5-vl-7b", 
"score": 0.843, "normalized_score": 0.843, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-VL", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.241869+00:00", "updated_at": "2025-07-19T19:56:14.241869+00:00", "benchmark_name": "MMBench" }, { "model_benchmark_id": 1688, "benchmark_id": "mmbench-video", "model_id": "qwen2.5-vl-7b", "score": 0.0179, "normalized_score": 0.0179, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-VL", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.742467+00:00", "updated_at": "2025-07-19T19:56:14.742467+00:00", "benchmark_name": "MMBench-Video" }, { "model_benchmark_id": 569, "benchmark_id": "mmmu", "model_id": "qwen2.5-vl-7b", "score": 0.586, "normalized_score": 0.586, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.170987+00:00", "updated_at": "2025-07-19T19:56:12.170987+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1533, "benchmark_id": "mmmu-pro", "model_id": "qwen2.5-vl-7b", "score": 0.383, "normalized_score": 0.383, "is_self_reported": true, "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-VL", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.294582+00:00", 
"updated_at": "2025-07-19T19:56:14.294582+00:00", "benchmark_name": "MMMU-Pro" }, { "model_benchmark_id": 1659, "benchmark_id": "mmstar", "model_id": "qwen2.5-vl-7b", "score": 0.639, "normalized_score": 0.639, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.662888+00:00", "updated_at": "2025-07-19T19:56:14.662888+00:00", "benchmark_name": "MMStar" }, { "model_benchmark_id": 1666, "benchmark_id": "mmt-bench", "model_id": "qwen2.5-vl-7b", "score": 0.636, "normalized_score": 0.636, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.676869+00:00", "updated_at": "2025-07-19T19:56:14.676869+00:00", "benchmark_name": "MMT-Bench" }, { "model_benchmark_id": 1670, "benchmark_id": "mmvet", "model_id": "qwen2.5-vl-7b", "score": 0.671, "normalized_score": 0.671, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.687023+00:00", "updated_at": "2025-07-19T19:56:14.687023+00:00", "benchmark_name": "MMVet" }, { "model_benchmark_id": 1714, "benchmark_id": "mobileminiwob++-sr", "model_id": "qwen2.5-vl-7b", "score": 0.914, "normalized_score": 0.914, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": 
false, "analysis_method": "SR", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.819401+00:00", "updated_at": "2025-07-19T19:56:14.819401+00:00", "benchmark_name": "MobileMiniWob++_SR" }, { "model_benchmark_id": 1642, "benchmark_id": "mvbench", "model_id": "qwen2.5-vl-7b", "score": 0.696, "normalized_score": 0.696, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.620310+00:00", "updated_at": "2025-07-19T19:56:14.620310+00:00", "benchmark_name": "MVBench" }, { "model_benchmark_id": 1540, "benchmark_id": "ocrbench", "model_id": "qwen2.5-vl-7b", "score": 0.864, "normalized_score": 0.864, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.315649+00:00", "updated_at": "2025-07-19T19:56:14.315649+00:00", "benchmark_name": "OCRBench" }, { "model_benchmark_id": 1679, "benchmark_id": "perceptiontest", "model_id": "qwen2.5-vl-7b", "score": 0.705, "normalized_score": 0.705, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.712010+00:00", "updated_at": "2025-07-19T19:56:14.712010+00:00", "benchmark_name": "PerceptionTest" }, { "model_benchmark_id": 1696, "benchmark_id": 
"screenspot", "model_id": "qwen2.5-vl-7b", "score": 0.847, "normalized_score": 0.847, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.771516+00:00", "updated_at": "2025-07-19T19:56:14.771516+00:00", "benchmark_name": "ScreenSpot" }, { "model_benchmark_id": 1699, "benchmark_id": "screenspot-pro", "model_id": "qwen2.5-vl-7b", "score": 0.29, "normalized_score": 0.29, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.779312+00:00", "updated_at": "2025-07-19T19:56:14.779312+00:00", "benchmark_name": "ScreenSpot Pro" }, { "model_benchmark_id": 1691, "benchmark_id": "tempcompass", "model_id": "qwen2.5-vl-7b", "score": 0.717, "normalized_score": 0.717, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.752008+00:00", "updated_at": "2025-07-19T19:56:14.752008+00:00", "benchmark_name": "TempCompass" }, { "model_benchmark_id": 910, "benchmark_id": "textvqa", "model_id": "qwen2.5-vl-7b", "score": 0.849, "normalized_score": 0.849, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.896871+00:00", "updated_at": "2025-07-19T19:56:12.896871+00:00", "benchmark_name": "TextVQA" }, { "model_benchmark_id": 1681, "benchmark_id": "videomme-w-o-sub.", "model_id": "qwen2.5-vl-7b", "score": 0.651, "normalized_score": 0.651, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.718319+00:00", "updated_at": "2025-07-19T19:56:14.718319+00:00", "benchmark_name": "VideoMME w/o sub." }, { "model_benchmark_id": 1684, "benchmark_id": "videomme-w-sub.", "model_id": "qwen2.5-vl-7b", "score": 0.716, "normalized_score": 0.716, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.726358+00:00", "updated_at": "2025-07-19T19:56:14.726358+00:00", "benchmark_name": "VideoMME w sub." } ] ================================================ FILE: data/organizations/qwen/models/qwen2.5-vl-7b/model.json ================================================ { "model_id": "qwen2.5-vl-7b", "name": "Qwen2.5 VL 7B Instruct", "organization_id": "qwen", "fine_tuned_from_model_id": null, "description": "Qwen2.5-VL is a vision-language model from the Qwen family. 
Key enhancements include visual understanding (objects, text, charts, layouts), visual agent capabilities (tool use, computer/phone control), long video comprehension with event pinpointing, visual localization (bounding boxes/points), and structured output generation.", "release_date": "2025-01-26", "announcement_date": "2025-01-26", "license_id": "apache_2_0", "multimodal": true, "knowledge_cutoff": null, "param_count": 8290000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://chat.qwen.ai/", "source_paper": "https://arxiv.org/pdf/2502.13923", "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5-vl/", "source_repo_link": "https://github.com/QwenLM/Qwen2.5-VL", "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "created_at": "2025-07-19T19:49:05.635630+00:00", "updated_at": "2025-07-19T19:49:05.635630+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen3-235b-a22b/benchmarks.json ================================================ [ { "model_benchmark_id": 1626, "benchmark_id": "aider", "model_id": "qwen3-235b-a22b", "score": 0.618, "normalized_score": 0.618, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Pass@2", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.572970+00:00", "updated_at": "2025-07-19T19:56:14.572970+00:00", "benchmark_name": "Aider" }, { "model_benchmark_id": 454, "benchmark_id": "aime-2024", "model_id": "qwen3-235b-a22b", "score": 0.857, "normalized_score": 0.857, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Pass@64", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.963641+00:00", "updated_at": "2025-07-19T19:56:11.963641+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 690, "benchmark_id": "aime-2025", "model_id": "qwen3-235b-a22b", "score": 0.815, "normalized_score": 0.815, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Pass@64", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.447678+00:00", "updated_at": "2025-07-19T19:56:12.447678+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1452, "benchmark_id": "arena-hard", "model_id": "qwen3-235b-a22b", "score": 0.956, "normalized_score": 0.956, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.095282+00:00", "updated_at": "2025-07-19T19:56:14.095282+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 972, "benchmark_id": "bbh", "model_id": "qwen3-235b-a22b", "score": 0.8887, "normalized_score": 0.8887, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.043683+00:00", "updated_at": "2025-07-19T19:56:13.043683+00:00", "benchmark_name": "BBH" }, { "model_benchmark_id": 851, "benchmark_id": "bfcl", "model_id": "qwen3-235b-a22b", "score": 0.708, "normalized_score": 0.708, "is_self_reported": true, 
"self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "v3", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.780457+00:00", "updated_at": "2025-07-19T19:56:12.780457+00:00", "benchmark_name": "BFCL" }, { "model_benchmark_id": 1648, "benchmark_id": "crux-o", "model_id": "qwen3-235b-a22b", "score": 0.79, "normalized_score": 0.79, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.637715+00:00", "updated_at": "2025-07-19T19:56:14.637715+00:00", "benchmark_name": "CRUX-O" }, { "model_benchmark_id": 371, "benchmark_id": "evalplus", "model_id": "qwen3-235b-a22b", "score": 0.776, "normalized_score": 0.776, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.801301+00:00", "updated_at": "2025-07-19T19:56:11.801301+00:00", "benchmark_name": "EvalPlus" }, { "model_benchmark_id": 302, "benchmark_id": "gpqa", "model_id": "qwen3-235b-a22b", "score": 0.4747, "normalized_score": 0.4747, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.679464+00:00", "updated_at": "2025-07-19T19:56:11.679464+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 
995, "benchmark_id": "gsm8k", "model_id": "qwen3-235b-a22b", "score": 0.9439, "normalized_score": 0.9439, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.083824+00:00", "updated_at": "2025-07-19T19:56:13.083824+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 1308, "benchmark_id": "include", "model_id": "qwen3-235b-a22b", "score": 0.7346, "normalized_score": 0.7346, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.737543+00:00", "updated_at": "2025-07-19T19:56:13.737543+00:00", "benchmark_name": "Include" }, { "model_benchmark_id": 749, "benchmark_id": "livebench", "model_id": "qwen3-235b-a22b", "score": 0.771, "normalized_score": 0.771, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.575629+00:00", "updated_at": "2025-07-19T19:56:12.575629+00:00", "benchmark_name": "LiveBench" }, { "model_benchmark_id": 1123, "benchmark_id": "livecodebench", "model_id": "qwen3-235b-a22b", "score": 0.707, "normalized_score": 0.707, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "v5", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, 
"created_at": "2025-07-19T19:56:13.344206+00:00", "updated_at": "2025-07-19T19:56:13.344206+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 405, "benchmark_id": "math", "model_id": "qwen3-235b-a22b", "score": 0.7184, "normalized_score": 0.7184, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.863985+00:00", "updated_at": "2025-07-19T19:56:11.863985+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 1186, "benchmark_id": "mbpp", "model_id": "qwen3-235b-a22b", "score": 0.814, "normalized_score": 0.814, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.500617+00:00", "updated_at": "2025-07-19T19:56:13.500617+00:00", "benchmark_name": "MBPP" }, { "model_benchmark_id": 1289, "benchmark_id": "mgsm", "model_id": "qwen3-235b-a22b", "score": 0.8353, "normalized_score": 0.8353, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.700097+00:00", "updated_at": "2025-07-19T19:56:13.700097+00:00", "benchmark_name": "MGSM" }, { "model_benchmark_id": 90, "benchmark_id": "mmlu", "model_id": "qwen3-235b-a22b", "score": 0.8781, "normalized_score": 0.8781, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, 
"analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.270963+00:00", "updated_at": "2025-07-19T19:56:11.270963+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 195, "benchmark_id": "mmlu-pro", "model_id": "qwen3-235b-a22b", "score": 0.6818, "normalized_score": 0.6818, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.472627+00:00", "updated_at": "2025-07-19T19:56:11.472627+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 732, "benchmark_id": "mmlu-redux", "model_id": "qwen3-235b-a22b", "score": 0.874, "normalized_score": 0.874, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.540685+00:00", "updated_at": "2025-07-19T19:56:12.540685+00:00", "benchmark_name": "MMLU-Redux" }, { "model_benchmark_id": 1477, "benchmark_id": "mmmlu", "model_id": "qwen3-235b-a22b", "score": 0.867, "normalized_score": 0.867, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.150792+00:00", "updated_at": "2025-07-19T19:56:14.150792+00:00", "benchmark_name": "MMMLU" }, { "model_benchmark_id": 1647, "benchmark_id": "multilf", "model_id": "qwen3-235b-a22b", "score": 
0.719, "normalized_score": 0.719, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.633963+00:00", "updated_at": "2025-07-19T19:56:14.633963+00:00", "benchmark_name": "MultiLF" }, { "model_benchmark_id": 643, "benchmark_id": "multipl-e", "model_id": "qwen3-235b-a22b", "score": 0.6594, "normalized_score": 0.6594, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.320821+00:00", "updated_at": "2025-07-19T19:56:12.320821+00:00", "benchmark_name": "MultiPL-E" }, { "model_benchmark_id": 366, "benchmark_id": "supergpqa", "model_id": "qwen3-235b-a22b", "score": 0.4406, "normalized_score": 0.4406, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.784624+00:00", "updated_at": "2025-07-19T19:56:11.784624+00:00", "benchmark_name": "SuperGPQA" } ] ================================================ FILE: data/organizations/qwen/models/qwen3-235b-a22b/model.json ================================================ { "model_id": "qwen3-235b-a22b", "name": "Qwen3 235B A22B", "organization_id": "qwen", "fine_tuned_from_model_id": null, "description": "Qwen3 235B A22B is a large language model developed by Alibaba, featuring a Mixture-of-Experts (MoE) architecture with 235 billion total parameters and 22 billion activated 
parameters. It achieves competitive results in benchmark evaluations of coding, math, general capabilities, and more, compared to other top-tier models.", "release_date": "2025-04-29", "announcement_date": "2025-04-29", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 235000000000, "training_tokens": 36000000000000, "available_in_zeroeval": true, "source_api_ref": "https://qwenlm.github.io/blog/qwen3/", "source_playground": "https://chat.qwen.ai/", "source_paper": null, "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/QwenLM/Qwen3", "source_weights_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B", "created_at": "2025-07-19T19:49:05.624683+00:00", "updated_at": "2025-07-19T19:49:05.624683+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen3-235b-a22b-instruct-2507/benchmarks.json ================================================ [ { "model_benchmark_id": 15972, "benchmark_id": "aider-polyglot", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.573, "normalized_score": 0.573, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.609026+00:00", "updated_at": "2025-08-03T22:06:13.609026+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 15973, "benchmark_id": "aime-2025", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.703, "normalized_score": 0.703, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-08-03T22:06:13.611021+00:00", "updated_at": "2025-08-03T22:06:13.611021+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 15974, "benchmark_id": "arc-agi", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.418, "normalized_score": 0.418, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.618116+00:00", "updated_at": "2025-08-03T22:06:13.618116+00:00", "benchmark_name": "ARC-AGI" }, { "model_benchmark_id": 15975, "benchmark_id": "arena-hard-v2", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.792, "normalized_score": 0.792, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Win Rate", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.620187+00:00", "updated_at": "2025-08-03T22:06:13.620187+00:00", "benchmark_name": "Arena-Hard v2" }, { "model_benchmark_id": 15976, "benchmark_id": "bfcl-v3", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.709, "normalized_score": 0.709, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.622144+00:00", "updated_at": "2025-08-03T22:06:13.622144+00:00", "benchmark_name": "BFCL-v3" }, { "model_benchmark_id": 15977, "benchmark_id": "creative-writing-v3", "model_id": 
"qwen3-235b-a22b-instruct-2507", "score": 0.875, "normalized_score": 0.875, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.626065+00:00", "updated_at": "2025-08-03T22:06:13.626065+00:00", "benchmark_name": "Creative Writing v3" }, { "model_benchmark_id": 15978, "benchmark_id": "csimpleqa", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.843, "normalized_score": 0.843, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.629696+00:00", "updated_at": "2025-08-03T22:06:13.629696+00:00", "benchmark_name": "CSimpleQA" }, { "model_benchmark_id": 15979, "benchmark_id": "gpqa", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.775, "normalized_score": 0.775, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.631769+00:00", "updated_at": "2025-08-03T22:06:13.631769+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 15980, "benchmark_id": "hmmt25", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.554, "normalized_score": 0.554, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.633387+00:00", "updated_at": "2025-08-03T22:06:13.633387+00:00", "benchmark_name": "HMMT25" }, { "model_benchmark_id": 15981, "benchmark_id": "ifeval", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.887, "normalized_score": 0.887, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.635001+00:00", "updated_at": "2025-08-03T22:06:13.635001+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 15982, "benchmark_id": "include", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.795, "normalized_score": 0.795, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.636605+00:00", "updated_at": "2025-08-03T22:06:13.636605+00:00", "benchmark_name": "INCLUDE" }, { "model_benchmark_id": 15983, "benchmark_id": "livebench-20241125", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.754, "normalized_score": 0.754, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.638166+00:00", "updated_at": "2025-08-03T22:06:13.638166+00:00", "benchmark_name": "LiveBench 20241125" }, { 
"model_benchmark_id": 15984, "benchmark_id": "livecodebench-v6", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.518, "normalized_score": 0.518, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.639661+00:00", "updated_at": "2025-08-03T22:06:13.639661+00:00", "benchmark_name": "LiveCodeBench v6" }, { "model_benchmark_id": 15985, "benchmark_id": "mmlu-pro", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.83, "normalized_score": 0.83, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.641236+00:00", "updated_at": "2025-08-03T22:06:13.641236+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 15986, "benchmark_id": "mmlu-prox", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.794, "normalized_score": 0.794, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.642908+00:00", "updated_at": "2025-08-03T22:06:13.642908+00:00", "benchmark_name": "MMLU-ProX" }, { "model_benchmark_id": 15987, "benchmark_id": "mmlu-redux", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.931, "normalized_score": 0.931, "is_self_reported": true, "self_reported_source_link": 
"https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.644630+00:00", "updated_at": "2025-08-03T22:06:13.644630+00:00", "benchmark_name": "MMLU-Redux" }, { "model_benchmark_id": 15988, "benchmark_id": "multi-if", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.775, "normalized_score": 0.775, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.646355+00:00", "updated_at": "2025-08-03T22:06:13.646355+00:00", "benchmark_name": "Multi-IF" }, { "model_benchmark_id": 15989, "benchmark_id": "multipl-e", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.879, "normalized_score": 0.879, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Score", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.648211+00:00", "updated_at": "2025-08-03T22:06:13.648211+00:00", "benchmark_name": "MultiPL-E" }, { "model_benchmark_id": 15990, "benchmark_id": "polymath", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.502, "normalized_score": 0.502, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-08-03T22:06:13.649756+00:00", "updated_at": "2025-08-03T22:06:13.649756+00:00", "benchmark_name": "PolyMATH" }, { "model_benchmark_id": 15991, "benchmark_id": "simpleqa", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.543, "normalized_score": 0.543, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.651445+00:00", "updated_at": "2025-08-03T22:06:13.651445+00:00", "benchmark_name": "SimpleQA" }, { "model_benchmark_id": 15992, "benchmark_id": "supergpqa", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.626, "normalized_score": 0.626, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.652980+00:00", "updated_at": "2025-08-03T22:06:13.652980+00:00", "benchmark_name": "SuperGPQA" }, { "model_benchmark_id": 15993, "benchmark_id": "tau2-airline", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.44, "normalized_score": 0.44, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.654737+00:00", "updated_at": "2025-08-03T22:06:13.654737+00:00", "benchmark_name": "Tau2 airline" }, { "model_benchmark_id": 15994, "benchmark_id": "tau2-retail", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.713, "normalized_score": 
0.713, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.656359+00:00", "updated_at": "2025-08-03T22:06:13.656359+00:00", "benchmark_name": "Tau2 retail" }, { "model_benchmark_id": 15995, "benchmark_id": "writingbench", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.852, "normalized_score": 0.852, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.657968+00:00", "updated_at": "2025-08-03T22:06:13.657968+00:00", "benchmark_name": "WritingBench" }, { "model_benchmark_id": 15996, "benchmark_id": "zebralogic", "model_id": "qwen3-235b-a22b-instruct-2507", "score": 0.95, "normalized_score": 0.95, "is_self_reported": true, "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-08-03T22:06:13.659618+00:00", "updated_at": "2025-08-03T22:06:13.659618+00:00", "benchmark_name": "ZebraLogic" } ] ================================================ FILE: data/organizations/qwen/models/qwen3-235b-a22b-instruct-2507/model.json ================================================ { "model_id": "qwen3-235b-a22b-instruct-2507", "name": "Qwen3-235B-A22B-Instruct-2507", "organization_id": "qwen", "fine_tuned_from_model_id": null, "description": "Qwen3-235B-A22B-Instruct-2507 is the updated instruct 
version of Qwen3-235B-A22B featuring significant improvements in general capabilities including instruction following, logical reasoning, text comprehension, mathematics, science, coding and tool usage. It provides substantial gains in long-tail knowledge coverage across multiple languages and markedly better alignment with user preferences in subjective and open-ended tasks.", "release_date": "2025-07-22", "announcement_date": "2025-07-22", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 235000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://qwenlm.github.io/blog/qwen3/", "source_playground": "https://chat.qwen.ai/", "source_paper": "https://arxiv.org/abs/2505.09388", "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen3/", "source_repo_link": "https://github.com/QwenLM/Qwen3", "source_weights_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507", "created_at": "2025-08-03T22:06:11.701778+00:00", "updated_at": "2025-08-03T22:06:11.701778+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen3-235b-a22b-thinking-2507/benchmarks.json ================================================ [ { "model_benchmark_id": 9101, "benchmark_id": "mmlu-pro", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.844, "normalized_score": 0.844, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 9102, "benchmark_id": "mmlu-redux", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.938, "normalized_score": 0.938, 
"is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MMLU-Redux" }, { "model_benchmark_id": 9103, "benchmark_id": "gpqa", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.811, "normalized_score": 0.811, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 9104, "benchmark_id": "supergpqa", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.649, "normalized_score": 0.649, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "SuperGPQA" }, { "model_benchmark_id": 9105, "benchmark_id": "aime-2025", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.923, "normalized_score": 0.923, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 9106, "benchmark_id": "hmmt25", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.839, "normalized_score": 0.839, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "HMMT25" }, { "model_benchmark_id": 9107, "benchmark_id": "livebench-20241125", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.784, "normalized_score": 0.784, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "LiveBench 20241125" }, { "model_benchmark_id": 9108, "benchmark_id": "humanity's-last-exam", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.182, "normalized_score": 0.182, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": "text-only subset", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Score refers to text-only subset as model is not multi-modal", "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "HLE" }, { "model_benchmark_id": 9109, "benchmark_id": "livecodebench-v6", "model_id": "qwen3-235b-a22b-thinking-2507", 
"score": 0.741, "normalized_score": 0.741, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": "25.02-25.05", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "LiveCodeBench v6" }, { "model_benchmark_id": 9110, "benchmark_id": "cfeval", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 2134, "normalized_score": 0.2134, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Raw score: 2134", "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "CFEval" }, { "model_benchmark_id": 9111, "benchmark_id": "ojbench", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.325, "normalized_score": 0.325, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "OJBench" }, { "model_benchmark_id": 9112, "benchmark_id": "ifeval", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.878, "normalized_score": 0.878, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 9113, "benchmark_id": "arena-hard-v2", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.797, "normalized_score": 0.797, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": "GPT-4 evaluated win rates", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Arena-Hard v2" }, { "model_benchmark_id": 9114, "benchmark_id": "creative-writing-v3", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.861, "normalized_score": 0.861, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Creative Writing v3" }, { "model_benchmark_id": 9115, "benchmark_id": "writingbench", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.883, "normalized_score": 0.883, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "WritingBench" }, { "model_benchmark_id": 9116, "benchmark_id": "bfcl-v3", "model_id": "qwen3-235b-a22b-thinking-2507", 
"score": 0.719, "normalized_score": 0.719, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "BFCL-v3" }, { "model_benchmark_id": 9117, "benchmark_id": "tau-bench-retail", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.678, "normalized_score": 0.678, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "TAU1-Retail", "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "TAU1-Retail" }, { "model_benchmark_id": 9118, "benchmark_id": "tau-bench-airline", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.46, "normalized_score": 0.46, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "TAU1-Airline", "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "TAU1-Airline" }, { "model_benchmark_id": 9119, "benchmark_id": "tau2-retail", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.719, "normalized_score": 0.719, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "TAU2-Retail" }, { "model_benchmark_id": 9120, "benchmark_id": "tau2-airline", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.58, "normalized_score": 0.58, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "TAU2-Airline" }, { "model_benchmark_id": 9121, "benchmark_id": "tau2-telecom", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.456, "normalized_score": 0.456, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "TAU2-Telecom" }, { "model_benchmark_id": 9122, "benchmark_id": "multi-if", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.806, "normalized_score": 0.806, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "MultiIF", "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MultiIF" }, { "model_benchmark_id": 9123, "benchmark_id": "mmlu-prox", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 
0.81, "normalized_score": 0.81, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "MMLU-ProX", "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MMLU-ProX" }, { "model_benchmark_id": 9124, "benchmark_id": "include", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.81, "normalized_score": 0.81, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "INCLUDE" }, { "model_benchmark_id": 9125, "benchmark_id": "polymath", "model_id": "qwen3-235b-a22b-thinking-2507", "score": 0.601, "normalized_score": 0.601, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "PolyMATH" } ] ================================================ FILE: data/organizations/qwen/models/qwen3-235b-a22b-thinking-2507/model.json ================================================ { "model_id": "qwen3-235b-a22b-thinking-2507", "name": "Qwen3-235B-A22B-Thinking-2507", "organization_id": "qwen", "model_family_id": null, "fine_tuned_from_model_id": "qwen3-235b-a22b", "description": "Qwen3-235B-A22B-Thinking-2507 is a state-of-the-art 
thinking-enabled Mixture-of-Experts (MoE) model with 235B total parameters (22B activated). It features 94 layers, 128 experts (8 activated), and supports 262K native context length. This version delivers significantly improved reasoning performance, achieving state-of-the-art results among open-source thinking models on logical reasoning, mathematics, science, coding, and academic benchmarks. Key enhancements include markedly better general capabilities (instruction following, tool usage, text generation), enhanced 256K long-context understanding, and increased thinking depth. The model supports only thinking mode with automatic tag inclusion.", "release_date": "2025-07-25", "announcement_date": "2025-07-25", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 235000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507", "source_playground": "https://chat.qwen.ai/", "source_paper": null, "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen3-thinking/", "source_repo_link": "https://github.com/QwenLM/Qwen3", "source_weights_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507", "created_at": "2025-07-25T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00" } ================================================ FILE: data/organizations/qwen/models/qwen3-30b-a3b/benchmarks.json ================================================ [ { "model_benchmark_id": 455, "benchmark_id": "aime-2024", "model_id": "qwen3-30b-a3b", "score": 0.804, "normalized_score": 0.804, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.965575+00:00", "updated_at": 
"2025-07-19T19:56:11.965575+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 691, "benchmark_id": "aime-2025", "model_id": "qwen3-30b-a3b", "score": 0.709, "normalized_score": 0.709, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.449947+00:00", "updated_at": "2025-07-19T19:56:12.449947+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1454, "benchmark_id": "arena-hard", "model_id": "qwen3-30b-a3b", "score": 0.91, "normalized_score": 0.91, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.098594+00:00", "updated_at": "2025-07-19T19:56:14.098594+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 852, "benchmark_id": "bfcl", "model_id": "qwen3-30b-a3b", "score": 0.691, "normalized_score": 0.691, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "v3", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.782049+00:00", "updated_at": "2025-07-19T19:56:12.782049+00:00", "benchmark_name": "BFCL" }, { "model_benchmark_id": 304, "benchmark_id": "gpqa", "model_id": "qwen3-30b-a3b", "score": 0.658, "normalized_score": 0.658, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, 
"verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.682771+00:00", "updated_at": "2025-07-19T19:56:11.682771+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 751, "benchmark_id": "livebench", "model_id": "qwen3-30b-a3b", "score": 0.743, "normalized_score": 0.743, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.579527+00:00", "updated_at": "2025-07-19T19:56:12.579527+00:00", "benchmark_name": "LiveBench" }, { "model_benchmark_id": 1125, "benchmark_id": "livecodebench", "model_id": "qwen3-30b-a3b", "score": 0.626, "normalized_score": 0.626, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "v5", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.349221+00:00", "updated_at": "2025-07-19T19:56:13.349221+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 1649, "benchmark_id": "multi-if", "model_id": "qwen3-30b-a3b", "score": 0.722, "normalized_score": 0.722, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.641584+00:00", "updated_at": "2025-07-19T19:56:14.641584+00:00", "benchmark_name": "Multi-IF" } ] ================================================ FILE: data/organizations/qwen/models/qwen3-30b-a3b/model.json ================================================ { 
"model_id": "qwen3-30b-a3b", "name": "Qwen3 30B A3B", "organization_id": "qwen", "fine_tuned_from_model_id": null, "description": "Qwen3-30B-A3B is a smaller Mixture-of-Experts (MoE) model from the Qwen3 series by Alibaba, with 30.5 billion total parameters and 3.3 billion activated parameters. Features hybrid thinking/non-thinking modes, support for 119 languages, and enhanced agent capabilities. It aims to outperform previous models like QwQ-32B while using significantly fewer activated parameters.", "release_date": "2025-04-29", "announcement_date": "2025-04-29", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 30500000000, "training_tokens": 36000000000000, "available_in_zeroeval": true, "source_api_ref": "https://qwenlm.github.io/blog/qwen3/", "source_playground": "https://chat.qwen.ai/", "source_paper": null, "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen3/", "source_repo_link": "https://github.com/QwenLM/Qwen3", "source_weights_link": "https://huggingface.co/Qwen/Qwen3-30B-A3B", "created_at": "2025-07-19T19:49:05.631206+00:00", "updated_at": "2025-07-19T19:49:05.631206+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen3-32b/benchmarks.json ================================================ [ { "model_benchmark_id": 1625, "benchmark_id": "aider", "model_id": "qwen3-32b", "score": 0.502, "normalized_score": 0.502, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Pass@2", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.571165+00:00", "updated_at": "2025-07-19T19:56:14.571165+00:00", "benchmark_name": "Aider" }, { "model_benchmark_id": 453, "benchmark_id": "aime-2024", "model_id": "qwen3-32b", "score": 0.814, "normalized_score": 
0.814, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Pass@64", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.961658+00:00", "updated_at": "2025-07-19T19:56:11.961658+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 689, "benchmark_id": "aime-2025", "model_id": "qwen3-32b", "score": 0.729, "normalized_score": 0.729, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Pass@64", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.446075+00:00", "updated_at": "2025-07-19T19:56:12.446075+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1451, "benchmark_id": "arena-hard", "model_id": "qwen3-32b", "score": 0.938, "normalized_score": 0.938, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.093495+00:00", "updated_at": "2025-07-19T19:56:14.093495+00:00", "benchmark_name": "Arena Hard" }, { "model_benchmark_id": 850, "benchmark_id": "bfcl", "model_id": "qwen3-32b", "score": 0.703, "normalized_score": 0.703, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "v3", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.778924+00:00", "updated_at": "2025-07-19T19:56:12.778924+00:00", "benchmark_name": 
"BFCL" }, { "model_benchmark_id": 1645, "benchmark_id": "codeforces", "model_id": "qwen3-32b", "score": 0.659, "normalized_score": 0.659, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Elo Rating", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.627279+00:00", "updated_at": "2025-07-19T19:56:14.627279+00:00", "benchmark_name": "CodeForces" }, { "model_benchmark_id": 748, "benchmark_id": "livebench", "model_id": "qwen3-32b", "score": 0.749, "normalized_score": 0.749, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.573432+00:00", "updated_at": "2025-07-19T19:56:12.573432+00:00", "benchmark_name": "LiveBench" }, { "model_benchmark_id": 1122, "benchmark_id": "livecodebench", "model_id": "qwen3-32b", "score": 0.657, "normalized_score": 0.657, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "v5", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.342304+00:00", "updated_at": "2025-07-19T19:56:13.342304+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 1646, "benchmark_id": "multilf", "model_id": "qwen3-32b", "score": 0.73, "normalized_score": 0.73, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/", "verified_by_llmstats": false, "analysis_method": "Accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:14.630716+00:00", "updated_at": "2025-07-19T19:56:14.630716+00:00", "benchmark_name": "MultiLF" } ] ================================================ FILE: data/organizations/qwen/models/qwen3-32b/model.json ================================================ { "model_id": "qwen3-32b", "name": "Qwen3 32B", "organization_id": "qwen", "fine_tuned_from_model_id": null, "description": "Qwen3-32B is a large language model from Alibaba's Qwen3 series. It features 32.8 billion parameters, a 128k token context window, support for 119 languages, and hybrid thinking modes allowing switching between deep reasoning and fast responses. It demonstrates strong performance in reasoning, instruction-following, and agent capabilities.", "release_date": "2025-04-29", "announcement_date": "2025-04-29", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 32800000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://chat.qwen.ai/", "source_paper": null, "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen3/", "source_repo_link": "https://github.com/QwenLM/Qwen3", "source_weights_link": "https://huggingface.co/Qwen/Qwen3-32B", "created_at": "2025-07-19T19:49:05.621845+00:00", "updated_at": "2025-07-19T19:49:05.621845+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwen3-next-80b-a3b-base/benchmarks.json ================================================ [] ================================================ FILE: data/organizations/qwen/models/qwen3-next-80b-a3b-base/model.json ================================================ { "model_id": "qwen3-next-80b-a3b-base", "name": "Qwen3-Next-80B-A3B-Base", "organization_id": "qwen", "model_family_id": null, "fine_tuned_from_model_id": null, "description": "Qwen3-Next-80B-A3B-Base is the foundation model in the 
Qwen3-Next series, featuring revolutionary architectural innovations for ultimate training and inference efficiency. It introduces Hybrid Attention combining Gated DeltaNet (75% layers) and Gated Attention (25% layers) for efficient ultra-long context modeling, Ultra-Sparse MoE with 512 total experts but only 10 routed + 1 shared expert activated (3.7% activation ratio), and native Multi-Token Prediction for faster inference. With 80B total parameters and only ~3B activated per inference step, it achieves performance comparable to Qwen3-32B while using less than 10% training cost and delivering 10x+ throughput for 32K+ contexts. Trained on 15T tokens with training-stability-friendly designs including Zero-Centered RMSNorm and normalized MoE router parameters. Supports 256K context length, extensible to 1M tokens with YaRN scaling.", "release_date": "2025-09-10", "announcement_date": "2025-09-10", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 80000000000, "training_tokens": 15000000000000, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Base", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen3-next/", "source_repo_link": "https://github.com/QwenLM/Qwen3", "source_weights_link": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Base", "created_at": "2025-09-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00" } ================================================ FILE: data/organizations/qwen/models/qwen3-next-80b-a3b-instruct/benchmarks.json ================================================ [ { "model_benchmark_id": 9301, "benchmark_id": "mmlu-pro", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.806, "normalized_score": 0.806, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, 
"verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 9302, "benchmark_id": "mmlu-redux", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.909, "normalized_score": 0.909, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MMLU-Redux" }, { "model_benchmark_id": 9303, "benchmark_id": "gpqa", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.729, "normalized_score": 0.729, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 9304, "benchmark_id": "supergpqa", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.588, "normalized_score": 0.588, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "SuperGPQA" }, { "model_benchmark_id": 9305, "benchmark_id": "aime-2025", "model_id": 
"qwen3-next-80b-a3b-instruct", "score": 0.695, "normalized_score": 0.695, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 9306, "benchmark_id": "hmmt25", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.541, "normalized_score": 0.541, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "HMMT25" }, { "model_benchmark_id": 9307, "benchmark_id": "livebench-20241125", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.758, "normalized_score": 0.758, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "LiveBench 20241125" }, { "model_benchmark_id": 9308, "benchmark_id": "livecodebench-v6", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.566, "normalized_score": 0.566, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": "25.02-25.05", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "LiveCodeBench v6" }, { "model_benchmark_id": 9309, "benchmark_id": "multipl-e", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.878, "normalized_score": 0.878, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MultiPL-E" }, { "model_benchmark_id": 9310, "benchmark_id": "aider-polyglot", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.498, "normalized_score": 0.498, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Aider-Polyglot" }, { "model_benchmark_id": 9311, "benchmark_id": "ifeval", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.876, "normalized_score": 0.876, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 9312, "benchmark_id": "arena-hard-v2", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.827, 
"normalized_score": 0.827, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": "GPT-4.1 evaluated win rates", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Arena-Hard v2" }, { "model_benchmark_id": 9313, "benchmark_id": "creative-writing-v3", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.853, "normalized_score": 0.853, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Creative Writing v3" }, { "model_benchmark_id": 9314, "benchmark_id": "writingbench", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.873, "normalized_score": 0.873, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "WritingBench" }, { "model_benchmark_id": 9315, "benchmark_id": "bfcl-v3", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.703, "normalized_score": 0.703, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "BFCL-v3" }, { "model_benchmark_id": 9316, "benchmark_id": "tau-bench-retail", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.609, "normalized_score": 0.609, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "TAU1-Retail", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "TAU1-Retail" }, { "model_benchmark_id": 9317, "benchmark_id": "tau-bench-airline", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.44, "normalized_score": 0.44, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "TAU1-Airline", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "TAU1-Airline" }, { "model_benchmark_id": 9318, "benchmark_id": "tau2-retail", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.573, "normalized_score": 0.573, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "TAU2-Retail" }, { "model_benchmark_id": 9319, "benchmark_id": "tau2-airline", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.455, 
"normalized_score": 0.455, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "TAU2-Airline" }, { "model_benchmark_id": 9320, "benchmark_id": "tau2-telecom", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.132, "normalized_score": 0.132, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "TAU2-Telecom" }, { "model_benchmark_id": 9321, "benchmark_id": "multi-if", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.758, "normalized_score": 0.758, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "MultiIF", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MultiIF" }, { "model_benchmark_id": 9322, "benchmark_id": "mmlu-prox", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.767, "normalized_score": 0.767, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "MMLU-ProX", "created_at": 
"2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MMLU-ProX" }, { "model_benchmark_id": 9323, "benchmark_id": "include", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.789, "normalized_score": 0.789, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "INCLUDE" }, { "model_benchmark_id": 9324, "benchmark_id": "polymath", "model_id": "qwen3-next-80b-a3b-instruct", "score": 0.459, "normalized_score": 0.459, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "PolyMATH" } ] ================================================ FILE: data/organizations/qwen/models/qwen3-next-80b-a3b-instruct/model.json ================================================ { "model_id": "qwen3-next-80b-a3b-instruct", "name": "Qwen3-Next-80B-A3B-Instruct", "organization_id": "qwen", "model_family_id": null, "fine_tuned_from_model_id": null, "description": "Qwen3-Next-80B-A3B-Instruct is the first in the Qwen3-Next series, featuring groundbreaking architectural innovations. It uses Hybrid Attention combining Gated DeltaNet and Gated Attention for efficient ultra-long context modeling, High-Sparsity MoE with 512 experts (10 activated + 1 shared) achieving an extremely low activation ratio, and Multi-Token Prediction for improved performance and faster inference. 
With 80B total parameters and only 3B activated, it outperforms Qwen3-32B-Base with 10% training cost and 10x throughput for 32K+ contexts. The model performs on par with Qwen3-235B-A22B-Instruct-2507 while excelling at ultra-long-context tasks up to 256K tokens (extensible to 1M with YaRN). Architecture: 48 layers, 15T training tokens, hybrid layout of 12*(3*(Gated DeltaNet->MoE)->(Gated Attention->MoE)).", "release_date": "2025-09-10", "announcement_date": "2025-09-10", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 80000000000, "training_tokens": 15000000000000, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct", "source_playground": "https://chat.qwen.ai/", "source_paper": null, "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen3-next/", "source_repo_link": "https://github.com/QwenLM/Qwen3", "source_weights_link": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct", "created_at": "2025-09-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00" } ================================================ FILE: data/organizations/qwen/models/qwen3-next-80b-a3b-thinking/benchmarks.json ================================================ [ { "model_benchmark_id": 9201, "benchmark_id": "mmlu-pro", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.827, "normalized_score": 0.827, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 9202, "benchmark_id": "mmlu-redux", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.925, "normalized_score": 0.925, 
"is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MMLU-Redux" }, { "model_benchmark_id": 9203, "benchmark_id": "gpqa", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.772, "normalized_score": 0.772, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 9204, "benchmark_id": "supergpqa", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.608, "normalized_score": 0.608, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "SuperGPQA" }, { "model_benchmark_id": 9205, "benchmark_id": "aime-2025", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.878, "normalized_score": 0.878, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": 
"2025-09-15T00:00:00.000000+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 9206, "benchmark_id": "hmmt25", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.739, "normalized_score": 0.739, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "HMMT25" }, { "model_benchmark_id": 9207, "benchmark_id": "livebench-20241125", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.766, "normalized_score": 0.766, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "LiveBench 20241125" }, { "model_benchmark_id": 9208, "benchmark_id": "livecodebench-v6", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.687, "normalized_score": 0.687, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": "25.02-25.05", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "LiveCodeBench v6" }, { "model_benchmark_id": 9209, "benchmark_id": "cfeval", "model_id": "qwen3-next-80b-a3b-thinking", "score": 2071, "normalized_score": 0.2071, "is_self_reported": true, "self_reported_source_link": 
"https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "Raw score: 2071", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "CFEval" }, { "model_benchmark_id": 9210, "benchmark_id": "ojbench", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.297, "normalized_score": 0.297, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "OJBench" }, { "model_benchmark_id": 9211, "benchmark_id": "ifeval", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.889, "normalized_score": 0.889, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 9212, "benchmark_id": "arena-hard-v2", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.623, "normalized_score": 0.623, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": "GPT-4.1 evaluated win rates", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": 
"2025-09-15T00:00:00.000000+00:00", "benchmark_name": "Arena-Hard v2" }, { "model_benchmark_id": 9213, "benchmark_id": "writingbench", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.846, "normalized_score": 0.846, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "WritingBench" }, { "model_benchmark_id": 9214, "benchmark_id": "bfcl-v3", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.72, "normalized_score": 0.72, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "BFCL-v3" }, { "model_benchmark_id": 9215, "benchmark_id": "tau-bench-retail", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.696, "normalized_score": 0.696, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "TAU1-Retail", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "TAU1-Retail" }, { "model_benchmark_id": 9216, "benchmark_id": "tau-bench-airline", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.49, "normalized_score": 0.49, "is_self_reported": true, "self_reported_source_link": 
"https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "TAU1-Airline", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "TAU1-Airline" }, { "model_benchmark_id": 9217, "benchmark_id": "tau2-retail", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.678, "normalized_score": 0.678, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "TAU2-Retail" }, { "model_benchmark_id": 9218, "benchmark_id": "tau2-airline", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.605, "normalized_score": 0.605, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "TAU2-Airline" }, { "model_benchmark_id": 9219, "benchmark_id": "tau2-telecom", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.439, "normalized_score": 0.439, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": 
"2025-09-15T00:00:00.000000+00:00", "benchmark_name": "TAU2-Telecom" }, { "model_benchmark_id": 9220, "benchmark_id": "multi-if", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.778, "normalized_score": 0.778, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "MultiIF", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MultiIF" }, { "model_benchmark_id": 9221, "benchmark_id": "mmlu-prox", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.787, "normalized_score": 0.787, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": "MMLU-ProX", "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "MMLU-ProX" }, { "model_benchmark_id": 9222, "benchmark_id": "include", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.789, "normalized_score": 0.789, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", "verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "INCLUDE" }, { "model_benchmark_id": 9223, "benchmark_id": "polymath", "model_id": "qwen3-next-80b-a3b-thinking", "score": 0.563, "normalized_score": 0.563, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/", 
"verified_by_llmstats": false, "analysis_method": null, "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-01-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "benchmark_name": "PolyMATH" } ] ================================================ FILE: data/organizations/qwen/models/qwen3-next-80b-a3b-thinking/model.json ================================================ { "model_id": "qwen3-next-80b-a3b-thinking", "name": "Qwen3-Next-80B-A3B-Thinking", "organization_id": "qwen", "model_family_id": null, "fine_tuned_from_model_id": null, "description": "Qwen3-Next-80B-A3B-Thinking is the thinking variant of the Qwen3-Next series, featuring the same groundbreaking architecture as the instruct model. Leveraging GSPO, it addresses stability and efficiency challenges of hybrid attention + high-sparsity MoE in RL training. It uses Hybrid Attention combining Gated DeltaNet and Gated Attention for efficient ultra-long context modeling, High-Sparsity MoE with 512 experts (10 activated + 1 shared), and Multi-Token Prediction. With 80B total parameters and only 3B activated, it demonstrates outstanding performance on complex reasoning tasks — outperforming Qwen3-30B-A3B-Thinking-2507, Qwen3-32B-Thinking, and even the proprietary Gemini-2.5-Flash-Thinking across multiple benchmarks. Architecture: 48 layers, 15T training tokens, hybrid layout of 12*(3*(Gated DeltaNet->MoE)->(Gated Attention->MoE)). 
Supports only thinking mode with automatic tag inclusion, may generate longer thinking content.", "release_date": "2025-09-10", "announcement_date": "2025-09-10", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": null, "param_count": 80000000000, "training_tokens": 15000000000000, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking", "source_playground": "https://chat.qwen.ai/", "source_paper": null, "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen3-next/", "source_repo_link": "https://github.com/QwenLM/Qwen3", "source_weights_link": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking", "created_at": "2025-09-10T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00" } ================================================ FILE: data/organizations/qwen/models/qwq-32b/benchmarks.json ================================================ [ { "model_benchmark_id": 451, "benchmark_id": "aime-2024", "model_id": "qwq-32b", "score": 0.795, "normalized_score": 0.795, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.957773+00:00", "updated_at": "2025-07-19T19:56:11.957773+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 849, "benchmark_id": "bfcl", "model_id": "qwq-32b", "score": 0.664, "normalized_score": 0.664, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.777209+00:00", "updated_at": "2025-07-19T19:56:12.777209+00:00", 
"benchmark_name": "BFCL" }, { "model_benchmark_id": 298, "benchmark_id": "gpqa", "model_id": "qwq-32b", "score": 0.652, "normalized_score": 0.652, "is_self_reported": true, "self_reported_source_link": "https://qwen-ai.com/qwq-32b/", "verified_by_llmstats": false, "analysis_method": "Pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.672880+00:00", "updated_at": "2025-07-19T19:56:11.672880+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 619, "benchmark_id": "ifeval", "model_id": "qwq-32b", "score": 0.839, "normalized_score": 0.839, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.275723+00:00", "updated_at": "2025-07-19T19:56:12.275723+00:00", "benchmark_name": "IFEval" }, { "model_benchmark_id": 747, "benchmark_id": "livebench", "model_id": "qwq-32b", "score": 0.731, "normalized_score": 0.731, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.570952+00:00", "updated_at": "2025-07-19T19:56:12.570952+00:00", "benchmark_name": "LiveBench" }, { "model_benchmark_id": 1118, "benchmark_id": "livecodebench", "model_id": "qwq-32b", "score": 0.634, "normalized_score": 0.634, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:13.332752+00:00", "updated_at": "2025-07-19T19:56:13.332752+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 495, "benchmark_id": "math-500", "model_id": "qwq-32b", "score": 0.906, "normalized_score": 0.906, "is_self_reported": true, "self_reported_source_link": "https://qwen-ai.com/qwq-32b/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.034467+00:00", "updated_at": "2025-07-19T19:56:12.034467+00:00", "benchmark_name": "MATH-500" } ] ================================================ FILE: data/organizations/qwen/models/qwq-32b/model.json ================================================ { "model_id": "qwq-32b", "name": "QwQ-32B", "organization_id": "qwen", "fine_tuned_from_model_id": null, "description": "A model focused on advancing AI reasoning capabilities, particularly excelling in mathematics and programming. 
Features deep introspection and self-questioning abilities while having some limitations in language mixing and recursive/endless reasoning patterns.", "release_date": "2025-03-05", "announcement_date": "2025-03-05", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": "2024-11-28", "param_count": 32500000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/Qwen/QwQ-32B", "source_playground": "https://huggingface.co/playground?modelId=Qwen/QwQ-32B", "source_paper": "https://arxiv.org/abs/2412.15115", "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwq-32b/", "source_repo_link": "https://github.com/QwenLM/QwQ", "source_weights_link": "https://huggingface.co/Qwen/QwQ-32B", "created_at": "2025-07-19T19:49:05.609393+00:00", "updated_at": "2025-07-19T19:49:05.609393+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/models/qwq-32b-preview/benchmarks.json ================================================ [ { "model_benchmark_id": 452, "benchmark_id": "aime-2024", "model_id": "qwq-32b-preview", "score": 0.5, "normalized_score": 0.5, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b-preview/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.959852+00:00", "updated_at": "2025-07-19T19:56:11.959852+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 300, "benchmark_id": "gpqa", "model_id": "qwq-32b-preview", "score": 0.652, "normalized_score": 0.652, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b-preview/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, 
"verification_notes": null, "created_at": "2025-07-19T19:56:11.675997+00:00", "updated_at": "2025-07-19T19:56:11.675997+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1120, "benchmark_id": "livecodebench", "model_id": "qwq-32b-preview", "score": 0.5, "normalized_score": 0.5, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b-preview/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.337401+00:00", "updated_at": "2025-07-19T19:56:13.337401+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 496, "benchmark_id": "math-500", "model_id": "qwq-32b-preview", "score": 0.906, "normalized_score": 0.906, "is_self_reported": true, "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b-preview/", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.036449+00:00", "updated_at": "2025-07-19T19:56:12.036449+00:00", "benchmark_name": "MATH-500" } ] ================================================ FILE: data/organizations/qwen/models/qwq-32b-preview/model.json ================================================ { "model_id": "qwq-32b-preview", "name": "QwQ-32B-Preview", "organization_id": "qwen", "fine_tuned_from_model_id": "qwen-2.5-32b-instruct", "description": "An experimental research model focused on advancing AI reasoning capabilities, particularly excelling in mathematics and programming. 
Features deep introspection and self-questioning abilities while having some limitations in language mixing and recursive reasoning patterns.", "release_date": "2024-11-28", "announcement_date": "2024-11-28", "license_id": "apache_2_0", "multimodal": false, "knowledge_cutoff": "2024-11-28", "param_count": 32500000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://huggingface.co/Qwen/QwQ-32B-Preview", "source_playground": "https://huggingface.co/spaces/Qwen/QwQ-32B-Preview", "source_paper": "https://arxiv.org/abs/2407.10671", "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwq-32b-preview/", "source_repo_link": "https://github.com/QwenLM/Qwen2", "source_weights_link": "https://huggingface.co/Qwen/QwQ-32B-Preview", "created_at": "2025-07-19T19:49:05.887027+00:00", "updated_at": "2025-07-19T19:49:05.887027+00:00", "model_family_id": null } ================================================ FILE: data/organizations/qwen/organization.json ================================================ { "organization_id": "qwen", "name": "Alibaba Cloud / Qwen Team", "website": "https://qwenlm.github.io", "description": "The Qwen Team from Alibaba Cloud, developing the Qwen series of large language models including state-of-the-art mixture-of-experts and thinking-enabled models", "country": "CN", "created_at": "2025-07-19T19:49:05.604449+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00" } ================================================ FILE: data/organizations/unknown/organization.json ================================================ { "organization_id": "unknown", "name": "Unknown", "website": "", "description": "Default organization for missing data", "country": null, "created_at": "2025-08-03T22:06:10.791768+00:00", "updated_at": "2025-08-03T22:06:10.791768+00:00" } ================================================ FILE: data/organizations/xai/models/grok-1.5/benchmarks.json ================================================ [ { 
"model_benchmark_id": 894, "benchmark_id": "docvqa", "model_id": "grok-1.5", "score": 0.856, "normalized_score": 0.856, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.861804+00:00", "updated_at": "2025-07-19T19:56:12.861804+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 322, "benchmark_id": "gpqa", "model_id": "grok-1.5", "score": 0.359, "normalized_score": 0.359, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.711788+00:00", "updated_at": "2025-07-19T19:56:11.711788+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1001, "benchmark_id": "gsm8k", "model_id": "grok-1.5", "score": 0.9, "normalized_score": 0.9, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-1.5", "verified_by_llmstats": false, "analysis_method": "8-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.092882+00:00", "updated_at": "2025-07-19T19:56:13.092882+00:00", "benchmark_name": "GSM8k" }, { "model_benchmark_id": 794, "benchmark_id": "humaneval", "model_id": "grok-1.5", "score": 0.741, "normalized_score": 0.741, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-1.5", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.660557+00:00", "updated_at": 
"2025-07-19T19:56:12.660557+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 413, "benchmark_id": "math", "model_id": "grok-1.5", "score": 0.506, "normalized_score": 0.506, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-1.5", "verified_by_llmstats": false, "analysis_method": "4-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.878054+00:00", "updated_at": "2025-07-19T19:56:11.878054+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 532, "benchmark_id": "mathvista", "model_id": "grok-1.5", "score": 0.528, "normalized_score": 0.528, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.103226+00:00", "updated_at": "2025-07-19T19:56:12.103226+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 97, "benchmark_id": "mmlu", "model_id": "grok-1.5", "score": 0.813, "normalized_score": 0.813, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-1.5", "verified_by_llmstats": false, "analysis_method": "5-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.283997+00:00", "updated_at": "2025-07-19T19:56:11.283997+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 206, "benchmark_id": "mmlu-pro", "model_id": "grok-1.5", "score": 0.51, "normalized_score": 0.51, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, 
"created_at": "2025-07-19T19:56:11.492470+00:00", "updated_at": "2025-07-19T19:56:11.492470+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 578, "benchmark_id": "mmmu", "model_id": "grok-1.5", "score": 0.536, "normalized_score": 0.536, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "0-shot", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.189264+00:00", "updated_at": "2025-07-19T19:56:12.189264+00:00", "benchmark_name": "MMMU" } ] ================================================ FILE: data/organizations/xai/models/grok-1.5/model.json ================================================ { "model_id": "grok-1.5", "name": "Grok-1.5", "organization_id": "xai", "fine_tuned_from_model_id": null, "description": "An advanced language model with improved reasoning capabilities, particularly excelling in coding and mathematical tasks. 
Features a 128K token context window and enhanced problem-solving abilities compared to its predecessor.", "release_date": "2024-03-28", "announcement_date": "2024-03-28", "license_id": "proprietary", "multimodal": false, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://x.ai/api", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://x.ai/blog/grok-1.5", "source_repo_link": "https://github.com/xai-org/grok-1", "source_weights_link": null, "created_at": "2025-07-19T19:49:05.705047+00:00", "updated_at": "2025-07-19T19:49:05.705047+00:00", "model_family_id": null } ================================================ FILE: data/organizations/xai/models/grok-1.5v/benchmarks.json ================================================ [ { "model_benchmark_id": 1259, "benchmark_id": "ai2d", "model_id": "grok-1.5v", "score": 0.883, "normalized_score": 0.883, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-1.5v", "verified_by_llmstats": false, "analysis_method": "zero-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.641849+00:00", "updated_at": "2025-07-19T19:56:13.641849+00:00", "benchmark_name": "AI2D" }, { "model_benchmark_id": 871, "benchmark_id": "chartqa", "model_id": "grok-1.5v", "score": 0.761, "normalized_score": 0.761, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-1.5v", "verified_by_llmstats": false, "analysis_method": "zero-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.817786+00:00", "updated_at": "2025-07-19T19:56:12.817786+00:00", "benchmark_name": "ChartQA" }, { "model_benchmark_id": 896, "benchmark_id": "docvqa", "model_id": "grok-1.5v", "score": 
0.856, "normalized_score": 0.856, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-1.5v", "verified_by_llmstats": false, "analysis_method": "zero-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.865566+00:00", "updated_at": "2025-07-19T19:56:12.865566+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 534, "benchmark_id": "mathvista", "model_id": "grok-1.5v", "score": 0.528, "normalized_score": 0.528, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-1.5v", "verified_by_llmstats": false, "analysis_method": "zero-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.106344+00:00", "updated_at": "2025-07-19T19:56:12.106344+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 581, "benchmark_id": "mmmu", "model_id": "grok-1.5v", "score": 0.536, "normalized_score": 0.536, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-1.5v", "verified_by_llmstats": false, "analysis_method": "zero-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.195047+00:00", "updated_at": "2025-07-19T19:56:12.195047+00:00", "benchmark_name": "MMMU" }, { "model_benchmark_id": 1638, "benchmark_id": "realworldqa", "model_id": "grok-1.5v", "score": 0.687, "normalized_score": 0.687, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-1.5v", "verified_by_llmstats": false, "analysis_method": "zero-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:14.606610+00:00", "updated_at": 
"2025-07-19T19:56:14.606610+00:00", "benchmark_name": "RealWorldQA" }, { "model_benchmark_id": 915, "benchmark_id": "textvqa", "model_id": "grok-1.5v", "score": 0.781, "normalized_score": 0.781, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-1.5v", "verified_by_llmstats": false, "analysis_method": "zero-shot evaluation", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.908800+00:00", "updated_at": "2025-07-19T19:56:12.908800+00:00", "benchmark_name": "TextVQA" } ] ================================================ FILE: data/organizations/xai/models/grok-1.5v/model.json ================================================ { "model_id": "grok-1.5v", "name": "Grok-1.5V", "organization_id": "xai", "fine_tuned_from_model_id": null, "description": "A multimodal model capable of processing text and visual information, including documents, diagrams, charts, screenshots, and photographs. 
Notable for strong real-world spatial understanding capabilities.", "release_date": "2024-04-12", "announcement_date": "2024-04-12", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://x.ai/api", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://x.ai/blog/grok-1.5v", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.717803+00:00", "updated_at": "2025-07-19T19:49:05.717803+00:00", "model_family_id": null } ================================================ FILE: data/organizations/xai/models/grok-2/benchmarks.json ================================================ [ { "model_benchmark_id": 895, "benchmark_id": "docvqa", "model_id": "grok-2", "score": 0.936, "normalized_score": 0.936, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.863462+00:00", "updated_at": "2025-07-19T19:56:12.863462+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 325, "benchmark_id": "gpqa", "model_id": "grok-2", "score": 0.56, "normalized_score": 0.56, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.716230+00:00", "updated_at": "2025-07-19T19:56:11.716230+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 795, "benchmark_id": "humaneval", "model_id": "grok-2", "score": 0.884, "normalized_score": 0.884, "is_self_reported": true, "self_reported_source_link": 
"https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.662404+00:00", "updated_at": "2025-07-19T19:56:12.662404+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 414, "benchmark_id": "math", "model_id": "grok-2", "score": 0.761, "normalized_score": 0.761, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "maj@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.880368+00:00", "updated_at": "2025-07-19T19:56:11.880368+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 533, "benchmark_id": "mathvista", "model_id": "grok-2", "score": 0.69, "normalized_score": 0.69, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.104885+00:00", "updated_at": "2025-07-19T19:56:12.104885+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 98, "benchmark_id": "mmlu", "model_id": "grok-2", "score": 0.875, "normalized_score": 0.875, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.285517+00:00", "updated_at": "2025-07-19T19:56:11.285517+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 207, "benchmark_id": "mmlu-pro", "model_id": "grok-2", "score": 0.755, "normalized_score": 0.755, 
"is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.494333+00:00", "updated_at": "2025-07-19T19:56:11.494333+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 580, "benchmark_id": "mmmu", "model_id": "grok-2", "score": 0.661, "normalized_score": 0.661, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.193698+00:00", "updated_at": "2025-07-19T19:56:12.193698+00:00", "benchmark_name": "MMMU" } ] ================================================ FILE: data/organizations/xai/models/grok-2/model.json ================================================ { "model_id": "grok-2", "name": "Grok-2", "organization_id": "xai", "fine_tuned_from_model_id": null, "description": "Grok-2 is a frontier language model with state-of-the-art reasoning capabilities, featuring advanced abilities in chat, coding, and reasoning. 
It demonstrates superior performance in visual math reasoning, document-based question answering, and excels across various academic benchmarks including reasoning, reading comprehension, math, and science.", "release_date": "2024-08-13", "announcement_date": "2024-08-13", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://x.ai/api", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://x.ai/blog/grok-2", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.715016+00:00", "updated_at": "2025-07-19T19:49:05.715016+00:00", "model_family_id": null } ================================================ FILE: data/organizations/xai/models/grok-2-mini/benchmarks.json ================================================ [ { "model_benchmark_id": 893, "benchmark_id": "docvqa", "model_id": "grok-2-mini", "score": 0.932, "normalized_score": 0.932, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.860093+00:00", "updated_at": "2025-07-19T19:56:12.860093+00:00", "benchmark_name": "DocVQA" }, { "model_benchmark_id": 321, "benchmark_id": "gpqa", "model_id": "grok-2-mini", "score": 0.51, "normalized_score": 0.51, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.710285+00:00", "updated_at": "2025-07-19T19:56:11.710285+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 793, "benchmark_id": 
"humaneval", "model_id": "grok-2-mini", "score": 0.857, "normalized_score": 0.857, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "pass@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.658802+00:00", "updated_at": "2025-07-19T19:56:12.658802+00:00", "benchmark_name": "HumanEval" }, { "model_benchmark_id": 412, "benchmark_id": "math", "model_id": "grok-2-mini", "score": 0.73, "normalized_score": 0.73, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "maj@1", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.876593+00:00", "updated_at": "2025-07-19T19:56:11.876593+00:00", "benchmark_name": "MATH" }, { "model_benchmark_id": 531, "benchmark_id": "mathvista", "model_id": "grok-2-mini", "score": 0.681, "normalized_score": 0.681, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.101817+00:00", "updated_at": "2025-07-19T19:56:12.101817+00:00", "benchmark_name": "MathVista" }, { "model_benchmark_id": 96, "benchmark_id": "mmlu", "model_id": "grok-2-mini", "score": 0.862, "normalized_score": 0.862, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.281643+00:00", "updated_at": 
"2025-07-19T19:56:11.281643+00:00", "benchmark_name": "MMLU" }, { "model_benchmark_id": 205, "benchmark_id": "mmlu-pro", "model_id": "grok-2-mini", "score": 0.72, "normalized_score": 0.72, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.490630+00:00", "updated_at": "2025-07-19T19:56:11.490630+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 577, "benchmark_id": "mmmu", "model_id": "grok-2-mini", "score": 0.632, "normalized_score": 0.632, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-2", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.186961+00:00", "updated_at": "2025-07-19T19:56:12.186961+00:00", "benchmark_name": "MMMU" } ] ================================================ FILE: data/organizations/xai/models/grok-2-mini/model.json ================================================ { "model_id": "grok-2-mini", "name": "Grok-2 mini", "organization_id": "xai", "fine_tuned_from_model_id": null, "description": "Grok-2 mini is a smaller, faster variant of Grok-2 that offers a balance between speed and answer quality. 
While more compact than its larger sibling, it maintains strong capabilities across various tasks including reasoning, coding, and chat interactions.", "release_date": "2024-08-13", "announcement_date": "2024-08-13", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://x.ai/api", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": "https://x.ai/blog/grok-2", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.702680+00:00", "updated_at": "2025-07-19T19:49:05.702680+00:00", "model_family_id": null } ================================================ FILE: data/organizations/xai/models/grok-3/benchmarks.json ================================================ [ { "model_benchmark_id": 475, "benchmark_id": "aime-2024", "model_id": "grok-3", "score": 0.933, "normalized_score": 0.933, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-3", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.003392+00:00", "updated_at": "2025-07-19T19:56:12.003392+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 696, "benchmark_id": "aime-2025", "model_id": "grok-3", "score": 0.933, "normalized_score": 0.933, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-3", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.457788+00:00", "updated_at": "2025-07-19T19:56:12.457788+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 324, "benchmark_id": "gpqa", "model_id": "grok-3", "score": 0.846, 
"normalized_score": 0.846, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-3", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.714708+00:00", "updated_at": "2025-07-19T19:56:11.714708+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1142, "benchmark_id": "livecodebench", "model_id": "grok-3", "score": 0.794, "normalized_score": 0.794, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-3", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.402422+00:00", "updated_at": "2025-07-19T19:56:13.402422+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 579, "benchmark_id": "mmmu", "model_id": "grok-3", "score": 0.78, "normalized_score": 0.78, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-3", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.191844+00:00", "updated_at": "2025-07-19T19:56:12.191844+00:00", "benchmark_name": "MMMU" } ] ================================================ FILE: data/organizations/xai/models/grok-3/model.json ================================================ { "model_id": "grok-3", "name": "Grok-3", "organization_id": "xai", "fine_tuned_from_model_id": null, "description": "Grok 3, launched by xAI on February 17, 2025, is an advanced AI model with significantly enhanced capabilities compared to Grok 2, boasting an order of magnitude increase in performance. 
Trained on a vast dataset that includes legal documents among others, and utilizing a massive compute infrastructure with around 200,000 GPUs in a Memphis data center, Grok 3's training used ten times more compute than its predecessor. It features specialized models like Grok 3 Reasoning and Grok 3 Mini Reasoning for complex problem-solving, and it excels in benchmarks like AIME for mathematics and GPQA for PhD-level science.", "release_date": "2025-02-17", "announcement_date": "2025-02-17", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-11-17", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://x.ai/api", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": null, "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.711845+00:00", "updated_at": "2025-07-19T19:49:05.711845+00:00", "model_family_id": null } ================================================ FILE: data/organizations/xai/models/grok-3-mini/benchmarks.json ================================================ [ { "model_benchmark_id": 474, "benchmark_id": "aime-2024", "model_id": "grok-3-mini", "score": 0.958, "normalized_score": 0.958, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-3", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.001587+00:00", "updated_at": "2025-07-19T19:56:12.001587+00:00", "benchmark_name": "AIME 2024" }, { "model_benchmark_id": 693, "benchmark_id": "aime-2025", "model_id": "grok-3-mini", "score": 0.908, "normalized_score": 0.908, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-3", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.452930+00:00", "updated_at": "2025-07-19T19:56:12.452930+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 319, "benchmark_id": "gpqa", "model_id": "grok-3-mini", "score": 0.84, "normalized_score": 0.84, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-3", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.707259+00:00", "updated_at": "2025-07-19T19:56:11.707259+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1139, "benchmark_id": "livecodebench", "model_id": "grok-3-mini", "score": 0.804, "normalized_score": 0.804, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-3", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.394024+00:00", "updated_at": "2025-07-19T19:56:13.394024+00:00", "benchmark_name": "LiveCodeBench" } ] ================================================ FILE: data/organizations/xai/models/grok-3-mini/model.json ================================================ { "model_id": "grok-3-mini", "name": "Grok-3 Mini", "organization_id": "xai", "fine_tuned_from_model_id": null, "description": "Grok 3 Mini is a streamlined version of xAI's Grok 3 AI model, designed for quicker response times while maintaining utility. It's tailored for users who require speed over the comprehensive capabilities of the full Grok 3 model, making it suitable for tasks where rapid information retrieval is key. 
Grok 3 Mini still leverages the advanced training and data that Grok 3 was built on but offers a lighter, more efficient version for everyday use.", "release_date": "2025-02-17", "announcement_date": "2025-02-17", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-11-17", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://x.ai/api", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": null, "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.697297+00:00", "updated_at": "2025-07-19T19:49:05.697297+00:00", "model_family_id": null } ================================================ FILE: data/organizations/xai/models/grok-4/benchmarks.json ================================================ [ { "model_benchmark_id": 695, "benchmark_id": "aime-2025", "model_id": "grok-4", "score": 0.917, "normalized_score": 0.917, "is_self_reported": true, "self_reported_source_link": "https://x.com/xai/status/1943158495588815072", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.456102+00:00", "updated_at": "2025-07-19T19:56:12.456102+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 1387, "benchmark_id": "arc-agi-v2", "model_id": "grok-4", "score": 0.159, "normalized_score": 0.159, "is_self_reported": true, "self_reported_source_link": "https://x.com/xai/status/1943158495588815072", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.922021+00:00", "updated_at": "2025-07-19T19:56:13.922021+00:00", "benchmark_name": "ARC-AGI v2" }, { "model_benchmark_id": 323, "benchmark_id": "gpqa", "model_id": "grok-4", 
"score": 0.875, "normalized_score": 0.875, "is_self_reported": true, "self_reported_source_link": "https://x.com/xai/status/1943158495588815072", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.713248+00:00", "updated_at": "2025-07-19T19:56:11.713248+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1799, "benchmark_id": "hmmt25", "model_id": "grok-4", "score": 0.9, "normalized_score": 0.9, "is_self_reported": true, "self_reported_source_link": "https://x.com/xai/status/1943158495588815072", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.065811+00:00", "updated_at": "2025-07-19T19:56:15.065811+00:00", "benchmark_name": "HMMT25" }, { "model_benchmark_id": 723, "benchmark_id": "humanity's-last-exam", "model_id": "grok-4", "score": 0.4, "normalized_score": 0.4, "is_self_reported": true, "self_reported_source_link": "https://x.com/xai/status/1943158495588815072", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.523105+00:00", "updated_at": "2025-07-19T19:56:12.523105+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 1141, "benchmark_id": "livecodebench", "model_id": "grok-4", "score": 0.79, "normalized_score": 0.79, "is_self_reported": true, "self_reported_source_link": "https://x.com/xai/status/1943158495588815072", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-19T19:56:13.399716+00:00", "updated_at": "2025-07-19T19:56:13.399716+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 1801, "benchmark_id": "usamo25", "model_id": "grok-4", "score": 0.375, "normalized_score": 0.375, "is_self_reported": true, "self_reported_source_link": "https://x.com/xai/status/1943158495588815072", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.071894+00:00", "updated_at": "2025-07-19T19:56:15.071894+00:00", "benchmark_name": "USAMO25" } ] ================================================ FILE: data/organizations/xai/models/grok-4/model.json ================================================ { "model_id": "grok-4", "name": "Grok-4", "organization_id": "xai", "fine_tuned_from_model_id": null, "description": "Grok 4, announced by xAI in summer 2025, represents a major leap in AI capabilities, described as 'the smartest AI in the world.' Built on version 6 of xAI's foundation model, it uses 100x more training compute than Grok 2 and 10x more reinforcement learning compute than Grok 3. The model achieves PhD-level performance across all academic disciplines simultaneously, scoring perfect on standardized tests like the SAT and near-perfect on graduate exams like the GRE. Unlike Grok 3, tool usage is built into the training process rather than relying on generalization. 
Trained using 200,000 GPUs, Grok 4 excels at complex reasoning, mathematical problem-solving, and coding tasks, though it has acknowledged weaknesses in multimodal capabilities that are being addressed in the next version.", "release_date": "2025-07-09", "announcement_date": "2025-07-09", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-12-31", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://x.ai/api", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": null, "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.707962+00:00", "updated_at": "2025-07-19T19:49:05.707962+00:00", "model_family_id": null } ================================================ FILE: data/organizations/xai/models/grok-4-fast/benchmarks.json ================================================ [ { "model_benchmark_id": 22228, "benchmark_id": "gpqa", "model_id": "grok-4-fast", "score": 0.857, "normalized_score": 0.857, "is_self_reported": true, "self_reported_source_link": "https://x.ai/news/grok-4-fast", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-11T00:00:00.000000+00:00", "updated_at": "2025-10-11T00:00:00.000000+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 22229, "benchmark_id": "aime-2025", "model_id": "grok-4-fast", "score": 0.920, "normalized_score": 0.920, "is_self_reported": true, "self_reported_source_link": "https://x.ai/news/grok-4-fast", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-11T00:00:00.000000+00:00", "updated_at": "2025-10-11T00:00:00.000000+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 
22230, "benchmark_id": "hmmt-2025", "model_id": "grok-4-fast", "score": 0.933, "normalized_score": 0.933, "is_self_reported": true, "self_reported_source_link": "https://x.ai/news/grok-4-fast", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-11T00:00:00.000000+00:00", "updated_at": "2025-10-11T00:00:00.000000+00:00", "benchmark_name": "HMMT 2025" }, { "model_benchmark_id": 22231, "benchmark_id": "hle", "model_id": "grok-4-fast", "score": 0.200, "normalized_score": 0.200, "is_self_reported": true, "self_reported_source_link": "https://x.ai/news/grok-4-fast", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-11T00:00:00.000000+00:00", "updated_at": "2025-10-11T00:00:00.000000+00:00", "benchmark_name": "HLE" }, { "model_benchmark_id": 22232, "benchmark_id": "livecodebench", "model_id": "grok-4-fast", "score": 0.800, "normalized_score": 0.800, "is_self_reported": true, "self_reported_source_link": "https://x.ai/news/grok-4-fast", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-11T00:00:00.000000+00:00", "updated_at": "2025-10-11T00:00:00.000000+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 22233, "benchmark_id": "browsecomp", "model_id": "grok-4-fast", "score": 0.449, "normalized_score": 0.449, "is_self_reported": true, "self_reported_source_link": "https://x.ai/news/grok-4-fast", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-10-11T00:00:00.000000+00:00", "updated_at": "2025-10-11T00:00:00.000000+00:00", "benchmark_name": "BrowseComp" }, { "model_benchmark_id": 22234, "benchmark_id": "simpleqa", "model_id": "grok-4-fast", "score": 0.950, "normalized_score": 0.950, "is_self_reported": true, "self_reported_source_link": "https://x.ai/news/grok-4-fast", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-11T00:00:00.000000+00:00", "updated_at": "2025-10-11T00:00:00.000000+00:00", "benchmark_name": "SimpleQA" } ] ================================================ FILE: data/organizations/xai/models/grok-4-fast/model.json ================================================ { "model_id": "grok-4-fast", "name": "Grok 4 Fast", "organization_id": "xai", "fine_tuned_from_model_id": null, "description": "Grok 4 Fast is a high-speed variant of Grok-4, optimized for faster inference while maintaining strong reasoning capabilities. 
It offers improved throughput and lower latency compared to the standard Grok-4 model.", "release_date": "2025-08-28", "announcement_date": "2025-08-28", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://x.ai/api", "source_playground": null, "source_paper": "https://data.x.ai/2025-08-26-grok-code-fast-1-model-card.pdf", "source_scorecard_blog_link": null, "source_repo_link": null, "source_weights_link": null, "created_at": "2025-10-11T00:00:00.000000+00:00", "updated_at": "2025-10-11T00:00:00.000000+00:00", "model_family_id": null } ================================================ FILE: data/organizations/xai/models/grok-4-heavy/benchmarks.json ================================================ [ { "model_benchmark_id": 694, "benchmark_id": "aime-2025", "model_id": "grok-4-heavy", "score": 1.0, "normalized_score": 1.0, "is_self_reported": true, "self_reported_source_link": "https://x.com/xai/status/1943158495588815072", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.454500+00:00", "updated_at": "2025-07-19T19:56:12.454500+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 320, "benchmark_id": "gpqa", "model_id": "grok-4-heavy", "score": 0.884, "normalized_score": 0.884, "is_self_reported": true, "self_reported_source_link": "https://x.com/xai/status/1943158495588815072", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:11.708827+00:00", "updated_at": "2025-07-19T19:56:11.708827+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 1798, "benchmark_id": "hmmt25", "model_id": "grok-4-heavy", 
"score": 0.967, "normalized_score": 0.967, "is_self_reported": true, "self_reported_source_link": "https://x.com/xai/status/1943158495588815072", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:15.063588+00:00", "updated_at": "2025-07-19T19:56:15.063588+00:00", "benchmark_name": "HMMT25" }, { "model_benchmark_id": 722, "benchmark_id": "humanity's-last-exam", "model_id": "grok-4-heavy", "score": 0.507, "normalized_score": 0.507, "is_self_reported": true, "self_reported_source_link": "https://x.com/xai/status/1943158495588815072", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:12.521361+00:00", "updated_at": "2025-07-19T19:56:12.521361+00:00", "benchmark_name": "Humanity's Last Exam" }, { "model_benchmark_id": 1140, "benchmark_id": "livecodebench", "model_id": "grok-4-heavy", "score": 0.794, "normalized_score": 0.794, "is_self_reported": true, "self_reported_source_link": "https://x.com/xai/status/1943158495588815072", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-19T19:56:13.396669+00:00", "updated_at": "2025-07-19T19:56:13.396669+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 1800, "benchmark_id": "usamo25", "model_id": "grok-4-heavy", "score": 0.619, "normalized_score": 0.619, "is_self_reported": true, "self_reported_source_link": "https://x.com/xai/status/1943158495588815072", "verified_by_llmstats": false, "analysis_method": "accuracy", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, 
"created_at": "2025-07-19T19:56:15.070427+00:00", "updated_at": "2025-07-19T19:56:15.070427+00:00", "benchmark_name": "USAMO25" } ] ================================================ FILE: data/organizations/xai/models/grok-4-heavy/model.json ================================================ { "model_id": "grok-4-heavy", "name": "Grok-4 Heavy", "organization_id": "xai", "fine_tuned_from_model_id": null, "description": "Grok 4 Heavy is the multi-agent version of Grok 4, released alongside the standard model in summer 2025. This system spawns multiple Grok 4 agents in parallel that work independently on problems and then collaborate by comparing their solutions, similar to a study group. The agents share insights and tricks they discover, with the system intelligently combining their work rather than simply using majority voting. Grok 4 Heavy uses approximately 10x more test-time compute than regular Grok 4, enabling it to solve significantly more complex problems. On Humanity's Last Exam, it achieves over 50% accuracy on text-only problems, and it scored a perfect result on the AIME 2025 mathematics competition. 
The system represents a major advancement in multi-agent AI collaboration and reasoning capabilities.", "release_date": "2025-07-09", "announcement_date": "2025-07-09", "license_id": "proprietary", "multimodal": true, "knowledge_cutoff": "2024-12-31", "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://x.ai/api", "source_playground": null, "source_paper": null, "source_scorecard_blog_link": null, "source_repo_link": null, "source_weights_link": null, "created_at": "2025-07-19T19:49:05.700416+00:00", "updated_at": "2025-07-19T19:49:05.700416+00:00", "model_family_id": null } ================================================ FILE: data/organizations/xai/models/grok-code-fast-1/benchmarks.json ================================================ [ { "model_benchmark_id": 22227, "benchmark_id": "swe-bench-verified", "model_id": "grok-code-fast-1", "score": 0.708, "normalized_score": 0.708, "is_self_reported": true, "self_reported_source_link": "https://x.ai/blog/grok-code-fast-1", "verified_by_llmstats": false, "analysis_method": "full subset, internal harness", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-10-03T00:00:00.000000+00:00", "updated_at": "2025-10-03T00:00:00.000000+00:00", "benchmark_name": "SWE-Bench Verified" } ] ================================================ FILE: data/organizations/xai/models/grok-code-fast-1/model.json ================================================ { "model_id": "grok-code-fast-1", "name": "Grok Code Fast 1", "organization_id": "xai", "fine_tuned_from_model_id": null, "description": "Grok Code Fast 1 is a speedy and economical reasoning model that excels at agentic coding. Built from scratch with a brand-new model architecture, it features a pre-training corpus rich with programming-related content and post-training datasets that reflect real-world pull requests and coding tasks. 
The model has mastered the use of common tools like grep, terminal, and file editing, making it ideal for integration with IDEs. It is exceptionally versatile across the full software development stack and is particularly adept at TypeScript, Python, Java, Rust, C++, and Go.", "release_date": "2025-08-28", "announcement_date": "2025-08-28", "license_id": "proprietary", "multimodal": false, "knowledge_cutoff": null, "param_count": null, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://x.ai/api", "source_playground": null, "source_paper": "https://data.x.ai/2025-08-26-grok-code-fast-1-model-card.pdf", "source_scorecard_blog_link": "https://x.ai/blog/grok-code-fast-1", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-10-03T00:00:00.000000+00:00", "updated_at": "2025-10-03T00:00:00.000000+00:00", "model_family_id": null } ================================================ FILE: data/organizations/xai/organization.json ================================================ { "organization_id": "xai", "name": "xAI", "website": "https://x.ai", "description": "Elon Musk's AI company", "country": "US", "created_at": "2025-07-19T19:49:05.695344+00:00", "updated_at": "2025-07-19T19:49:05.695344+00:00" } ================================================ FILE: data/organizations/zai-org/models/glm-4.5/benchmarks.json ================================================ [ { "model_benchmark_id": 7001, "benchmark_id": "mmlu-pro", "model_id": "glm-4.5", "score": 0.846, "normalized_score": 0.846, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 7002, "benchmark_id": 
"aime-2024", "model_id": "glm-4.5", "score": 0.91, "normalized_score": 0.91, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "Avg@32", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "AIME24" }, { "model_benchmark_id": 7003, "benchmark_id": "math-500", "model_id": "glm-4.5", "score": 0.982, "normalized_score": 0.982, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "MATH-500" }, { "model_benchmark_id": 7004, "benchmark_id": "scicode", "model_id": "glm-4.5", "score": 0.417, "normalized_score": 0.417, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "SciCode" }, { "model_benchmark_id": 7005, "benchmark_id": "gpqa", "model_id": "glm-4.5", "score": 0.791, "normalized_score": 0.791, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "Avg@8", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", 
"benchmark_name": "GPQA" }, { "model_benchmark_id": 7006, "benchmark_id": "livecodebench", "model_id": "glm-4.5", "score": 0.729, "normalized_score": 0.729, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "2407-2501", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 7007, "benchmark_id": "swe-bench-verified", "model_id": "glm-4.5", "score": 0.642, "normalized_score": 0.642, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "OpenHands v0.34.0", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "SWE-bench-Verified" }, { "model_benchmark_id": 7008, "benchmark_id": "tau-bench-retail", "model_id": "glm-4.5", "score": 0.797, "normalized_score": 0.797, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "optimized user simulator", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "TAU-bench-Retail" }, { "model_benchmark_id": 7009, "benchmark_id": "bfcl-v3", "model_id": "glm-4.5", "score": 0.778, "normalized_score": 0.778, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "Full", "verification_provider_id": null, "verification_hardware": null, 
"verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "BFCL-v3" }, { "model_benchmark_id": 7010, "benchmark_id": "tau-bench-airline", "model_id": "glm-4.5", "score": 0.604, "normalized_score": 0.604, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "optimized user simulator", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "TAU-bench-Airline" }, { "model_benchmark_id": 7011, "benchmark_id": "browsecomp", "model_id": "glm-4.5", "score": 0.264, "normalized_score": 0.264, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "BrowseComp" }, { "model_benchmark_id": 7012, "benchmark_id": "hle", "model_id": "glm-4.5", "score": 0.144, "normalized_score": 0.144, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "text-based questions only", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "HLE" }, { "model_benchmark_id": 7013, "benchmark_id": "aa-index", "model_id": "glm-4.5", "score": 0.677, "normalized_score": 0.677, "is_self_reported": true, "self_reported_source_link": 
"https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "Estimated", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "AA-Index" }, { "model_benchmark_id": 7014, "benchmark_id": "terminal-bench", "model_id": "glm-4.5", "score": 0.375, "normalized_score": 0.375, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "Terminus framework", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "Terminal-Bench" } ] ================================================ FILE: data/organizations/zai-org/models/glm-4.5/model.json ================================================ { "model_id": "glm-4.5", "name": "GLM-4.5", "organization_id": "zai-org", "fine_tuned_from_model_id": null, "description": "GLM-4.5 is an Agentic, Reasoning, and Coding (ARC) foundation model designed for intelligent agents, featuring 355 billion total parameters with 32 billion active parameters using MoE architecture. Trained on 23T tokens through multi-stage training, it is a hybrid reasoning model that provides two modes: thinking mode for complex reasoning and tool usage, and non-thinking mode for immediate responses. The model unifies agentic, reasoning, and coding capabilities with 128K context length support. It achieves exceptional performance with a score of 63.2 across 12 industry-standard benchmarks, placing 3rd among all proprietary and open-source models. 
Released under MIT open-source license allowing commercial use and secondary development.", "release_date": "2025-07-28", "announcement_date": "2025-07-28", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 355000000000, "training_tokens": 23000000000000, "available_in_zeroeval": true, "source_api_ref": "https://docs.z.ai/guides/llm/glm-4.5", "source_playground": "https://chat.z.ai", "source_paper": "https://arxiv.org/pdf/2508.06471", "source_scorecard_blog_link": "https://z.ai/blog/glm-4.5", "source_repo_link": "https://github.com/zai-org/GLM-4.5", "source_weights_link": "https://huggingface.co/zai-org/GLM-4.5", "created_at": "2025-09-15T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "model_family_id": null } ================================================ FILE: data/organizations/zai-org/models/glm-4.5-air/benchmarks.json ================================================ [ { "model_benchmark_id": 7101, "benchmark_id": "mmlu-pro", "model_id": "glm-4.5-air", "score": 0.814, "normalized_score": 0.814, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "MMLU-Pro" }, { "model_benchmark_id": 7102, "benchmark_id": "aime-2024", "model_id": "glm-4.5-air", "score": 0.894, "normalized_score": 0.894, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "Avg@32", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "AIME24" }, { 
"model_benchmark_id": 7103, "benchmark_id": "math-500", "model_id": "glm-4.5-air", "score": 0.981, "normalized_score": 0.981, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "MATH-500" }, { "model_benchmark_id": 7104, "benchmark_id": "scicode", "model_id": "glm-4.5-air", "score": 0.373, "normalized_score": 0.373, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "SciCode" }, { "model_benchmark_id": 7105, "benchmark_id": "gpqa", "model_id": "glm-4.5-air", "score": 0.75, "normalized_score": 0.75, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "Avg@8", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 7106, "benchmark_id": "livecodebench", "model_id": "glm-4.5-air", "score": 0.707, "normalized_score": 0.707, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "2407-2501", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": 
"2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "LiveCodeBench" }, { "model_benchmark_id": 7107, "benchmark_id": "swe-bench-verified", "model_id": "glm-4.5-air", "score": 0.576, "normalized_score": 0.576, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "OpenHands v0.34.0", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "SWE-bench-Verified" }, { "model_benchmark_id": 7108, "benchmark_id": "tau-bench-retail", "model_id": "glm-4.5-air", "score": 0.779, "normalized_score": 0.779, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "optimized user simulator", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "TAU-bench-Retail" }, { "model_benchmark_id": 7109, "benchmark_id": "bfcl-v3", "model_id": "glm-4.5-air", "score": 0.764, "normalized_score": 0.764, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "Full", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "BFCL-v3" }, { "model_benchmark_id": 7110, "benchmark_id": "tau-bench-airline", "model_id": "glm-4.5-air", "score": 0.608, "normalized_score": 0.608, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", 
"verified_by_llmstats": false, "analysis_method": "optimized user simulator", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "TAU-bench-Airline" }, { "model_benchmark_id": 7111, "benchmark_id": "browsecomp", "model_id": "glm-4.5-air", "score": 0.213, "normalized_score": 0.213, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "BrowseComp" }, { "model_benchmark_id": 7112, "benchmark_id": "hle", "model_id": "glm-4.5-air", "score": 0.106, "normalized_score": 0.106, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "text-based questions only", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "HLE" }, { "model_benchmark_id": 7113, "benchmark_id": "aa-index", "model_id": "glm-4.5-air", "score": 0.648, "normalized_score": 0.648, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "Estimated", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "AA-Index" }, { "model_benchmark_id": 7114, "benchmark_id": "terminal-bench", 
"model_id": "glm-4.5-air", "score": 0.3, "normalized_score": 0.3, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.5", "verified_by_llmstats": false, "analysis_method": "Terminus framework", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-28T00:00:00.000000+00:00", "updated_at": "2025-07-28T00:00:00.000000+00:00", "benchmark_name": "Terminal-Bench" } ] ================================================ FILE: data/organizations/zai-org/models/glm-4.5-air/model.json ================================================ { "model_id": "glm-4.5-air", "name": "GLM-4.5-Air", "organization_id": "zai-org", "fine_tuned_from_model_id": null, "description": "GLM-4.5-Air is a more compact variant of GLM-4.5 designed for efficient Agentic, Reasoning, and Coding (ARC) applications. It features 106 billion total parameters with 12 billion active parameters using MoE architecture. Like GLM-4.5, it is a hybrid reasoning model providing thinking mode for complex reasoning and tool usage, and non-thinking mode for immediate responses. Despite its compact design, GLM-4.5-Air delivers competitive performance with a score of 59.8 across 12 industry-standard benchmarks, ranking 6th overall while maintaining superior efficiency. 
It supports 128K context length and is released under MIT open-source license allowing commercial use.", "release_date": "2025-07-28", "announcement_date": "2025-07-28", "license_id": "mit", "multimodal": false, "knowledge_cutoff": null, "param_count": 106000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.z.ai/guides/llm/glm-4.5", "source_playground": "https://chat.z.ai", "source_paper": "https://arxiv.org/pdf/2508.06471", "source_scorecard_blog_link": "https://z.ai/blog/glm-4.5", "source_repo_link": "https://github.com/zai-org/GLM-4.5", "source_weights_link": "https://huggingface.co/zai-org/GLM-4.5-Air", "created_at": "2025-09-15T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "model_family_id": null } ================================================ FILE: data/organizations/zai-org/models/glm-4.5v/benchmarks.json ================================================ [] ================================================ FILE: data/organizations/zai-org/models/glm-4.5v/model.json ================================================ { "model_id": "glm-4.5v", "name": "GLM-4.5V", "organization_id": "zai-org", "model_family_id": null, "fine_tuned_from_model_id": "glm-4.5-air", "description": "GLM-4.5V is a multimodal (vision-language) model based on GLM-4.5-Air (106B total, 12B active) that extends hybrid reasoning to images and video. It achieves state-of-the-art results across 40+ VLM benchmarks (image reasoning, video understanding, GUI tasks, chart/document parsing, grounding) while supporting a Thinking Mode switch for deep reasoning. 
Released under MIT with FP8/BF16 variants and tooling in Transformers, vLLM, and SGLang.", "release_date": "2025-08-11", "announcement_date": "2025-08-11", "license_id": "mit", "multimodal": true, "knowledge_cutoff": null, "param_count": 108000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": null, "source_playground": "https://chat.z.ai", "source_paper": "https://arxiv.org/abs/2507.01006", "source_scorecard_blog_link": null, "source_repo_link": "https://github.com/zai-org/GLM-V/", "source_weights_link": "https://huggingface.co/zai-org/GLM-4.5V", "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00" } ================================================ FILE: data/organizations/zai-org/models/glm-4.6/benchmarks.json ================================================ [ { "model_benchmark_id": 7002, "benchmark_id": "aime-2025", "model_id": "glm-4.6", "score": 0.939, "normalized_score": 0.939, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.6", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-30T00:00:00.000000+00:00", "updated_at": "2025-07-30T00:00:00.000000+00:00", "benchmark_name": "AIME 2025" }, { "model_benchmark_id": 7005, "benchmark_id": "gpqa", "model_id": "glm-4.6", "score": 0.81, "normalized_score": 0.81, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.6", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-30T00:00:00.000000+00:00", "updated_at": "2025-07-30T00:00:00.000000+00:00", "benchmark_name": "GPQA" }, { "model_benchmark_id": 7006, "benchmark_id": "livecodebench-v6", "model_id": "glm-4.6", "score": 0.828, 
"normalized_score": 0.828, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.6", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-30T00:00:00.000000+00:00", "updated_at": "2025-07-30T00:00:00.000000+00:00", "benchmark_name": "LiveCodeBench v6" }, { "model_benchmark_id": 7007, "benchmark_id": "swe-bench-verified", "model_id": "glm-4.6", "score": 0.68, "normalized_score": 0.68, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.6", "verified_by_llmstats": false, "analysis_method": "OpenHands v0.34.0", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-30T00:00:00.000000+00:00", "updated_at": "2025-07-30T00:00:00.000000+00:00", "benchmark_name": "SWE-bench-Verified" }, { "model_benchmark_id": 7011, "benchmark_id": "browsecomp", "model_id": "glm-4.6", "score": 0.451, "normalized_score": 0.451, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.6", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-30T00:00:00.000000+00:00", "updated_at": "2025-07-30T00:00:00.000000+00:00", "benchmark_name": "BrowseComp" }, { "model_benchmark_id": 7012, "benchmark_id": "hle", "model_id": "glm-4.6", "score": 0.172, "normalized_score": 0.172, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.6", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-30T00:00:00.000000+00:00", "updated_at": "2025-07-30T00:00:00.000000+00:00", 
"benchmark_name": "HLE" }, { "model_benchmark_id": 7014, "benchmark_id": "terminal-bench", "model_id": "glm-4.6", "score": 0.405, "normalized_score": 0.405, "is_self_reported": true, "self_reported_source_link": "https://z.ai/blog/glm-4.6", "verified_by_llmstats": false, "analysis_method": "standard", "verification_provider_id": null, "verification_hardware": null, "verification_date": null, "verification_notes": null, "created_at": "2025-07-30T00:00:00.000000+00:00", "updated_at": "2025-07-30T00:00:00.000000+00:00", "benchmark_name": "Terminal-Bench" } ] ================================================ FILE: data/organizations/zai-org/models/glm-4.6/model.json ================================================ { "model_id": "glm-4.6", "name": "GLM-4.6", "organization_id": "zai-org", "model_family_id": null, "fine_tuned_from_model_id": null, "description": "GLM-4.6 is the latest version of Z.ai's flagship model, bringing significant improvements over GLM-4.5. Key features include: 200K token context window (expanded from 128K), superior coding performance with better real-world application in Claude Code/Cline/Roo Code/Kilo Code, advanced reasoning with tool use during inference, stronger agent capabilities, and refined writing aligned with human preferences. 
GLM-4.6 achieves competitive performance with DeepSeek-V3.2-Exp and Claude Sonnet 4, reaching near parity with Claude Sonnet 4 (48.6% win rate) on CC-Bench real-world coding tasks.", "release_date": "2025-09-30", "announcement_date": "2025-09-30", "license_id": "mit", "multimodal": true, "knowledge_cutoff": null, "param_count": 357000000000, "training_tokens": null, "available_in_zeroeval": true, "source_api_ref": "https://docs.z.ai/guides/llm/glm-4.6", "source_playground": "https://chat.z.ai", "source_paper": "https://arxiv.org/pdf/2508.06471", "source_scorecard_blog_link": "https://huggingface.co/zai-org/GLM-4.6", "source_repo_link": null, "source_weights_link": null, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-30T00:00:00.000000+00:00" } ================================================ FILE: data/organizations/zai-org/organization.json ================================================ { "organization_id": "zai-org", "name": "Zhipu AI", "website": "https://z.ai", "description": "Zhipu AI is a Chinese AI company that provides a suite of AI tools and services.", "country": "CN", "created_at": "2025-09-15T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00" } ================================================ FILE: data/providers/anthropic/models.json ================================================ [ { "model_provider_id": 398, "model_id": "claude-3-5-haiku-20241022", "provider_id": "anthropic", "deprecated_at": null, "input_cents_per_million_tokens": 100, "output_cents_per_million_tokens": 500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 100.0, "latency": 0.3, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, 
"output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.073101+00:00", "updated_at": "2025-07-19T19:49:17.073101+00:00", "provider_model_id_used": "claude-3-5-haiku-20241022", "model_name": "Claude 3.5 Haiku", "organization_id": "anthropic" }, { "model_provider_id": 397, "model_id": "claude-3-5-sonnet-20241022", "provider_id": "anthropic", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.071608+00:00", "updated_at": "2025-07-19T19:49:17.071608+00:00", "provider_model_id_used": "claude-3-5-sonnet-20241022", "model_name": "Claude 3.5 Sonnet", "organization_id": "anthropic" }, { "model_provider_id": 402, "model_id": "claude-3-7-sonnet-20250219", "provider_id": "anthropic", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, 
"output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.082450+00:00", "updated_at": "2025-07-19T19:49:17.082450+00:00", "provider_model_id_used": "claude-3-7-sonnet-20250219", "model_name": "Claude 3.7 Sonnet", "organization_id": "anthropic" }, { "model_provider_id": 401, "model_id": "claude-3-haiku-20240307", "provider_id": "anthropic", "deprecated_at": null, "input_cents_per_million_tokens": 25, "output_cents_per_million_tokens": 125, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.080579+00:00", "updated_at": "2025-07-19T19:49:17.080579+00:00", "provider_model_id_used": "claude-3-haiku-20240307", "model_name": "Claude 3 Haiku", "organization_id": "anthropic" }, { "model_provider_id": 399, "model_id": "claude-3-opus-20240229", "provider_id": "anthropic", "deprecated_at": null, "input_cents_per_million_tokens": 1500, "output_cents_per_million_tokens": 7500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, 
"output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.075485+00:00", "updated_at": "2025-07-19T19:49:17.075485+00:00", "provider_model_id_used": "claude-3-opus-20240229", "model_name": "Claude 3 Opus", "organization_id": "anthropic" }, { "model_provider_id": 400, "model_id": "claude-3-sonnet-20240229", "provider_id": "anthropic", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.078602+00:00", "updated_at": "2025-07-19T19:49:17.078602+00:00", "provider_model_id_used": "claude-3-sonnet-20240229", "model_name": "Claude 3 Sonnet", "organization_id": "anthropic" }, { "model_provider_id": 404, "model_id": "claude-opus-4-20250514", "provider_id": "anthropic", "deprecated_at": null, "input_cents_per_million_tokens": 1500, "output_cents_per_million_tokens": 7500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 128000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, 
"output_modality_video": false, "created_at": "2025-07-19T19:49:17.086661+00:00", "updated_at": "2025-07-19T19:49:17.086661+00:00", "provider_model_id_used": "claude-opus-4-20250514", "model_name": "Claude Opus 4", "organization_id": "anthropic" }, { "model_provider_id": 405, "model_id": "claude-opus-4-1-20250805", "provider_id": "anthropic", "deprecated_at": null, "input_cents_per_million_tokens": 1500, "output_cents_per_million_tokens": 7500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 32000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-08-05T00:00:00.000000+00:00", "updated_at": "2025-08-05T00:00:00.000000+00:00", "provider_model_id_used": "claude-opus-4-1-20250805", "model_name": "Claude Opus 4.1", "organization_id": "anthropic" }, { "model_provider_id": 403, "model_id": "claude-sonnet-4-20250514", "provider_id": "anthropic", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": 
"2025-07-19T19:49:17.084616+00:00", "updated_at": "2025-07-19T19:49:17.084616+00:00", "provider_model_id_used": "claude-sonnet-4-20250514", "model_name": "Claude Sonnet 4", "organization_id": "anthropic" }, { "model_provider_id": 406, "model_id": "claude-sonnet-4-5-20250929", "provider_id": "anthropic", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 64000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": true, "input_modality_video": true, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.084616+00:00", "updated_at": "2025-07-19T19:49:17.084616+00:00", "provider_model_id_used": "claude-sonnet-4-5-20250929", "model_name": "Claude Sonnet 4.5", "organization_id": "anthropic" }, { "model_provider_id": 407, "model_id": "claude-haiku-4-5-20251015", "provider_id": "anthropic", "deprecated_at": null, "input_cents_per_million_tokens": 100, "output_cents_per_million_tokens": 500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 100.0, "latency": 0.3, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-10-15T00:00:00.000000+00:00", 
"updated_at": "2025-10-15T00:00:00.000000+00:00", "provider_model_id_used": "claude-haiku-4-5-20251015", "model_name": "Claude Haiku 4.5", "organization_id": "anthropic" } ] ================================================ FILE: data/providers/anthropic/provider.json ================================================ { "provider_id": "anthropic", "name": "Anthropic", "website": "https://anthropic.com", "created_at": "2025-07-19T19:49:17.069874+00:00", "updated_at": "2025-07-19T19:49:17.069874+00:00" } ================================================ FILE: data/providers/azure/models.json ================================================ [ { "model_provider_id": 261, "model_id": "gpt-3.5-turbo-0125", "provider_id": "azure", "deprecated_at": null, "input_cents_per_million_tokens": 50, "output_cents_per_million_tokens": 150, "quantization": null, "max_input_tokens": 16385, "max_output_tokens": 4096, "throughput": 90.0, "latency": 0.8, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.759540+00:00", "updated_at": "2025-07-19T19:49:16.759540+00:00", "provider_model_id_used": "gpt-3.5-turbo-0125", "model_name": "GPT-3.5 Turbo", "organization_id": "openai" }, { "model_provider_id": 259, "model_id": "gpt-4-0613", "provider_id": "azure", "deprecated_at": null, "input_cents_per_million_tokens": 3000, "output_cents_per_million_tokens": 6000, "quantization": null, "max_input_tokens": 32768, "max_output_tokens": 32768, "throughput": 104.0, "latency": 0.3, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, 
"feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.751649+00:00", "updated_at": "2025-07-19T19:49:16.751649+00:00", "provider_model_id_used": "gpt-4-0613", "model_name": "GPT-4", "organization_id": "openai" }, { "model_provider_id": 264, "model_id": "gpt-4o-2024-05-13", "provider_id": "azure", "deprecated_at": null, "input_cents_per_million_tokens": 250, "output_cents_per_million_tokens": 1000, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 4096, "throughput": 92.0, "latency": 0.54, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.767540+00:00", "updated_at": "2025-07-19T19:49:16.767540+00:00", "provider_model_id_used": "gpt-4o-2024-05-13", "model_name": "GPT-4o", "organization_id": "openai" }, { "model_provider_id": 263, "model_id": "gpt-4o-2024-08-06", "provider_id": "azure", "deprecated_at": null, "input_cents_per_million_tokens": 250, "output_cents_per_million_tokens": 1000, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 16384, "throughput": 99.0, "latency": 0.53, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": 
true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.765163+00:00", "updated_at": "2025-07-19T19:49:16.765163+00:00", "provider_model_id_used": "gpt-4o-2024-08-06", "model_name": "GPT-4o", "organization_id": "openai" }, { "model_provider_id": 262, "model_id": "gpt-4o-mini-2024-07-18", "provider_id": "azure", "deprecated_at": null, "input_cents_per_million_tokens": 15, "output_cents_per_million_tokens": 60, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 16384, "throughput": 92.0, "latency": 0.52, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.762692+00:00", "updated_at": "2025-07-19T19:49:16.762692+00:00", "provider_model_id_used": "gpt-4o-mini-2024-07-18", "model_name": "GPT-4o mini", "organization_id": "openai" }, { "model_provider_id": 260, "model_id": "gpt-4-turbo-2024-04-09", "provider_id": "azure", "deprecated_at": null, "input_cents_per_million_tokens": 1000, "output_cents_per_million_tokens": 3000, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 4096, "throughput": 97.0, "latency": 0.6, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": 
false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.755438+00:00", "updated_at": "2025-07-19T19:49:16.755438+00:00", "provider_model_id_used": "gpt-4-turbo-2024-04-09", "model_name": "GPT-4 Turbo", "organization_id": "openai" }, { "model_provider_id": 266, "model_id": "o1-2024-12-17", "provider_id": "azure", "deprecated_at": null, "input_cents_per_million_tokens": 1500, "output_cents_per_million_tokens": 6000, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 100000, "throughput": 16.0, "latency": 0.54, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.772502+00:00", "updated_at": "2025-07-19T19:49:16.772502+00:00", "provider_model_id_used": "o1-2024-12-17", "model_name": "o1", "organization_id": "openai" }, { "model_provider_id": 267, "model_id": "o1-mini", "provider_id": "azure", "deprecated_at": null, "input_cents_per_million_tokens": 330, "output_cents_per_million_tokens": 1320, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 65536, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, 
"output_modality_video": false, "created_at": "2025-07-19T19:49:16.774395+00:00", "updated_at": "2025-07-19T19:49:16.774395+00:00", "provider_model_id_used": "o1-mini", "model_name": "o1-mini", "organization_id": "openai" }, { "model_provider_id": 265, "model_id": "o1-preview", "provider_id": "azure", "deprecated_at": null, "input_cents_per_million_tokens": 1650, "output_cents_per_million_tokens": 6600, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 32768, "throughput": 16.0, "latency": 0.54, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.770395+00:00", "updated_at": "2025-07-19T19:49:16.770395+00:00", "provider_model_id_used": "o1-preview", "model_name": "o1-preview", "organization_id": "openai" }, { "model_provider_id": 268, "model_id": "o3-mini", "provider_id": "azure", "deprecated_at": null, "input_cents_per_million_tokens": 110, "output_cents_per_million_tokens": 440, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 100000, "throughput": 115.0, "latency": 5.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.776480+00:00", "updated_at": "2025-07-19T19:49:16.776480+00:00", 
"provider_model_id_used": "o3-mini", "model_name": "o3-mini", "organization_id": "openai" }, { "model_provider_id": 269, "model_id": "phi-3.5-mini-instruct", "provider_id": "azure", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 10, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 23.0, "latency": 0.52, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.778852+00:00", "updated_at": "2025-07-19T19:49:16.778852+00:00", "provider_model_id_used": "phi-3.5-mini-instruct", "model_name": "Phi-3.5-mini-instruct", "organization_id": "microsoft" } ] ================================================ FILE: data/providers/azure/provider.json ================================================ { "provider_id": "azure", "name": "Azure", "website": "https://azure.microsoft.com", "created_at": "2025-07-19T19:49:16.749000+00:00", "updated_at": "2025-07-19T19:49:16.749000+00:00" } ================================================ FILE: data/providers/bedrock/models.json ================================================ [ { "model_provider_id": 369, "model_id": "claude-3-5-haiku-20241022", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 80, "output_cents_per_million_tokens": 400, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 104.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": 
true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.009862+00:00", "updated_at": "2025-07-19T19:49:17.009862+00:00", "provider_model_id_used": "claude-3-5-haiku-20241022", "model_name": "Claude 3.5 Haiku", "organization_id": "anthropic" }, { "model_provider_id": 368, "model_id": "claude-3-5-sonnet-20240620", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 101.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.007722+00:00", "updated_at": "2025-07-19T19:49:17.007722+00:00", "provider_model_id_used": "claude-3-5-sonnet-20240620", "model_name": "Claude 3.5 Sonnet", "organization_id": "anthropic" }, { "model_provider_id": 367, "model_id": "claude-3-5-sonnet-20241022", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 101.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, 
"input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.005765+00:00", "updated_at": "2025-07-19T19:49:17.005765+00:00", "provider_model_id_used": "claude-3-5-sonnet-20241022", "model_name": "Claude 3.5 Sonnet", "organization_id": "anthropic" }, { "model_provider_id": 385, "model_id": "claude-3-7-sonnet-20250219", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 128000, "throughput": 101.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.041625+00:00", "updated_at": "2025-07-19T19:49:17.041625+00:00", "provider_model_id_used": "claude-3-7-sonnet-20250219", "model_name": "Claude 3.7 Sonnet", "organization_id": "anthropic" }, { "model_provider_id": 372, "model_id": "claude-3-haiku-20240307", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 25, "output_cents_per_million_tokens": 125, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 104.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, 
"input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.016542+00:00", "updated_at": "2025-07-19T19:49:17.016542+00:00", "provider_model_id_used": "claude-3-haiku-20240307", "model_name": "Claude 3 Haiku", "organization_id": "anthropic" }, { "model_provider_id": 370, "model_id": "claude-3-opus-20240229", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 1500, "output_cents_per_million_tokens": 7500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 120.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.011523+00:00", "updated_at": "2025-07-19T19:49:17.011523+00:00", "provider_model_id_used": "claude-3-opus-20240229", "model_name": "Claude 3 Opus", "organization_id": "anthropic" }, { "model_provider_id": 371, "model_id": "claude-3-sonnet-20240229", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 120.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": 
false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.014573+00:00", "updated_at": "2025-07-19T19:49:17.014573+00:00", "provider_model_id_used": "claude-3-sonnet-20240229", "model_name": "Claude 3 Sonnet", "organization_id": "anthropic" }, { "model_provider_id": 387, "model_id": "claude-opus-4-20250514", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 1500, "output_cents_per_million_tokens": 7500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 128000, "throughput": 120.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.046935+00:00", "updated_at": "2025-07-19T19:49:17.046935+00:00", "provider_model_id_used": "claude-opus-4-20250514", "model_name": "Claude Opus 4", "organization_id": "anthropic" }, { "model_provider_id": 388, "model_id": "claude-opus-4-1-20250805", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 1500, "output_cents_per_million_tokens": 7500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 32000, "throughput": 120.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, 
"output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-08-05T00:00:00.000000+00:00", "updated_at": "2025-08-05T00:00:00.000000+00:00", "provider_model_id_used": "claude-opus-4-1-20250805", "model_name": "Claude Opus 4.1", "organization_id": "anthropic" }, { "model_provider_id": 386, "model_id": "claude-sonnet-4-20250514", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 128000, "throughput": 101.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.044184+00:00", "updated_at": "2025-07-19T19:49:17.044184+00:00", "provider_model_id_used": "claude-sonnet-4-20250514", "model_name": "Claude Sonnet 4", "organization_id": "anthropic" }, { "model_provider_id": 381, "model_id": "command-r-plus-04-2024", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, 
"output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.034365+00:00", "updated_at": "2025-07-19T19:49:17.034365+00:00", "provider_model_id_used": "command-r-plus-04-2024", "model_name": "Command R+", "organization_id": "cohere" }, { "model_provider_id": 374, "model_id": "jamba-1.5-large", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 200, "output_cents_per_million_tokens": 800, "quantization": null, "max_input_tokens": 256000, "max_output_tokens": 256000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.020432+00:00", "updated_at": "2025-07-19T19:49:17.020432+00:00", "provider_model_id_used": "jamba-1.5-large", "model_name": "Jamba 1.5 Large", "organization_id": "ai21" }, { "model_provider_id": 373, "model_id": "jamba-1.5-mini", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 20, "output_cents_per_million_tokens": 40, "quantization": null, "max_input_tokens": 256144, "max_output_tokens": 256144, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, 
"created_at": "2025-07-19T19:49:17.018357+00:00", "updated_at": "2025-07-19T19:49:17.018357+00:00", "provider_model_id_used": "jamba-1.5-mini", "model_name": "Jamba 1.5 Mini", "organization_id": "ai21" }, { "model_provider_id": 376, "model_id": "llama-3.1-405b-instruct", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 300, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.024314+00:00", "updated_at": "2025-07-19T19:49:17.024314+00:00", "provider_model_id_used": "llama-3.1-405b-instruct", "model_name": "Llama 3.1 405B Instruct", "organization_id": "meta" }, { "model_provider_id": 375, "model_id": "llama-3.1-70b-instruct", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 89, "output_cents_per_million_tokens": 89, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.022256+00:00", "updated_at": 
"2025-07-19T19:49:17.022256+00:00", "provider_model_id_used": "llama-3.1-70b-instruct", "model_name": "Llama 3.1 70B Instruct", "organization_id": "meta" }, { "model_provider_id": 377, "model_id": "llama-3.1-8b-instruct", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 22, "output_cents_per_million_tokens": 22, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.026582+00:00", "updated_at": "2025-07-19T19:49:17.026582+00:00", "provider_model_id_used": "llama-3.1-8b-instruct", "model_name": "Llama 3.1 8B Instruct", "organization_id": "meta" }, { "model_provider_id": 378, "model_id": "llama-3.2-11b-instruct", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 16, "output_cents_per_million_tokens": 16, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.028853+00:00", "updated_at": "2025-07-19T19:49:17.028853+00:00", "provider_model_id_used": 
"llama-3.2-11b-instruct", "model_name": "Llama 3.2 11B Instruct", "organization_id": "meta" }, { "model_provider_id": 379, "model_id": "llama-3.2-90b-instruct", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 72, "output_cents_per_million_tokens": 72, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.030727+00:00", "updated_at": "2025-07-19T19:49:17.030727+00:00", "provider_model_id_used": "llama-3.2-90b-instruct", "model_name": "Llama 3.2 90B Instruct", "organization_id": "meta" }, { "model_provider_id": 380, "model_id": "llama-3.3-70b-instruct", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 72, "output_cents_per_million_tokens": 72, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.032478+00:00", "updated_at": "2025-07-19T19:49:17.032478+00:00", "provider_model_id_used": "llama-3.3-70b-instruct", "model_name": "Llama 3.3 70B 
Instruct", "organization_id": "meta" }, { "model_provider_id": 383, "model_id": "nova-lite", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 6, "output_cents_per_million_tokens": 24, "quantization": null, "max_input_tokens": 300000, "max_output_tokens": 2048, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.037841+00:00", "updated_at": "2025-07-19T19:49:17.037841+00:00", "provider_model_id_used": "nova-lite", "model_name": "Nova Lite", "organization_id": "amazon" }, { "model_provider_id": 382, "model_id": "nova-micro", "provider_id": "bedrock", "deprecated_at": null, "input_cents_per_million_tokens": 3, "output_cents_per_million_tokens": 14, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.036065+00:00", "updated_at": "2025-07-19T19:49:17.036065+00:00", "provider_model_id_used": "nova-micro", "model_name": "Nova Micro", "organization_id": "amazon" }, { "model_provider_id": 384, "model_id": "nova-pro", "provider_id": "bedrock", "deprecated_at": 
null, "input_cents_per_million_tokens": 80, "output_cents_per_million_tokens": 320, "quantization": null, "max_input_tokens": 300000, "max_output_tokens": 300000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.039606+00:00", "updated_at": "2025-07-19T19:49:17.039606+00:00", "provider_model_id_used": "nova-pro", "model_name": "Nova Pro", "organization_id": "amazon" } ] ================================================ FILE: data/providers/bedrock/provider.json ================================================ { "provider_id": "bedrock", "name": "Bedrock", "website": "https://aws.amazon.com/bedrock/", "created_at": "2025-07-19T19:49:17.004009+00:00", "updated_at": "2025-07-19T19:49:17.004009+00:00" } ================================================ FILE: data/providers/cerebras/models.json ================================================ [ { "model_provider_id": 405, "model_id": "llama-3.1-70b-instruct", "provider_id": "cerebras", "deprecated_at": null, "input_cents_per_million_tokens": 60, "output_cents_per_million_tokens": 60, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 1204.0, "latency": 0.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, 
"output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.090362+00:00", "updated_at": "2025-07-19T19:49:17.090362+00:00", "provider_model_id_used": "llama-3.1-70b-instruct", "model_name": "Llama 3.1 70B Instruct", "organization_id": "meta" }, { "model_provider_id": 406, "model_id": "llama-3.1-8b-instruct", "provider_id": "cerebras", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 10, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 2047.0, "latency": 0.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.092709+00:00", "updated_at": "2025-07-19T19:49:17.092709+00:00", "provider_model_id_used": "llama-3.1-8b-instruct", "model_name": "Llama 3.1 8B Instruct", "organization_id": "meta" }, { "model_provider_id": 407, "model_id": "llama-3.3-70b-instruct", "provider_id": "cerebras", "deprecated_at": null, "input_cents_per_million_tokens": 70, "output_cents_per_million_tokens": 80, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 2220.0, "latency": 0.65, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": 
false, "created_at": "2025-07-19T19:49:17.095252+00:00", "updated_at": "2025-07-19T19:49:17.095252+00:00", "provider_model_id_used": "llama-3.3-70b-instruct", "model_name": "Llama 3.3 70B Instruct", "organization_id": "meta" } ] ================================================ FILE: data/providers/cerebras/provider.json ================================================ { "provider_id": "cerebras", "name": "Cerebras", "website": "https://cerebras.ai", "created_at": "2025-07-19T19:49:17.088130+00:00", "updated_at": "2025-07-19T19:49:17.088130+00:00" } ================================================ FILE: data/providers/cohere/models.json ================================================ [ { "model_provider_id": 238, "model_id": "command-r-plus-04-2024", "provider_id": "cohere", "deprecated_at": null, "input_cents_per_million_tokens": 25, "output_cents_per_million_tokens": 100, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 59.0, "latency": 0.65, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.693641+00:00", "updated_at": "2025-07-19T19:49:16.693641+00:00", "provider_model_id_used": "command-r-plus-04-2024", "model_name": "Command R+", "organization_id": "cohere" } ] ================================================ FILE: data/providers/cohere/provider.json ================================================ { "provider_id": "cohere", "name": "Cohere", "website": "https://cohere.ai", "created_at": "2025-07-19T19:49:16.663117+00:00", "updated_at": "2025-07-19T19:49:16.663117+00:00" } 
================================================ FILE: data/providers/deepinfra/models.json ================================================ [ { "model_provider_id": 290, "model_id": "deepseek-r1", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 85, "output_cents_per_million_tokens": 250, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 0.9, "latency": 0.3, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.830887+00:00", "updated_at": "2025-07-19T19:49:16.830887+00:00", "provider_model_id_used": "deepseek-r1", "model_name": "DeepSeek-R1", "organization_id": "deepseek" }, { "model_provider_id": 304, "model_id": "deepseek-r1-0528", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 50, "output_cents_per_million_tokens": 215, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 45.04, "latency": 0.61, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.862375+00:00", "updated_at": "2025-07-19T19:49:16.862375+00:00", "provider_model_id_used": "deepseek-r1-0528", "model_name": 
"DeepSeek-R1-0528", "organization_id": "deepseek" }, { "model_provider_id": 298, "model_id": "deepseek-r1-distill-llama-70b", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 40, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 37.0, "latency": 0.65, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.847437+00:00", "updated_at": "2025-07-19T19:49:16.847437+00:00", "provider_model_id_used": "deepseek-r1-distill-llama-70b", "model_name": "DeepSeek R1 Distill Llama 70B", "organization_id": "deepseek" }, { "model_provider_id": 299, "model_id": "deepseek-r1-distill-qwen-32b", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 12, "output_cents_per_million_tokens": 18, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 37.0, "latency": 0.65, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.849673+00:00", "updated_at": "2025-07-19T19:49:16.849673+00:00", "provider_model_id_used": "deepseek-r1-distill-qwen-32b", "model_name": "DeepSeek R1 Distill 
Qwen 32B", "organization_id": "deepseek" }, { "model_provider_id": 284, "model_id": "deepseek-v2.5", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 70, "output_cents_per_million_tokens": 140, "quantization": null, "max_input_tokens": 8192, "max_output_tokens": 8192, "throughput": 63.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.819006+00:00", "updated_at": "2025-07-19T19:49:16.819006+00:00", "provider_model_id_used": "deepseek-v2.5", "model_name": "DeepSeek-V2.5", "organization_id": "deepseek" }, { "model_provider_id": 305, "model_id": "deepseek-v3.1", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 27, "output_cents_per_million_tokens": 100, "quantization": "int4", "max_input_tokens": 163840, "max_output_tokens": 163840, "throughput": null, "latency": null, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-15T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "provider_model_id_used": "deepseek-ai/DeepSeek-V3.1", "model_name": "DeepSeek V3.1", "organization_id": "deepseek" }, { "model_provider_id": 306, "model_id": 
"glm-4.5", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 40, "output_cents_per_million_tokens": 160, "quantization": "fp8", "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": null, "latency": null, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": false, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-15T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "provider_model_id_used": "zai-org/GLM-4.5", "model_name": "GLM-4.5", "organization_id": "zai-org" }, { "model_provider_id": 307, "model_id": "gpt-oss-120b", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 9, "output_cents_per_million_tokens": 45, "quantization": "int4", "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": null, "latency": null, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-15T00:00:00.000000+00:00", "updated_at": "2025-09-15T00:00:00.000000+00:00", "provider_model_id_used": "openai/gpt-oss-120b", "model_name": "GPT-OSS-120B", "organization_id": "openai" }, { "model_provider_id": 294, "model_id": "gemma-3-12b-it", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 5, 
"output_cents_per_million_tokens": 10, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 33.0, "latency": 0.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.839147+00:00", "updated_at": "2025-07-19T19:49:16.839147+00:00", "provider_model_id_used": "gemma-3-12b-it", "model_name": "Gemma 3 12B", "organization_id": "google" }, { "model_provider_id": 295, "model_id": "gemma-3-27b-it", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 20, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 33.0, "latency": 0.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.841300+00:00", "updated_at": "2025-07-19T19:49:16.841300+00:00", "provider_model_id_used": "gemma-3-27b-it", "model_name": "Gemma 3 27B", "organization_id": "google" }, { "model_provider_id": 293, "model_id": "gemma-3-4b-it", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 2, "output_cents_per_million_tokens": 4, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 
131072, "throughput": 33.0, "latency": 0.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.837297+00:00", "updated_at": "2025-07-19T19:49:16.837297+00:00", "provider_model_id_used": "gemma-3-4b-it", "model_name": "Gemma 3 4B", "organization_id": "google" }, { "model_provider_id": 281, "model_id": "llama-3.1-405b-instruct", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 179, "output_cents_per_million_tokens": 179, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 27.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.812645+00:00", "updated_at": "2025-07-19T19:49:16.812645+00:00", "provider_model_id_used": "llama-3.1-405b-instruct", "model_name": "Llama 3.1 405B Instruct", "organization_id": "meta" }, { "model_provider_id": 279, "model_id": "llama-3.1-70b-instruct", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 35, "output_cents_per_million_tokens": 40, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 25.0, "latency": 0.5, "feature_web_search": false, 
"feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.808506+00:00", "updated_at": "2025-07-19T19:49:16.808506+00:00", "provider_model_id_used": "llama-3.1-70b-instruct", "model_name": "Llama 3.1 70B Instruct", "organization_id": "meta" }, { "model_provider_id": 280, "model_id": "llama-3.1-8b-instruct", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 5, "output_cents_per_million_tokens": 5, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 118.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.810724+00:00", "updated_at": "2025-07-19T19:49:16.810724+00:00", "provider_model_id_used": "llama-3.1-8b-instruct", "model_name": "Llama 3.1 8B Instruct", "organization_id": "meta" }, { "model_provider_id": 283, "model_id": "llama-3.2-11b-instruct", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 5, "output_cents_per_million_tokens": 5, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 108.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": 
true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.817103+00:00", "updated_at": "2025-07-19T19:49:16.817103+00:00", "provider_model_id_used": "llama-3.2-11b-instruct", "model_name": "Llama 3.2 11B Instruct", "organization_id": "meta" }, { "model_provider_id": 289, "model_id": "llama-3.2-3b-instruct", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 1, "output_cents_per_million_tokens": 2, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 171.5, "latency": 0.24, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.828875+00:00", "updated_at": "2025-07-19T19:49:16.828875+00:00", "provider_model_id_used": "llama-3.2-3b-instruct", "model_name": "Llama 3.2 3B Instruct", "organization_id": "meta" }, { "model_provider_id": 282, "model_id": "llama-3.2-90b-instruct", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 35, "output_cents_per_million_tokens": 40, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 24.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, 
"feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.814472+00:00", "updated_at": "2025-07-19T19:49:16.814472+00:00", "provider_model_id_used": "llama-3.2-90b-instruct", "model_name": "Llama 3.2 90B Instruct", "organization_id": "meta" }, { "model_provider_id": 288, "model_id": "llama-3.3-70b-instruct", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 23, "output_cents_per_million_tokens": 40, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 37.0, "latency": 0.65, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.827019+00:00", "updated_at": "2025-07-19T19:49:16.827019+00:00", "provider_model_id_used": "llama-3.3-70b-instruct", "model_name": "Llama 3.3 70B Instruct", "organization_id": "meta" }, { "model_provider_id": 296, "model_id": "llama-4-maverick", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 17, "output_cents_per_million_tokens": 60, "quantization": null, "max_input_tokens": 1000000, "max_output_tokens": 1000000, "throughput": 83.59, "latency": 0.38, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": 
false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.843444+00:00", "updated_at": "2025-07-19T19:49:16.843444+00:00", "provider_model_id_used": "llama-4-maverick", "model_name": "Llama 4 Maverick", "organization_id": "meta" }, { "model_provider_id": 297, "model_id": "llama-4-scout", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 8, "output_cents_per_million_tokens": 30, "quantization": null, "max_input_tokens": 10000000, "max_output_tokens": 10000000, "throughput": 76.1, "latency": 0.31, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.845085+00:00", "updated_at": "2025-07-19T19:49:16.845085+00:00", "provider_model_id_used": "llama-4-scout", "model_name": "Llama 4 Scout", "organization_id": "meta" }, { "model_provider_id": 291, "model_id": "mistral-small-24b-instruct-2501", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 7, "output_cents_per_million_tokens": 14, "quantization": null, "max_input_tokens": 32000, "max_output_tokens": 32000, "throughput": 49.0, "latency": 0.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, 
"input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.832954+00:00", "updated_at": "2025-07-19T19:49:16.832954+00:00", "provider_model_id_used": "mistral-small-24b-instruct-2501", "model_name": "Mistral Small 3 24B Instruct", "organization_id": "mistral" }, { "model_provider_id": 292, "model_id": "phi-4", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 7, "output_cents_per_million_tokens": 14, "quantization": null, "max_input_tokens": 16000, "max_output_tokens": 16000, "throughput": 33.0, "latency": 0.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.835314+00:00", "updated_at": "2025-07-19T19:49:16.835314+00:00", "provider_model_id_used": "phi-4", "model_name": "Phi 4", "organization_id": "microsoft" }, { "model_provider_id": 300, "model_id": "phi-4-multimodal-instruct", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 5, "output_cents_per_million_tokens": 10, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 25.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": 
true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.852868+00:00", "updated_at": "2025-07-19T19:49:16.852868+00:00", "provider_model_id_used": "phi-4-multimodal-instruct", "model_name": "Phi-4-multimodal-instruct", "organization_id": "microsoft" }, { "model_provider_id": 286, "model_id": "qwen-2.5-72b-instruct", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 35, "output_cents_per_million_tokens": 40, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 8192, "throughput": 10.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.822329+00:00", "updated_at": "2025-07-19T19:49:16.822329+00:00", "provider_model_id_used": "qwen-2.5-72b-instruct", "model_name": "Qwen2.5 72B Instruct", "organization_id": "qwen" }, { "model_provider_id": 285, "model_id": "qwen-2.5-coder-32b-instruct", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 18, "output_cents_per_million_tokens": 18, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 44.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, 
"output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.820492+00:00", "updated_at": "2025-07-19T19:49:16.820492+00:00", "provider_model_id_used": "qwen-2.5-coder-32b-instruct", "model_name": "Qwen2.5-Coder 32B Instruct", "organization_id": "qwen" }, { "model_provider_id": 301, "model_id": "qwen3-235b-a22b", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 20, "output_cents_per_million_tokens": 60, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 21.74, "latency": 1.23, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.855452+00:00", "updated_at": "2025-07-19T19:49:16.855452+00:00", "provider_model_id_used": "qwen3-235b-a22b", "model_name": "Qwen3 235B A22B", "organization_id": "qwen" }, { "model_provider_id": 303, "model_id": "qwen3-30b-a3b", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 30, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 82.57, "latency": 0.84, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, 
"created_at": "2025-07-19T19:49:16.859780+00:00", "updated_at": "2025-07-19T19:49:16.859780+00:00", "provider_model_id_used": "qwen3-30b-a3b", "model_name": "Qwen3 30B A3B", "organization_id": "qwen" }, { "model_provider_id": 302, "model_id": "qwen3-32b", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 30, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 26.95, "latency": 1.19, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.857468+00:00", "updated_at": "2025-07-19T19:49:16.857468+00:00", "provider_model_id_used": "qwen3-32b", "model_name": "Qwen3 32B", "organization_id": "qwen" }, { "model_provider_id": 287, "model_id": "qwq-32b-preview", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 15, "output_cents_per_million_tokens": 60, "quantization": null, "max_input_tokens": 32768, "max_output_tokens": 32768, "throughput": 76.04, "latency": 0.44, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.825039+00:00", "updated_at": "2025-07-19T19:49:16.825039+00:00", 
"provider_model_id_used": "qwq-32b-preview", "model_name": "QwQ-32B-Preview", "organization_id": "qwen" }, { "model_provider_id": 288, "model_id": "glm-4.6", "provider_id": "deepinfra", "deprecated_at": null, "input_cents_per_million_tokens": 60, "output_cents_per_million_tokens": 200, "quantization": "fp8", "max_input_tokens": 131072, "max_output_tokens": 65536, "throughput": 85.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": true, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-30T00:00:00.000000+00:00", "updated_at": "2025-09-30T00:00:00.000000+00:00", "provider_model_id_used": "zai-org/GLM-4.6", "model_name": "GLM-4.6", "organization_id": "zai-org" } ] ================================================ FILE: data/providers/deepinfra/provider.json ================================================ { "provider_id": "deepinfra", "name": "DeepInfra", "website": "https://deepinfra.com/", "created_at": "2025-07-19T19:49:16.806529+00:00", "updated_at": "2025-07-19T19:49:16.806529+00:00" } ================================================ FILE: data/providers/deepseek/models.json ================================================ [ { "model_provider_id": 361, "model_id": "deepseek-r1", "provider_id": "deepseek", "deprecated_at": null, "input_cents_per_million_tokens": 55, "output_cents_per_million_tokens": 219, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 9.0, "latency": 0.3, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, 
"feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.991378+00:00", "updated_at": "2025-07-19T19:49:16.991378+00:00", "provider_model_id_used": "deepseek-r1", "model_name": "DeepSeek-R1", "organization_id": "deepseek" }, { "model_provider_id": 362, "model_id": "deepseek-r1-0528", "provider_id": "deepseek", "deprecated_at": null, "input_cents_per_million_tokens": 55, "output_cents_per_million_tokens": 219, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 9.0, "latency": 0.3, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.993656+00:00", "updated_at": "2025-07-19T19:49:16.993656+00:00", "provider_model_id_used": "deepseek-r1-0528", "model_name": "DeepSeek-R1-0528", "organization_id": "deepseek" }, { "model_provider_id": 359, "model_id": "deepseek-v2.5", "provider_id": "deepseek", "deprecated_at": null, "input_cents_per_million_tokens": 14, "output_cents_per_million_tokens": 28, "quantization": null, "max_input_tokens": 8192, "max_output_tokens": 8192, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, 
"input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.987664+00:00", "updated_at": "2025-07-19T19:49:16.987664+00:00", "provider_model_id_used": "deepseek-v2.5", "model_name": "DeepSeek-V2.5", "organization_id": "deepseek" }, { "model_provider_id": 360, "model_id": "deepseek-v3", "provider_id": "deepseek", "deprecated_at": null, "input_cents_per_million_tokens": 27, "output_cents_per_million_tokens": 110, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.989355+00:00", "updated_at": "2025-07-19T19:49:16.989355+00:00", "provider_model_id_used": "deepseek-v3", "model_name": "DeepSeek-V3", "organization_id": "deepseek" } ] ================================================ FILE: data/providers/deepseek/provider.json ================================================ { "provider_id": "deepseek", "name": "DeepSeek", "website": "https://deepseek.com/", "created_at": "2025-07-19T19:49:16.986078+00:00", "updated_at": "2025-07-19T19:49:16.986078+00:00" } ================================================ FILE: data/providers/fireworks/models.json ================================================ [ { "model_provider_id": 340, "model_id": "deepseek-r1", "provider_id": "fireworks", "deprecated_at": null, "input_cents_per_million_tokens": 800, "output_cents_per_million_tokens": 800, 
"quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 2.1, "latency": 0.3, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.942224+00:00", "updated_at": "2025-07-19T19:49:16.942224+00:00", "provider_model_id_used": "deepseek-r1", "model_name": "DeepSeek-R1", "organization_id": "deepseek" }, { "model_provider_id": 331, "model_id": "llama-3.1-405b-instruct", "provider_id": "fireworks", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 300, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 78.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.923810+00:00", "updated_at": "2025-07-19T19:49:16.923810+00:00", "provider_model_id_used": "llama-3.1-405b-instruct", "model_name": "Llama 3.1 405B Instruct", "organization_id": "meta" }, { "model_provider_id": 332, "model_id": "llama-3.1-70b-instruct", "provider_id": "fireworks", "deprecated_at": null, "input_cents_per_million_tokens": 89, "output_cents_per_million_tokens": 89, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 
128000, "throughput": 32.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.926263+00:00", "updated_at": "2025-07-19T19:49:16.926263+00:00", "provider_model_id_used": "llama-3.1-70b-instruct", "model_name": "Llama 3.1 70B Instruct", "organization_id": "meta" }, { "model_provider_id": 333, "model_id": "llama-3.1-8b-instruct", "provider_id": "fireworks", "deprecated_at": null, "input_cents_per_million_tokens": 20, "output_cents_per_million_tokens": 20, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 292.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.928500+00:00", "updated_at": "2025-07-19T19:49:16.928500+00:00", "provider_model_id_used": "llama-3.1-8b-instruct", "model_name": "Llama 3.1 8B Instruct", "organization_id": "meta" }, { "model_provider_id": 335, "model_id": "llama-3.2-11b-instruct", "provider_id": "fireworks", "deprecated_at": null, "input_cents_per_million_tokens": 20, "output_cents_per_million_tokens": 20, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 125.0, "latency": 0.2, 
"feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.932316+00:00", "updated_at": "2025-07-19T19:49:16.932316+00:00", "provider_model_id_used": "llama-3.2-11b-instruct", "model_name": "Llama 3.2 11B Instruct", "organization_id": "meta" }, { "model_provider_id": 334, "model_id": "llama-3.2-90b-instruct", "provider_id": "fireworks", "deprecated_at": null, "input_cents_per_million_tokens": 89, "output_cents_per_million_tokens": 89, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 50.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.930486+00:00", "updated_at": "2025-07-19T19:49:16.930486+00:00", "provider_model_id_used": "llama-3.2-90b-instruct", "model_name": "Llama 3.2 90B Instruct", "organization_id": "meta" }, { "model_provider_id": 339, "model_id": "llama-3.3-70b-instruct", "provider_id": "fireworks", "deprecated_at": null, "input_cents_per_million_tokens": 89, "output_cents_per_million_tokens": 89, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 197.0, "latency": 0.65, "feature_web_search": false, "feature_function_calling": 
true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.939993+00:00", "updated_at": "2025-07-19T19:49:16.939993+00:00", "provider_model_id_used": "llama-3.3-70b-instruct", "model_name": "Llama 3.3 70B Instruct", "organization_id": "meta" }, { "model_provider_id": 341, "model_id": "llama-4-maverick", "provider_id": "fireworks", "deprecated_at": null, "input_cents_per_million_tokens": 22, "output_cents_per_million_tokens": 88, "quantization": null, "max_input_tokens": 1000000, "max_output_tokens": 1000000, "throughput": 63.03, "latency": 0.62, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.944370+00:00", "updated_at": "2025-07-19T19:49:16.944370+00:00", "provider_model_id_used": "llama-4-maverick", "model_name": "Llama 4 Maverick", "organization_id": "meta" }, { "model_provider_id": 342, "model_id": "llama-4-scout", "provider_id": "fireworks", "deprecated_at": null, "input_cents_per_million_tokens": 15, "output_cents_per_million_tokens": 60, "quantization": null, "max_input_tokens": 10000000, "max_output_tokens": 10000000, "throughput": 116.1, "latency": 0.53, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, 
"feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.946725+00:00", "updated_at": "2025-07-19T19:49:16.946725+00:00", "provider_model_id_used": "llama-4-scout", "model_name": "Llama 4 Scout", "organization_id": "meta" }, { "model_provider_id": 337, "model_id": "qwen-2.5-72b-instruct", "provider_id": "fireworks", "deprecated_at": null, "input_cents_per_million_tokens": 89, "output_cents_per_million_tokens": 89, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 8192, "throughput": 59.0, "latency": 0.37, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.936092+00:00", "updated_at": "2025-07-19T19:49:16.936092+00:00", "provider_model_id_used": "qwen-2.5-72b-instruct", "model_name": "Qwen2.5 72B Instruct", "organization_id": "qwen" }, { "model_provider_id": 336, "model_id": "qwen-2.5-coder-32b-instruct", "provider_id": "fireworks", "deprecated_at": null, "input_cents_per_million_tokens": 89, "output_cents_per_million_tokens": 89, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 110.0, "latency": 0.26, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, 
"input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.934183+00:00", "updated_at": "2025-07-19T19:49:16.934183+00:00", "provider_model_id_used": "qwen-2.5-coder-32b-instruct", "model_name": "Qwen2.5-Coder 32B Instruct", "organization_id": "qwen" }, { "model_provider_id": 343, "model_id": "qwen3-235b-a22b", "provider_id": "fireworks", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 10, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 68.17, "latency": 0.78, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.949833+00:00", "updated_at": "2025-07-19T19:49:16.949833+00:00", "provider_model_id_used": "qwen3-235b-a22b", "model_name": "Qwen3 235B A22B", "organization_id": "qwen" }, { "model_provider_id": 344, "model_id": "qwen3-30b-a3b", "provider_id": "fireworks", "deprecated_at": null, "input_cents_per_million_tokens": 89, "output_cents_per_million_tokens": 89, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 122.4, "latency": 0.66, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, 
"input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.951886+00:00", "updated_at": "2025-07-19T19:49:16.951886+00:00", "provider_model_id_used": "qwen3-30b-a3b", "model_name": "Qwen3 30B A3B", "organization_id": "qwen" }, { "model_provider_id": 338, "model_id": "qwq-32b-preview", "provider_id": "fireworks", "deprecated_at": null, "input_cents_per_million_tokens": 89, "output_cents_per_million_tokens": 89, "quantization": null, "max_input_tokens": 32768, "max_output_tokens": 32768, "throughput": 99.15, "latency": 0.53, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.937841+00:00", "updated_at": "2025-07-19T19:49:16.937841+00:00", "provider_model_id_used": "qwq-32b-preview", "model_name": "QwQ-32B-Preview", "organization_id": "qwen" } ] ================================================ FILE: data/providers/fireworks/provider.json ================================================ { "provider_id": "fireworks", "name": "Fireworks", "website": "https://fireworks.ai/", "created_at": "2025-07-19T19:49:16.921865+00:00", "updated_at": "2025-07-19T19:49:16.921865+00:00" } ================================================ FILE: data/providers/google/models.json ================================================ [ { "model_provider_id": 318, "model_id": "claude-3-5-haiku-20241022", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 80, 
"output_cents_per_million_tokens": 400, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.896052+00:00", "updated_at": "2025-07-19T19:49:16.896052+00:00", "provider_model_id_used": "claude-3-5-haiku-20241022", "model_name": "Claude 3.5 Haiku", "organization_id": "anthropic" }, { "model_provider_id": 320, "model_id": "claude-3-5-sonnet-20240620", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.900161+00:00", "updated_at": "2025-07-19T19:49:16.900161+00:00", "provider_model_id_used": "claude-3-5-sonnet-20240620", "model_name": "Claude 3.5 Sonnet", "organization_id": "anthropic" }, { "model_provider_id": 319, "model_id": "claude-3-5-sonnet-20241022", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, 
"quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.898073+00:00", "updated_at": "2025-07-19T19:49:16.898073+00:00", "provider_model_id_used": "claude-3-5-sonnet-20241022", "model_name": "Claude 3.5 Sonnet", "organization_id": "anthropic" }, { "model_provider_id": 327, "model_id": "claude-3-7-sonnet-20250219", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.914565+00:00", "updated_at": "2025-07-19T19:49:16.914565+00:00", "provider_model_id_used": "claude-3-7-sonnet-20250219", "model_name": "Claude 3.7 Sonnet", "organization_id": "anthropic" }, { "model_provider_id": 328, "model_id": "claude-3-haiku-20240307", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 25, "output_cents_per_million_tokens": 125, "quantization": null, "max_input_tokens": 
200000, "max_output_tokens": 200000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.916491+00:00", "updated_at": "2025-07-19T19:49:16.916491+00:00", "provider_model_id_used": "claude-3-haiku-20240307", "model_name": "Claude 3 Haiku", "organization_id": "anthropic" }, { "model_provider_id": 322, "model_id": "claude-3-opus-20240229", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 1500, "output_cents_per_million_tokens": 7500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.903705+00:00", "updated_at": "2025-07-19T19:49:16.903705+00:00", "provider_model_id_used": "claude-3-opus-20240229", "model_name": "Claude 3 Opus", "organization_id": "anthropic" }, { "model_provider_id": 321, "model_id": "claude-3-sonnet-20240229", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 200000, "throughput": 42.0, 
"latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.902100+00:00", "updated_at": "2025-07-19T19:49:16.902100+00:00", "provider_model_id_used": "claude-3-sonnet-20240229", "model_name": "Claude 3 Sonnet", "organization_id": "anthropic" }, { "model_provider_id": 330, "model_id": "claude-opus-4-20250514", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 1500, "output_cents_per_million_tokens": 7500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.920504+00:00", "updated_at": "2025-07-19T19:49:16.920504+00:00", "provider_model_id_used": "claude-opus-4-20250514", "model_name": "Claude Opus 4", "organization_id": "anthropic" }, { "model_provider_id": 331, "model_id": "claude-opus-4-1-20250805", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 1500, "output_cents_per_million_tokens": 7500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 32000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, 
"feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-08-05T00:00:00.000000+00:00", "updated_at": "2025-08-05T00:00:00.000000+00:00", "provider_model_id_used": "claude-opus-4-1-20250805", "model_name": "Claude Opus 4.1", "organization_id": "anthropic" }, { "model_provider_id": 329, "model_id": "claude-sonnet-4-20250514", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.918456+00:00", "updated_at": "2025-07-19T19:49:16.918456+00:00", "provider_model_id_used": "claude-sonnet-4-20250514", "model_name": "Claude Sonnet 4", "organization_id": "anthropic" }, { "model_provider_id": 312, "model_id": "gemini-1.0-pro", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 50, "output_cents_per_million_tokens": 150, "quantization": null, "max_input_tokens": 32760, "max_output_tokens": 8192, "throughput": 120.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, 
"feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.882424+00:00", "updated_at": "2025-07-19T19:49:16.882424+00:00", "provider_model_id_used": "gemini-1.0-pro", "model_name": "Gemini 1.0 Pro", "organization_id": "google" }, { "model_provider_id": 313, "model_id": "gemini-1.5-flash", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 15, "output_cents_per_million_tokens": 60, "quantization": null, "max_input_tokens": 1048576, "max_output_tokens": 8192, "throughput": 150.0, "latency": 0.3, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.885387+00:00", "updated_at": "2025-07-19T19:49:16.885387+00:00", "provider_model_id_used": "gemini-1.5-flash", "model_name": "Gemini 1.5 Flash", "organization_id": "google" }, { "model_provider_id": 314, "model_id": "gemini-1.5-flash-8b", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 7, "output_cents_per_million_tokens": 30, "quantization": null, "max_input_tokens": 1048576, "max_output_tokens": 8192, "throughput": 150.0, "latency": 0.3, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, 
"input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.887626+00:00", "updated_at": "2025-07-19T19:49:16.887626+00:00", "provider_model_id_used": "gemini-1.5-flash-8b", "model_name": "Gemini 1.5 Flash 8B", "organization_id": "google" }, { "model_provider_id": 311, "model_id": "gemini-1.5-pro", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 250, "output_cents_per_million_tokens": 1000, "quantization": null, "max_input_tokens": 2097152, "max_output_tokens": 8192, "throughput": 85.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.880526+00:00", "updated_at": "2025-07-19T19:49:16.880526+00:00", "provider_model_id_used": "gemini-1.5-pro", "model_name": "Gemini 1.5 Pro", "organization_id": "google" }, { "model_provider_id": 310, "model_id": "gemini-2.0-flash", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 40, "quantization": null, "max_input_tokens": 1048576, "max_output_tokens": 8192, "throughput": 183.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, 
"input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.878419+00:00", "updated_at": "2025-07-19T19:49:16.878419+00:00", "provider_model_id_used": "gemini-2.0-flash", "model_name": "Gemini 2.0 Flash", "organization_id": "google" }, { "model_provider_id": 309, "model_id": "gemini-2.0-flash-lite", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 7, "output_cents_per_million_tokens": 30, "quantization": null, "max_input_tokens": 1048576, "max_output_tokens": 8192, "throughput": 85.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.876262+00:00", "updated_at": "2025-07-19T19:49:16.876262+00:00", "provider_model_id_used": "gemini-2.0-flash-lite", "model_name": "Gemini 2.0 Flash-Lite", "organization_id": "google" }, { "model_provider_id": 306, "model_id": "gemini-2.5-flash", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 30, "output_cents_per_million_tokens": 250, "quantization": null, "max_input_tokens": 1048576, "max_output_tokens": 65536, "throughput": 85.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, 
"output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.868859+00:00", "updated_at": "2025-07-19T19:49:16.868859+00:00", "provider_model_id_used": "gemini-2.5-flash", "model_name": "Gemini 2.5 Flash", "organization_id": "google" }, { "model_provider_id": 305, "model_id": "gemini-2.5-flash-lite", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 40, "quantization": null, "max_input_tokens": 1048576, "max_output_tokens": 65536, "throughput": 5.69, "latency": 0.44, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.866570+00:00", "updated_at": "2025-07-19T19:49:16.866570+00:00", "provider_model_id_used": "gemini-2.5-flash-lite", "model_name": "Gemini 2.5 Flash-Lite", "organization_id": "google" }, { "model_provider_id": 307, "model_id": "gemini-2.5-pro", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 125, "output_cents_per_million_tokens": 1000, "quantization": null, "max_input_tokens": 1048576, "max_output_tokens": 65536, "throughput": 85.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, 
"output_modality_video": false, "created_at": "2025-07-19T19:49:16.871063+00:00", "updated_at": "2025-07-19T19:49:16.871063+00:00", "provider_model_id_used": "gemini-2.5-pro", "model_name": "Gemini 2.5 Pro", "organization_id": "google" }, { "model_provider_id": 308, "model_id": "gemini-2.5-pro-preview-06-05", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 125, "output_cents_per_million_tokens": 1000, "quantization": null, "max_input_tokens": 1048576, "max_output_tokens": 65535, "throughput": 85.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.873667+00:00", "updated_at": "2025-07-19T19:49:16.873667+00:00", "provider_model_id_used": "gemini-2.5-pro-preview-06-05", "model_name": "Gemini 2.5 Pro Preview 06-05", "organization_id": "google" }, { "model_provider_id": 316, "model_id": "jamba-1.5-large", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 200, "output_cents_per_million_tokens": 800, "quantization": null, "max_input_tokens": 256000, "max_output_tokens": 256000, "throughput": 42.0, "latency": 0.3, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": 
"2025-07-19T19:49:16.891518+00:00", "updated_at": "2025-07-19T19:49:16.891518+00:00", "provider_model_id_used": "jamba-1.5-large", "model_name": "Jamba 1.5 Large", "organization_id": "ai21" }, { "model_provider_id": 317, "model_id": "jamba-1.5-mini", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 20, "output_cents_per_million_tokens": 40, "quantization": null, "max_input_tokens": 256144, "max_output_tokens": 256144, "throughput": 100.0, "latency": 0.3, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.893779+00:00", "updated_at": "2025-07-19T19:49:16.893779+00:00", "provider_model_id_used": "jamba-1.5-mini", "model_name": "Jamba 1.5 Mini", "organization_id": "ai21" }, { "model_provider_id": 323, "model_id": "llama-3.1-405b-instruct", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 500, "output_cents_per_million_tokens": 1600, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.905332+00:00", "updated_at": "2025-07-19T19:49:16.905332+00:00", 
"provider_model_id_used": "llama-3.1-405b-instruct", "model_name": "Llama 3.1 405B Instruct", "organization_id": "meta" }, { "model_provider_id": 324, "model_id": "mistral-large-2-2407", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 200, "output_cents_per_million_tokens": 600, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.907260+00:00", "updated_at": "2025-07-19T19:49:16.907260+00:00", "provider_model_id_used": "mistral-large-2-2407", "model_name": "Mistral Large 2", "organization_id": "mistral" }, { "model_provider_id": 325, "model_id": "mistral-nemo-instruct-2407", "provider_id": "google", "deprecated_at": null, "input_cents_per_million_tokens": 15, "output_cents_per_million_tokens": 15, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.909863+00:00", "updated_at": "2025-07-19T19:49:16.909863+00:00", "provider_model_id_used": "mistral-nemo-instruct-2407", 
"model_name": "Mistral NeMo Instruct", "organization_id": "mistral" } ] ================================================ FILE: data/providers/google/provider.json ================================================ { "provider_id": "google", "name": "Google", "website": "https://ai.google.dev", "created_at": "2025-07-19T19:49:16.864633+00:00", "updated_at": "2025-07-19T19:49:16.864633+00:00" } ================================================ FILE: data/providers/groq/models.json ================================================ [ { "model_provider_id": 345, "model_id": "llama-3.1-70b-instruct", "provider_id": "groq", "deprecated_at": null, "input_cents_per_million_tokens": 59, "output_cents_per_million_tokens": 78, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 250.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.955618+00:00", "updated_at": "2025-07-19T19:49:16.955618+00:00", "provider_model_id_used": "llama-3.1-70b-instruct", "model_name": "Llama 3.1 70B Instruct", "organization_id": "meta" }, { "model_provider_id": 346, "model_id": "llama-3.1-8b-instruct", "provider_id": "groq", "deprecated_at": null, "input_cents_per_million_tokens": 5, "output_cents_per_million_tokens": 8, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 750.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, 
"input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.957463+00:00", "updated_at": "2025-07-19T19:49:16.957463+00:00", "provider_model_id_used": "llama-3.1-8b-instruct", "model_name": "Llama 3.1 8B Instruct", "organization_id": "meta" }, { "model_provider_id": 347, "model_id": "llama-3.2-11b-instruct", "provider_id": "groq", "deprecated_at": null, "input_cents_per_million_tokens": 18, "output_cents_per_million_tokens": 18, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.959974+00:00", "updated_at": "2025-07-19T19:49:16.959974+00:00", "provider_model_id_used": "llama-3.2-11b-instruct", "model_name": "Llama 3.2 11B Instruct", "organization_id": "meta" }, { "model_provider_id": 348, "model_id": "llama-3.3-70b-instruct", "provider_id": "groq", "deprecated_at": null, "input_cents_per_million_tokens": 59, "output_cents_per_million_tokens": 790, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 268.0, "latency": 0.65, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, 
"input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.962122+00:00", "updated_at": "2025-07-19T19:49:16.962122+00:00", "provider_model_id_used": "llama-3.3-70b-instruct", "model_name": "Llama 3.3 70B Instruct", "organization_id": "meta" }, { "model_provider_id": 349, "model_id": "llama-4-maverick", "provider_id": "groq", "deprecated_at": null, "input_cents_per_million_tokens": 20, "output_cents_per_million_tokens": 60, "quantization": null, "max_input_tokens": 1000000, "max_output_tokens": 1000000, "throughput": 307.3, "latency": 0.27, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.963701+00:00", "updated_at": "2025-07-19T19:49:16.963701+00:00", "provider_model_id_used": "llama-4-maverick", "model_name": "Llama 4 Maverick", "organization_id": "meta" }, { "model_provider_id": 350, "model_id": "llama-4-scout", "provider_id": "groq", "deprecated_at": null, "input_cents_per_million_tokens": 11, "output_cents_per_million_tokens": 34, "quantization": null, "max_input_tokens": 10000000, "max_output_tokens": 10000000, "throughput": 776.1, "latency": 1.08, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": 
true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.965756+00:00", "updated_at": "2025-07-19T19:49:16.965756+00:00", "provider_model_id_used": "llama-4-scout", "model_name": "Llama 4 Scout", "organization_id": "meta" }, { "model_provider_id": 1231, "model_id": "gpt-oss-120b", "provider_id": "groq", "deprecated_at": null, "input_cents_per_million_tokens": 15, "output_cents_per_million_tokens": 60, "quantization": null, "max_input_tokens": 131000, "max_output_tokens": 30000, "throughput": 500, "latency": 0.5, "feature_web_search": true, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": true, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-08-05T19:49:16.965756+00:00", "updated_at": "2025-08-05T19:49:16.965756+00:00", "provider_model_id_used": "gpt-oss-120b", "model_name": "OpenAI OSS 120B", "organization_id": "openai" }, { "model_provider_id": 1232, "model_id": "gpt-oss-20b", "provider_id": "groq", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 50, "quantization": null, "max_input_tokens": 131000, "max_output_tokens": 30000, "throughput": 1000, "latency": 0.38, "feature_web_search": true, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": true, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": 
"2025-08-05T19:49:16.965756+00:00", "updated_at": "2025-08-05T19:49:16.965756+00:00", "provider_model_id_used": "gpt-oss-20b", "model_name": "OpenAI OSS 20B", "organization_id": "openai" } ] ================================================ FILE: data/providers/groq/provider.json ================================================ { "provider_id": "groq", "name": "Groq", "website": "https://groq.com/", "created_at": "2025-07-19T19:49:16.953587+00:00", "updated_at": "2025-07-19T19:49:16.953587+00:00" } ================================================ FILE: data/providers/hyperbolic/models.json ================================================ [ { "model_provider_id": 276, "model_id": "deepseek-v2.5", "provider_id": "hyperbolic", "deprecated_at": null, "input_cents_per_million_tokens": 200, "output_cents_per_million_tokens": 200, "quantization": null, "max_input_tokens": 8192, "max_output_tokens": 8192, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.801424+00:00", "updated_at": "2025-07-19T19:49:16.801424+00:00", "provider_model_id_used": "deepseek-v2.5", "model_name": "DeepSeek-V2.5", "organization_id": "deepseek" }, { "model_provider_id": 272, "model_id": "llama-3.1-405b-instruct", "provider_id": "hyperbolic", "deprecated_at": null, "input_cents_per_million_tokens": 400, "output_cents_per_million_tokens": 400, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 40.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": 
true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.788610+00:00", "updated_at": "2025-07-19T19:49:16.788610+00:00", "provider_model_id_used": "llama-3.1-405b-instruct", "model_name": "Llama 3.1 405B Instruct", "organization_id": "meta" }, { "model_provider_id": 271, "model_id": "llama-3.1-70b-instruct", "provider_id": "hyperbolic", "deprecated_at": null, "input_cents_per_million_tokens": 40, "output_cents_per_million_tokens": 40, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.785874+00:00", "updated_at": "2025-07-19T19:49:16.785874+00:00", "provider_model_id_used": "llama-3.1-70b-instruct", "model_name": "Llama 3.1 70B Instruct", "organization_id": "meta" }, { "model_provider_id": 270, "model_id": "llama-3.1-8b-instruct", "provider_id": "hyperbolic", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 10, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 200.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, 
"feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.783230+00:00", "updated_at": "2025-07-19T19:49:16.783230+00:00", "provider_model_id_used": "llama-3.1-8b-instruct", "model_name": "Llama 3.1 8B Instruct", "organization_id": "meta" }, { "model_provider_id": 273, "model_id": "llama-3.2-90b-instruct", "provider_id": "hyperbolic", "deprecated_at": null, "input_cents_per_million_tokens": 200, "output_cents_per_million_tokens": 200, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.791634+00:00", "updated_at": "2025-07-19T19:49:16.791634+00:00", "provider_model_id_used": "llama-3.2-90b-instruct", "model_name": "Llama 3.2 90B Instruct", "organization_id": "meta" }, { "model_provider_id": 278, "model_id": "llama-3.3-70b-instruct", "provider_id": "hyperbolic", "deprecated_at": null, "input_cents_per_million_tokens": 40, "output_cents_per_million_tokens": 40, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.65, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": 
false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.805164+00:00", "updated_at": "2025-07-19T19:49:16.805164+00:00", "provider_model_id_used": "llama-3.3-70b-instruct", "model_name": "Llama 3.3 70B Instruct", "organization_id": "meta" }, { "model_provider_id": 274, "model_id": "qwen-2.5-72b-instruct", "provider_id": "hyperbolic", "deprecated_at": null, "input_cents_per_million_tokens": 40, "output_cents_per_million_tokens": 40, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 8192, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.795011+00:00", "updated_at": "2025-07-19T19:49:16.795011+00:00", "provider_model_id_used": "qwen-2.5-72b-instruct", "model_name": "Qwen2.5 72B Instruct", "organization_id": "qwen" }, { "model_provider_id": 275, "model_id": "qwen-2.5-coder-32b-instruct", "provider_id": "hyperbolic", "deprecated_at": null, "input_cents_per_million_tokens": 20, "output_cents_per_million_tokens": 20, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, 
"input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.798904+00:00", "updated_at": "2025-07-19T19:49:16.798904+00:00", "provider_model_id_used": "qwen-2.5-coder-32b-instruct", "model_name": "Qwen2.5-Coder 32B Instruct", "organization_id": "qwen" }, { "model_provider_id": 277, "model_id": "qwq-32b-preview", "provider_id": "hyperbolic", "deprecated_at": null, "input_cents_per_million_tokens": 20, "output_cents_per_million_tokens": 20, "quantization": null, "max_input_tokens": 32768, "max_output_tokens": 32768, "throughput": 31.9, "latency": 1.05, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.803353+00:00", "updated_at": "2025-07-19T19:49:16.803353+00:00", "provider_model_id_used": "qwq-32b-preview", "model_name": "QwQ-32B-Preview", "organization_id": "qwen" } ] ================================================ FILE: data/providers/hyperbolic/provider.json ================================================ { "provider_id": "hyperbolic", "name": "Hyperbolic", "website": "https://hyperbolic.xyz", "created_at": "2025-07-19T19:49:16.780946+00:00", "updated_at": "2025-07-19T19:49:16.780946+00:00" } ================================================ FILE: data/providers/lambda/models.json ================================================ [ { "model_provider_id": 390, "model_id": "llama-3.1-405b-instruct", "provider_id": "lambda", "deprecated_at": null, 
"input_cents_per_million_tokens": 89, "output_cents_per_million_tokens": 89, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.054217+00:00", "updated_at": "2025-07-19T19:49:17.054217+00:00", "provider_model_id_used": "llama-3.1-405b-instruct", "model_name": "Llama 3.1 405B Instruct", "organization_id": "meta" }, { "model_provider_id": 389, "model_id": "llama-3.1-70b-instruct", "provider_id": "lambda", "deprecated_at": null, "input_cents_per_million_tokens": 20, "output_cents_per_million_tokens": 20, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.051981+00:00", "updated_at": "2025-07-19T19:49:17.051981+00:00", "provider_model_id_used": "llama-3.1-70b-instruct", "model_name": "Llama 3.1 70B Instruct", "organization_id": "meta" }, { "model_provider_id": 388, "model_id": "llama-3.1-8b-instruct", "provider_id": "lambda", "deprecated_at": null, "input_cents_per_million_tokens": 3, 
"output_cents_per_million_tokens": 3, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 42.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.050200+00:00", "updated_at": "2025-07-19T19:49:17.050200+00:00", "provider_model_id_used": "llama-3.1-8b-instruct", "model_name": "Llama 3.1 8B Instruct", "organization_id": "meta" }, { "model_provider_id": 391, "model_id": "llama-3.3-70b-instruct", "provider_id": "lambda", "deprecated_at": null, "input_cents_per_million_tokens": 20, "output_cents_per_million_tokens": 20, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.65, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.056567+00:00", "updated_at": "2025-07-19T19:49:17.056567+00:00", "provider_model_id_used": "llama-3.3-70b-instruct", "model_name": "Llama 3.3 70B Instruct", "organization_id": "meta" }, { "model_provider_id": 393, "model_id": "llama-4-maverick", "provider_id": "lambda", "deprecated_at": null, "input_cents_per_million_tokens": 18, "output_cents_per_million_tokens": 60, "quantization": null, 
"max_input_tokens": 1000000, "max_output_tokens": 1000000, "throughput": 93.69, "latency": 0.65, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.060734+00:00", "updated_at": "2025-07-19T19:49:17.060734+00:00", "provider_model_id_used": "llama-4-maverick", "model_name": "Llama 4 Maverick", "organization_id": "meta" }, { "model_provider_id": 394, "model_id": "llama-4-scout", "provider_id": "lambda", "deprecated_at": null, "input_cents_per_million_tokens": 8, "output_cents_per_million_tokens": 30, "quantization": null, "max_input_tokens": 10000000, "max_output_tokens": 10000000, "throughput": 139.7, "latency": 0.43, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.062783+00:00", "updated_at": "2025-07-19T19:49:17.062783+00:00", "provider_model_id_used": "llama-4-scout", "model_name": "Llama 4 Scout", "organization_id": "meta" }, { "model_provider_id": 392, "model_id": "qwen-2.5-coder-32b-instruct", "provider_id": "lambda", "deprecated_at": null, "input_cents_per_million_tokens": 9, "output_cents_per_million_tokens": 9, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.5, 
"feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.058608+00:00", "updated_at": "2025-07-19T19:49:17.058608+00:00", "provider_model_id_used": "qwen-2.5-coder-32b-instruct", "model_name": "Qwen2.5-Coder 32B Instruct", "organization_id": "qwen" } ] ================================================ FILE: data/providers/lambda/provider.json ================================================ { "provider_id": "lambda", "name": "Lambda", "website": "https://lambdalabs.com", "created_at": "2025-07-19T19:49:17.048564+00:00", "updated_at": "2025-07-19T19:49:17.048564+00:00" } ================================================ FILE: data/providers/mistral/models.json ================================================ [ { "model_provider_id": 408, "model_id": "devstral-medium-2507", "provider_id": "mistral", "deprecated_at": null, "input_cents_per_million_tokens": 40, "output_cents_per_million_tokens": 200, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 137.1, "latency": 0.23, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.098942+00:00", "updated_at": "2025-07-19T19:49:17.098942+00:00", 
"provider_model_id_used": "devstral-medium-2507", "model_name": "Devstral Medium", "organization_id": "mistral" }, { "model_provider_id": 409, "model_id": "devstral-small-2507", "provider_id": "mistral", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 30, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 137.1, "latency": 0.23, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.100512+00:00", "updated_at": "2025-07-19T19:49:17.100512+00:00", "provider_model_id_used": "devstral-small-2507", "model_name": "Devstral Small 1.1", "organization_id": "mistral" }, { "model_provider_id": 415, "model_id": "ministral-8b-instruct-2410", "provider_id": "mistral", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 10, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 0.1, "latency": 0.18, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.113059+00:00", "updated_at": "2025-07-19T19:49:17.113059+00:00", "provider_model_id_used": "ministral-8b-instruct-2410", 
"model_name": "Ministral 8B Instruct", "organization_id": "mistral" }, { "model_provider_id": 412, "model_id": "mistral-large-2-2407", "provider_id": "mistral", "deprecated_at": null, "input_cents_per_million_tokens": 200, "output_cents_per_million_tokens": 600, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 0.1, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.106626+00:00", "updated_at": "2025-07-19T19:49:17.106626+00:00", "provider_model_id_used": "mistral-large-2-2407", "model_name": "Mistral Large 2", "organization_id": "mistral" }, { "model_provider_id": 417, "model_id": "mistral-nemo-instruct-2407", "provider_id": "mistral", "deprecated_at": null, "input_cents_per_million_tokens": 15, "output_cents_per_million_tokens": 15, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 0.1, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.116560+00:00", "updated_at": "2025-07-19T19:49:17.116560+00:00", "provider_model_id_used": "mistral-nemo-instruct-2407", "model_name": "Mistral NeMo Instruct", "organization_id": 
"mistral" }, { "model_provider_id": 414, "model_id": "mistral-small-2409", "provider_id": "mistral", "deprecated_at": null, "input_cents_per_million_tokens": 20, "output_cents_per_million_tokens": 60, "quantization": null, "max_input_tokens": 32768, "max_output_tokens": 32768, "throughput": 0.1, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.111268+00:00", "updated_at": "2025-07-19T19:49:17.111268+00:00", "provider_model_id_used": "mistral-small-2409", "model_name": "Mistral Small", "organization_id": "mistral" }, { "model_provider_id": 419, "model_id": "mistral-small-24b-instruct-2501", "provider_id": "mistral", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 30, "quantization": null, "max_input_tokens": 32000, "max_output_tokens": 32000, "throughput": 134.0, "latency": 0.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.120575+00:00", "updated_at": "2025-07-19T19:49:17.120575+00:00", "provider_model_id_used": "mistral-small-24b-instruct-2501", "model_name": "Mistral Small 3 24B Instruct", "organization_id": "mistral" }, { "model_provider_id": 410, "model_id": 
"mistral-small-3.1-24b-base-2503", "provider_id": "mistral", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 30, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 137.1, "latency": 0.23, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.102773+00:00", "updated_at": "2025-07-19T19:49:17.102773+00:00", "provider_model_id_used": "mistral-small-3.1-24b-base-2503", "model_name": "Mistral Small 3.1 24B Base", "organization_id": "mistral" }, { "model_provider_id": 416, "model_id": "pixtral-12b-2409", "provider_id": "mistral", "deprecated_at": null, "input_cents_per_million_tokens": 15, "output_cents_per_million_tokens": 15, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 8192, "throughput": 0.1, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.114646+00:00", "updated_at": "2025-07-19T19:49:17.114646+00:00", "provider_model_id_used": "pixtral-12b-2409", "model_name": "Pixtral-12B", "organization_id": "mistral" }, { "model_provider_id": 413, "model_id": "pixtral-large", "provider_id": "mistral", "deprecated_at": 
null, "input_cents_per_million_tokens": 200, "output_cents_per_million_tokens": 600, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 0.1, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.108807+00:00", "updated_at": "2025-07-19T19:49:17.108807+00:00", "provider_model_id_used": "pixtral-large", "model_name": "Pixtral Large", "organization_id": "mistral" } ] ================================================ FILE: data/providers/mistral/provider.json ================================================ { "provider_id": "mistral", "name": "Mistral AI", "website": "https://mistral.ai", "created_at": "2025-07-19T19:49:17.096952+00:00", "updated_at": "2025-07-19T19:49:17.096952+00:00" } ================================================ FILE: data/providers/novita/models.json ================================================ [ { "model_provider_id": 359, "model_id": "qwen3-235b-a22b-instruct-2507", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 15, "output_cents_per_million_tokens": 80, "quantization": "fp8", "max_input_tokens": 131072, "max_output_tokens": 16384, "throughput": null, "latency": null, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": 
false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-14T00:00:00.000000+00:00", "updated_at": "2025-09-14T00:00:00.000000+00:00", "provider_model_id_used": "qwen/qwen3-235b-a22b-instruct-2507", "model_name": "Qwen3-235B-A22B-Instruct-2507", "organization_id": "qwen" }, { "model_provider_id": 360, "model_id": "gpt-oss-20b", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 5, "output_cents_per_million_tokens": 20, "quantization": "bf16", "max_input_tokens": 131072, "max_output_tokens": 32768, "throughput": null, "latency": null, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-14T00:00:00.000000+00:00", "updated_at": "2025-09-14T00:00:00.000000+00:00", "provider_model_id_used": "openai/gpt-oss-20b", "model_name": "GPT-OSS-20B", "organization_id": "openai" }, { "model_provider_id": 364, "model_id": "gpt-oss-120b", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 50, "quantization": "bf16", "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": null, "latency": null, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, 
"created_at": "2025-09-14T00:00:00.000000+00:00", "updated_at": "2025-09-14T00:00:00.000000+00:00", "provider_model_id_used": "openai/gpt-oss-120b", "model_name": "GPT-OSS-120B", "organization_id": "openai" }, { "model_provider_id": 361, "model_id": "qwen3-235b-a22b-thinking-2507", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 30, "output_cents_per_million_tokens": 300, "quantization": "fp8", "max_input_tokens": 256000, "max_output_tokens": 131072, "throughput": null, "latency": null, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-14T00:00:00.000000+00:00", "updated_at": "2025-09-14T00:00:00.000000+00:00", "provider_model_id_used": "qwen/qwen3-235b-a22b-thinking-2507", "model_name": "Qwen3-235B-A22B-Thinking-2507", "organization_id": "qwen" }, { "model_provider_id": 362, "model_id": "deepseek-v3-0324", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 28, "output_cents_per_million_tokens": 114, "quantization": "fp8", "max_input_tokens": 163840, "max_output_tokens": 163840, "throughput": null, "latency": null, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": 
"2025-09-14T00:00:00.000000+00:00", "updated_at": "2025-09-14T00:00:00.000000+00:00", "provider_model_id_used": "deepseek/deepseek-v3-0324", "model_name": "DeepSeek-V3-0324", "organization_id": "deepseek" }, { "model_provider_id": 363, "model_id": "deepseek-v3.1", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 27, "output_cents_per_million_tokens": 100, "quantization": "fp8", "max_input_tokens": 163840, "max_output_tokens": 163840, "throughput": null, "latency": null, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-14T00:00:00.000000+00:00", "updated_at": "2025-09-14T00:00:00.000000+00:00", "provider_model_id_used": "deepseek/deepseek-v3.1", "model_name": "DeepSeek V3.1", "organization_id": "deepseek" }, { "model_provider_id": 357, "model_id": "deepseek-r1-0528", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 70, "output_cents_per_million_tokens": 250, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 37.96, "latency": 1.18, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.982118+00:00", "updated_at": 
"2025-07-19T19:49:16.982118+00:00", "provider_model_id_used": "deepseek-r1-0528", "model_name": "DeepSeek-R1-0528", "organization_id": "deepseek" }, { "model_provider_id": 351, "model_id": "gemma-3-27b-it", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 11, "output_cents_per_million_tokens": 20, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 33.0, "latency": 0.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.969199+00:00", "updated_at": "2025-07-19T19:49:16.969199+00:00", "provider_model_id_used": "gemma-3-27b-it", "model_name": "Gemma 3 27B", "organization_id": "google" }, { "model_provider_id": 358, "model_id": "kimi-k2-instruct", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 57, "output_cents_per_million_tokens": 230, "quantization": "fp8", "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 45.0, "latency": 0.95, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.984536+00:00", "updated_at": "2025-09-14T00:00:00.000000+00:00", "provider_model_id_used": "moonshotai/kimi-k2-instruct", 
"model_name": "Kimi K2 Instruct", "organization_id": "moonshotai" }, { "model_provider_id": 365, "model_id": "kimi-k2-0905", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 60, "output_cents_per_million_tokens": 250, "quantization": "fp8", "max_input_tokens": 262144, "max_output_tokens": 262144, "throughput": null, "latency": null, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-14T00:00:00.000000+00:00", "updated_at": "2025-09-14T00:00:00.000000+00:00", "provider_model_id_used": "moonshotai/kimi-k2-0905", "model_name": "Kimi K2 0905", "organization_id": "moonshotai" }, { "model_provider_id": 366, "model_id": "qwen3-next-80b-a3b-thinking", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 15, "output_cents_per_million_tokens": 150, "quantization": "bf16", "max_input_tokens": 65536, "max_output_tokens": 65536, "throughput": null, "latency": null, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-14T00:00:00.000000+00:00", "updated_at": "2025-09-14T00:00:00.000000+00:00", "provider_model_id_used": "qwen/qwen3-next-80b-a3b-thinking", "model_name": "Qwen3 Next 80B A3B Thinking", 
"organization_id": "qwen" }, { "model_provider_id": 367, "model_id": "qwen3-next-80b-a3b-instruct", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 15, "output_cents_per_million_tokens": 150, "quantization": "bf16", "max_input_tokens": 65536, "max_output_tokens": 65536, "throughput": null, "latency": null, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-14T00:00:00.000000+00:00", "updated_at": "2025-09-14T00:00:00.000000+00:00", "provider_model_id_used": "qwen/qwen3-next-80b-a3b-instruct", "model_name": "Qwen3 Next 80B A3B Instruct", "organization_id": "qwen" }, { "model_provider_id": 352, "model_id": "llama-4-maverick", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 17, "output_cents_per_million_tokens": 85, "quantization": null, "max_input_tokens": 1000000, "max_output_tokens": 1000000, "throughput": 69.42, "latency": 0.62, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.970871+00:00", "updated_at": "2025-07-19T19:49:16.970871+00:00", "provider_model_id_used": "llama-4-maverick", "model_name": "Llama 4 Maverick", "organization_id": "meta" }, { "model_provider_id": 353, 
"model_id": "llama-4-scout", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 50, "quantization": null, "max_input_tokens": 10000000, "max_output_tokens": 10000000, "throughput": 69.82, "latency": 0.85, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.972719+00:00", "updated_at": "2025-07-19T19:49:16.972719+00:00", "provider_model_id_used": "llama-4-scout", "model_name": "Llama 4 Scout", "organization_id": "meta" }, { "model_provider_id": 354, "model_id": "qwen3-235b-a22b", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 20, "output_cents_per_million_tokens": 80, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 38.51, "latency": 1.02, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.975233+00:00", "updated_at": "2025-07-19T19:49:16.975233+00:00", "provider_model_id_used": "qwen3-235b-a22b", "model_name": "Qwen3 235B A22B", "organization_id": "qwen" }, { "model_provider_id": 356, "model_id": "qwen3-30b-a3b", "provider_id": "novita", "deprecated_at": null, 
"input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 44, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 88.84, "latency": 0.73, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.980126+00:00", "updated_at": "2025-07-19T19:49:16.980126+00:00", "provider_model_id_used": "qwen3-30b-a3b", "model_name": "Qwen3 30B A3B", "organization_id": "qwen" }, { "model_provider_id": 355, "model_id": "qwen3-32b", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 44, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 32.43, "latency": 0.93, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.977464+00:00", "updated_at": "2025-07-19T19:49:16.977464+00:00", "provider_model_id_used": "qwen3-32b", "model_name": "Qwen3 32B", "organization_id": "qwen" }, { "model_provider_id": 368, "model_id": "deepseek-v3.2-exp", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 27, "output_cents_per_million_tokens": 41, "quantization": "fp8", "max_input_tokens": 
163840, "max_output_tokens": 65536, "throughput": null, "latency": null, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "provider_model_id_used": "deepseek/deepseek-v3.2-exp", "model_name": "DeepSeek V3.2 Exp", "organization_id": "deepseek" }, { "model_provider_id": 369, "model_id": "glm-4.5", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 60, "output_cents_per_million_tokens": 220, "quantization": "fp8", "max_input_tokens": 131072, "max_output_tokens": 98304, "throughput": null, "latency": null, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "provider_model_id_used": "zai-org/glm-4.5", "model_name": "GLM-4.5", "organization_id": "zai-org" }, { "model_provider_id": 370, "model_id": "glm-4.5v", "provider_id": "novita", "deprecated_at": null, "input_cents_per_million_tokens": 60, "output_cents_per_million_tokens": 220, "quantization": "fp8", "max_input_tokens": 131072, "max_output_tokens": 65536, "throughput": null, "latency": null, "feature_web_search": false, 
"feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": true, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "provider_model_id_used": "zai-org/GLM-4.5V", "model_name": "GLM-4.5V", "organization_id": "zai-org" } ] ================================================ FILE: data/providers/novita/provider.json ================================================ { "provider_id": "novita", "name": "Novita", "website": "https://novita.ai/", "created_at": "2025-07-19T19:49:16.967182+00:00", "updated_at": "2025-07-19T19:49:16.967182+00:00" } ================================================ FILE: data/providers/openai/models.json ================================================ [ { "model_provider_id": 422, "model_id": "gpt-3.5-turbo-0125", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 50, "output_cents_per_million_tokens": 150, "quantization": null, "max_input_tokens": 16385, "max_output_tokens": 4096, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.128446+00:00", "updated_at": "2025-07-19T19:49:17.128446+00:00", "provider_model_id_used": "gpt-3.5-turbo-0125", "model_name": "GPT-3.5 Turbo", 
"organization_id": "openai" }, { "model_provider_id": 420, "model_id": "gpt-4-0613", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 3000, "output_cents_per_million_tokens": 6000, "quantization": null, "max_input_tokens": 32768, "max_output_tokens": 32768, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.123888+00:00", "updated_at": "2025-07-19T19:49:17.123888+00:00", "provider_model_id_used": "gpt-4-0613", "model_name": "GPT-4", "organization_id": "openai" }, { "model_provider_id": 430, "model_id": "gpt-4.1-2025-04-14", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 200, "output_cents_per_million_tokens": 800, "quantization": null, "max_input_tokens": 1047576, "max_output_tokens": 32768, "throughput": 100.0, "latency": 10.0, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.150851+00:00", "updated_at": "2025-07-19T19:49:17.150851+00:00", "provider_model_id_used": "gpt-4.1-2025-04-14", "model_name": "GPT-4.1", "organization_id": "openai" }, { "model_provider_id": 431, "model_id": "gpt-4.1-mini-2025-04-14", "provider_id": 
"openai", "deprecated_at": null, "input_cents_per_million_tokens": 40, "output_cents_per_million_tokens": 160, "quantization": null, "max_input_tokens": 1047576, "max_output_tokens": 32768, "throughput": 150.0, "latency": 5.0, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.152948+00:00", "updated_at": "2025-07-19T19:49:17.152948+00:00", "provider_model_id_used": "gpt-4.1-mini-2025-04-14", "model_name": "GPT-4.1 mini", "organization_id": "openai" }, { "model_provider_id": 432, "model_id": "gpt-4.1-nano-2025-04-14", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 40, "quantization": null, "max_input_tokens": 1047576, "max_output_tokens": 32768, "throughput": 200.0, "latency": 2.0, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.154798+00:00", "updated_at": "2025-07-19T19:49:17.154798+00:00", "provider_model_id_used": "gpt-4.1-nano-2025-04-14", "model_name": "GPT-4.1 nano", "organization_id": "openai" }, { "model_provider_id": 429, "model_id": "gpt-4.5", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 7500, 
"output_cents_per_million_tokens": 15000, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 4096, "throughput": 50.0, "latency": 20.0, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.148982+00:00", "updated_at": "2025-07-19T19:49:17.148982+00:00", "provider_model_id_used": "gpt-4.5", "model_name": "GPT-4.5", "organization_id": "openai" }, { "model_provider_id": 424, "model_id": "gpt-4o-2024-05-13", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 250, "output_cents_per_million_tokens": 1000, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 4096, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.132398+00:00", "updated_at": "2025-07-19T19:49:17.132398+00:00", "provider_model_id_used": "gpt-4o-2024-05-13", "model_name": "GPT-4o", "organization_id": "openai" }, { "model_provider_id": 423, "model_id": "gpt-4o-2024-08-06", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 250, "output_cents_per_million_tokens": 1000, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 
16384, "throughput": 132.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.130542+00:00", "updated_at": "2025-07-19T19:49:17.130542+00:00", "provider_model_id_used": "gpt-4o-2024-08-06", "model_name": "GPT-4o", "organization_id": "openai" }, { "model_provider_id": 421, "model_id": "gpt-4-turbo-2024-04-09", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 1000, "output_cents_per_million_tokens": 3000, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 4096, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.126193+00:00", "updated_at": "2025-07-19T19:49:17.126193+00:00", "provider_model_id_used": "gpt-4-turbo-2024-04-09", "model_name": "GPT-4 Turbo", "organization_id": "openai" }, { "model_provider_id": 426, "model_id": "o1-2024-12-17", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 1500, "output_cents_per_million_tokens": 6000, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 100000, "throughput": 66.0, "latency": 16.2, "feature_web_search": false, 
"feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.136375+00:00", "updated_at": "2025-07-19T19:49:17.136375+00:00", "provider_model_id_used": "o1-2024-12-17", "model_name": "o1", "organization_id": "openai" }, { "model_provider_id": 427, "model_id": "o1-mini", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1200, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 65536, "throughput": 115.0, "latency": 5.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.137957+00:00", "updated_at": "2025-07-19T19:49:17.137957+00:00", "provider_model_id_used": "o1-mini", "model_name": "o1-mini", "organization_id": "openai" }, { "model_provider_id": 425, "model_id": "o1-preview", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 1500, "output_cents_per_million_tokens": 6000, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 32768, "throughput": 66.0, "latency": 16.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, 
"feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.134477+00:00", "updated_at": "2025-07-19T19:49:17.134477+00:00", "provider_model_id_used": "o1-preview", "model_name": "o1-preview", "organization_id": "openai" }, { "model_provider_id": 433, "model_id": "o3-2025-04-16", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 200, "output_cents_per_million_tokens": 800, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 100000, "throughput": 50.0, "latency": 20.0, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.156370+00:00", "updated_at": "2025-07-19T19:49:17.156370+00:00", "provider_model_id_used": "o3-2025-04-16", "model_name": "o3", "organization_id": "openai" }, { "model_provider_id": 428, "model_id": "o3-mini", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 110, "output_cents_per_million_tokens": 440, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 100000, "throughput": 115.0, "latency": 5.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, 
"input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.147026+00:00", "updated_at": "2025-07-19T19:49:17.147026+00:00", "provider_model_id_used": "o3-mini", "model_name": "o3-mini", "organization_id": "openai" }, { "model_provider_id": 435, "model_id": "o3-pro-2025-06-10", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 2000, "output_cents_per_million_tokens": 8000, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 100000, "throughput": 25.0, "latency": 30.0, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.161549+00:00", "updated_at": "2025-07-19T19:49:17.161549+00:00", "provider_model_id_used": "o3-pro-2025-06-10", "model_name": "o3-pro", "organization_id": "openai" }, { "model_provider_id": 434, "model_id": "o4-mini", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 110, "output_cents_per_million_tokens": 440, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 100000, "throughput": 115.0, "latency": 5.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, 
"output_modality_video": false, "created_at": "2025-07-19T19:49:17.159618+00:00", "updated_at": "2025-07-19T19:49:17.159618+00:00", "provider_model_id_used": "o4-mini", "model_name": "o4-mini", "organization_id": "openai" }, { "model_provider_id": 434, "model_id": "gpt-oss-120b", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 50, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 115.0, "latency": 5.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.159618+00:00", "updated_at": "2025-07-19T19:49:17.159618+00:00", "provider_model_id_used": "gpt-oss-120b", "model_name": "GPT OSS 120B", "organization_id": "openai" }, { "model_provider_id": 434, "model_id": "gpt-oss-20b", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 50, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 115.0, "latency": 5.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.159618+00:00", "updated_at": 
"2025-07-19T19:49:17.159618+00:00", "provider_model_id_used": "gpt-oss-20b", "model_name": "GPT OSS 20B", "organization_id": "openai" }, { "model_provider_id": 436, "model_id": "gpt-5-2025-08-07", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 125, "output_cents_per_million_tokens": 1000, "quantization": null, "max_input_tokens": 400000, "max_output_tokens": 128000, "throughput": 100.0, "latency": 2.0, "feature_web_search": true, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": true, "feature_batch_inference": true, "feature_finetuning": true, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "provider_model_id_used": "gpt-5", "model_name": "GPT-5", "organization_id": "openai" }, { "model_provider_id": 437, "model_id": "gpt-5-mini-2025-08-07", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 25, "output_cents_per_million_tokens": 200, "quantization": null, "max_input_tokens": 400000, "max_output_tokens": 128000, "throughput": 200.0, "latency": 1.0, "feature_web_search": true, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": true, "feature_batch_inference": true, "feature_finetuning": true, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "provider_model_id_used": "gpt-5-mini", "model_name": "GPT-5 mini", "organization_id": 
"openai" }, { "model_provider_id": 438, "model_id": "gpt-5-nano-2025-08-07", "provider_id": "openai", "deprecated_at": null, "input_cents_per_million_tokens": 5, "output_cents_per_million_tokens": 40, "quantization": null, "max_input_tokens": 400000, "max_output_tokens": 128000, "throughput": 500.0, "latency": 0.3, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": true, "feature_batch_inference": true, "feature_finetuning": true, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "provider_model_id_used": "gpt-5-nano", "model_name": "GPT-5 nano", "organization_id": "openai" } ] ================================================ FILE: data/providers/openai/provider.json ================================================ { "provider_id": "openai", "name": "OpenAI", "website": "https://openai.com", "created_at": "2025-07-19T19:49:17.121876+00:00", "updated_at": "2025-07-19T19:49:17.121876+00:00" } ================================================ FILE: data/providers/replicate/models.json ================================================ [ { "model_provider_id": 396, "model_id": "deepseek-vl2", "provider_id": "replicate", "deprecated_at": null, "input_cents_per_million_tokens": 950, "output_cents_per_million_tokens": 480000, "quantization": null, "max_input_tokens": 129280, "max_output_tokens": 129280, "throughput": 22.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": 
false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.068077+00:00", "updated_at": "2025-07-19T19:49:17.068077+00:00", "provider_model_id_used": "deepseek-vl2", "model_name": "DeepSeek VL2", "organization_id": "deepseek" }, { "model_provider_id": 395, "model_id": "llama-3.1-405b-instruct", "provider_id": "replicate", "deprecated_at": null, "input_cents_per_million_tokens": 950, "output_cents_per_million_tokens": 950, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 22.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.066199+00:00", "updated_at": "2025-07-19T19:49:17.066199+00:00", "provider_model_id_used": "llama-3.1-405b-instruct", "model_name": "Llama 3.1 405B Instruct", "organization_id": "meta" } ] ================================================ FILE: data/providers/replicate/provider.json ================================================ { "provider_id": "replicate", "name": "Replicate", "website": "https://replicate.com/", "created_at": "2025-07-19T19:49:17.064218+00:00", "updated_at": "2025-07-19T19:49:17.064218+00:00" } ================================================ FILE: data/providers/sambanova/models.json ================================================ [ { "model_provider_id": 240, "model_id": "llama-3.1-70b-instruct", "provider_id": "sambanova", "deprecated_at": null, "input_cents_per_million_tokens": 500, 
"output_cents_per_million_tokens": 1000, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 74.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.702554+00:00", "updated_at": "2025-07-19T19:49:16.702554+00:00", "provider_model_id_used": "llama-3.1-70b-instruct", "model_name": "Llama 3.1 70B Instruct", "organization_id": "meta" }, { "model_provider_id": 239, "model_id": "llama-3.1-8b-instruct", "provider_id": "sambanova", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 20, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 1050.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.699627+00:00", "updated_at": "2025-07-19T19:49:16.699627+00:00", "provider_model_id_used": "llama-3.1-8b-instruct", "model_name": "Llama 3.1 8B Instruct", "organization_id": "meta" }, { "model_provider_id": 241, "model_id": "llama-3.2-11b-instruct", "provider_id": "sambanova", "deprecated_at": null, "input_cents_per_million_tokens": 15, "output_cents_per_million_tokens": 30, "quantization": 
null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 100.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.705086+00:00", "updated_at": "2025-07-19T19:49:16.705086+00:00", "provider_model_id_used": "llama-3.2-11b-instruct", "model_name": "Llama 3.2 11B Instruct", "organization_id": "meta" }, { "model_provider_id": 242, "model_id": "llama-3.3-70b-instruct", "provider_id": "sambanova", "deprecated_at": null, "input_cents_per_million_tokens": 60, "output_cents_per_million_tokens": 120, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 1096.0, "latency": 0.65, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.707534+00:00", "updated_at": "2025-07-19T19:49:16.707534+00:00", "provider_model_id_used": "llama-3.3-70b-instruct", "model_name": "Llama 3.3 70B Instruct", "organization_id": "meta" }, { "model_provider_id": 243, "model_id": "llama-4-maverick", "provider_id": "sambanova", "deprecated_at": null, "input_cents_per_million_tokens": 63, "output_cents_per_million_tokens": 179, "quantization": null, "max_input_tokens": 1000000, "max_output_tokens": 
1000000, "throughput": 638.7, "latency": 2.04, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.710100+00:00", "updated_at": "2025-07-19T19:49:16.710100+00:00", "provider_model_id_used": "llama-4-maverick", "model_name": "Llama 4 Maverick", "organization_id": "meta" }, { "model_provider_id": 244, "model_id": "qwen3-32b", "provider_id": "sambanova", "deprecated_at": null, "input_cents_per_million_tokens": 40, "output_cents_per_million_tokens": 80, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 327.7, "latency": 1.08, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.712669+00:00", "updated_at": "2025-07-19T19:49:16.712669+00:00", "provider_model_id_used": "qwen3-32b", "model_name": "Qwen3 32B", "organization_id": "qwen" } ] ================================================ FILE: data/providers/sambanova/provider.json ================================================ { "provider_id": "sambanova", "name": "Sambanova", "website": "https://sambanova.ai/", "created_at": "2025-07-19T19:49:16.697204+00:00", "updated_at": "2025-07-19T19:49:16.697204+00:00" } 
================================================ FILE: data/providers/together/models.json ================================================ [ { "model_provider_id": 255, "model_id": "deepseek-r1", "provider_id": "together", "deprecated_at": null, "input_cents_per_million_tokens": 700, "output_cents_per_million_tokens": 700, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 4.0, "latency": 0.6, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.738387+00:00", "updated_at": "2025-07-19T19:49:16.738387+00:00", "provider_model_id_used": "deepseek-r1", "model_name": "DeepSeek-R1", "organization_id": "deepseek" }, { "model_provider_id": 245, "model_id": "gemma-3n-e4b-it", "provider_id": "together", "deprecated_at": null, "input_cents_per_million_tokens": 2000, "output_cents_per_million_tokens": 4000, "quantization": null, "max_input_tokens": 32000, "max_output_tokens": 32000, "throughput": 42.09, "latency": 0.43, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.716616+00:00", "updated_at": "2025-07-19T19:49:16.716616+00:00", "provider_model_id_used": "gemma-3n-e4b-it", "model_name": "Gemma 3n 
E4B Instructed", "organization_id": "google" }, { "model_provider_id": 248, "model_id": "llama-3.1-405b-instruct", "provider_id": "together", "deprecated_at": null, "input_cents_per_million_tokens": 350, "output_cents_per_million_tokens": 350, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 35.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.722263+00:00", "updated_at": "2025-07-19T19:49:16.722263+00:00", "provider_model_id_used": "llama-3.1-405b-instruct", "model_name": "Llama 3.1 405B Instruct", "organization_id": "meta" }, { "model_provider_id": 247, "model_id": "llama-3.1-70b-instruct", "provider_id": "together", "deprecated_at": null, "input_cents_per_million_tokens": 89, "output_cents_per_million_tokens": 89, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 94.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.720699+00:00", "updated_at": "2025-07-19T19:49:16.720699+00:00", "provider_model_id_used": "llama-3.1-70b-instruct", "model_name": "Llama 3.1 70B Instruct", "organization_id": "meta" }, { 
"model_provider_id": 246, "model_id": "llama-3.1-8b-instruct", "provider_id": "together", "deprecated_at": null, "input_cents_per_million_tokens": 20, "output_cents_per_million_tokens": 20, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 131072, "throughput": 194.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.718652+00:00", "updated_at": "2025-07-19T19:49:16.718652+00:00", "provider_model_id_used": "llama-3.1-8b-instruct", "model_name": "Llama 3.1 8B Instruct", "organization_id": "meta" }, { "model_provider_id": 249, "model_id": "llama-3.2-11b-instruct", "provider_id": "together", "deprecated_at": null, "input_cents_per_million_tokens": 18, "output_cents_per_million_tokens": 18, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 168.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.724215+00:00", "updated_at": "2025-07-19T19:49:16.724215+00:00", "provider_model_id_used": "llama-3.2-11b-instruct", "model_name": "Llama 3.2 11B Instruct", "organization_id": "meta" }, { "model_provider_id": 250, "model_id": 
"llama-3.2-90b-instruct", "provider_id": "together", "deprecated_at": null, "input_cents_per_million_tokens": 120, "output_cents_per_million_tokens": 120, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 57.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.726568+00:00", "updated_at": "2025-07-19T19:49:16.726568+00:00", "provider_model_id_used": "llama-3.2-90b-instruct", "model_name": "Llama 3.2 90B Instruct", "organization_id": "meta" }, { "model_provider_id": 254, "model_id": "llama-3.3-70b-instruct", "provider_id": "together", "deprecated_at": null, "input_cents_per_million_tokens": 88, "output_cents_per_million_tokens": 88, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 65.0, "latency": 0.65, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.735754+00:00", "updated_at": "2025-07-19T19:49:16.735754+00:00", "provider_model_id_used": "llama-3.3-70b-instruct", "model_name": "Llama 3.3 70B Instruct", "organization_id": "meta" }, { "model_provider_id": 256, "model_id": "llama-4-maverick", "provider_id": "together", 
"deprecated_at": null, "input_cents_per_million_tokens": 27, "output_cents_per_million_tokens": 85, "quantization": null, "max_input_tokens": 1000000, "max_output_tokens": 1000000, "throughput": 97.93, "latency": 0.2, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.740112+00:00", "updated_at": "2025-07-19T19:49:16.740112+00:00", "provider_model_id_used": "llama-4-maverick", "model_name": "Llama 4 Maverick", "organization_id": "meta" }, { "model_provider_id": 257, "model_id": "llama-4-scout", "provider_id": "together", "deprecated_at": null, "input_cents_per_million_tokens": 18, "output_cents_per_million_tokens": 59, "quantization": null, "max_input_tokens": 10000000, "max_output_tokens": 10000000, "throughput": 106.9, "latency": 0.54, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.742126+00:00", "updated_at": "2025-07-19T19:49:16.742126+00:00", "provider_model_id_used": "llama-4-scout", "model_name": "Llama 4 Scout", "organization_id": "meta" }, { "model_provider_id": 252, "model_id": "qwen-2.5-72b-instruct", "provider_id": "together", "deprecated_at": null, "input_cents_per_million_tokens": 120, 
"output_cents_per_million_tokens": 120, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 8192, "throughput": 47.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.731610+00:00", "updated_at": "2025-07-19T19:49:16.731610+00:00", "provider_model_id_used": "qwen-2.5-72b-instruct", "model_name": "Qwen2.5 72B Instruct", "organization_id": "qwen" }, { "model_provider_id": 251, "model_id": "qwen-2.5-7b-instruct", "provider_id": "together", "deprecated_at": null, "input_cents_per_million_tokens": 30, "output_cents_per_million_tokens": 30, "quantization": null, "max_input_tokens": 131072, "max_output_tokens": 8192, "throughput": 138.0, "latency": 0.5, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.728846+00:00", "updated_at": "2025-07-19T19:49:16.728846+00:00", "provider_model_id_used": "qwen-2.5-7b-instruct", "model_name": "Qwen2.5 7B Instruct", "organization_id": "qwen" }, { "model_provider_id": 258, "model_id": "qwen3-235b-a22b", "provider_id": "together", "deprecated_at": null, "input_cents_per_million_tokens": 20, "output_cents_per_million_tokens": 60, "quantization": null, 
"max_input_tokens": 128000, "max_output_tokens": 128000, "throughput": 23.74, "latency": 0.79, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.746014+00:00", "updated_at": "2025-07-19T19:49:16.746014+00:00", "provider_model_id_used": "qwen3-235b-a22b", "model_name": "Qwen3 235B A22B", "organization_id": "qwen" }, { "model_provider_id": 253, "model_id": "qwq-32b-preview", "provider_id": "together", "deprecated_at": null, "input_cents_per_million_tokens": 120, "output_cents_per_million_tokens": 120, "quantization": null, "max_input_tokens": 32768, "max_output_tokens": 32768, "throughput": 62.14, "latency": 0.74, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.733822+00:00", "updated_at": "2025-07-19T19:49:16.733822+00:00", "provider_model_id_used": "qwq-32b-preview", "model_name": "QwQ-32B-Preview", "organization_id": "qwen" } ] ================================================ FILE: data/providers/together/provider.json ================================================ { "provider_id": "together", "name": "Together", "website": "https://together.ai/", "created_at": "2025-07-19T19:49:16.714534+00:00", "updated_at": 
"2025-07-19T19:49:16.714534+00:00" } ================================================ FILE: data/providers/xai/models.json ================================================ [ { "model_provider_id": 363, "model_id": "grok-2", "provider_id": "xai", "deprecated_at": null, "input_cents_per_million_tokens": 200, "output_cents_per_million_tokens": 1000, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 8000, "throughput": 85.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.997220+00:00", "updated_at": "2025-07-19T19:49:16.997220+00:00", "provider_model_id_used": "grok-2", "model_name": "Grok-2", "organization_id": "xai" }, { "model_provider_id": 364, "model_id": "grok-3", "provider_id": "xai", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 8000, "throughput": 100.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:16.998872+00:00", "updated_at": "2025-07-19T19:49:16.998872+00:00", "provider_model_id_used": "grok-3", "model_name": "Grok-3", "organization_id": 
"xai" }, { "model_provider_id": 365, "model_id": "grok-3-mini", "provider_id": "xai", "deprecated_at": null, "input_cents_per_million_tokens": 30, "output_cents_per_million_tokens": 50, "quantization": null, "max_input_tokens": 128000, "max_output_tokens": 8000, "throughput": 100.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.000676+00:00", "updated_at": "2025-07-19T19:49:17.000676+00:00", "provider_model_id_used": "grok-3-mini", "model_name": "Grok-3 Mini", "organization_id": "xai" }, { "model_provider_id": 366, "model_id": "grok-4", "provider_id": "xai", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 256000, "max_output_tokens": 8000, "throughput": 100.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.002399+00:00", "updated_at": "2025-07-19T19:49:17.002399+00:00", "provider_model_id_used": "grok-4", "model_name": "Grok-4", "organization_id": "xai" }, { "model_provider_id": 367, "model_id": "grok-code-fast-1", "provider_id": "xai", "deprecated_at": null, "input_cents_per_million_tokens": 20, 
"output_cents_per_million_tokens": 150, "quantization": null, "max_input_tokens": 256000, "max_output_tokens": 10000, "throughput": 76.41, "latency": 1.38, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-10-03T00:00:00.000000+00:00", "updated_at": "2025-10-03T00:00:00.000000+00:00", "provider_model_id_used": "grok-code-fast-1", "model_name": "Grok Code Fast 1", "organization_id": "xai" }, { "model_provider_id": 444, "model_id": "grok-4-fast", "provider_id": "xai", "deprecated_at": null, "input_cents_per_million_tokens": 20, "output_cents_per_million_tokens": 50, "quantization": null, "max_input_tokens": 2000000, "max_output_tokens": 30000, "throughput": 90, "latency": null, "feature_web_search": true, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": false, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-10-11T00:00:00.000000+00:00", "updated_at": "2025-10-11T00:00:00.000000+00:00", "provider_model_id_used": "grok-4-fast", "model_name": "Grok 4 Fast", "organization_id": "xai" } ] ================================================ FILE: data/providers/xai/provider.json ================================================ { "provider_id": "xai", "name": "xAI", "website": "https://docs.x.ai", "created_at": "2025-07-19T19:49:16.995303+00:00", 
"updated_at": "2025-07-19T19:49:16.995303+00:00" } ================================================ FILE: data/providers/zeroeval/models.json ================================================ [ { "model_provider_id": 441, "model_id": "claude-3-7-sonnet-20250219", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.176639+00:00", "updated_at": "2025-07-19T19:49:17.176639+00:00", "provider_model_id_used": "claude-3-7-sonnet-20250219", "model_name": "Claude 3.7 Sonnet", "organization_id": "anthropic" }, { "model_provider_id": 436, "model_id": "claude-opus-4-20250514", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 1500, "output_cents_per_million_tokens": 7500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.165236+00:00", "updated_at": 
"2025-07-19T19:49:17.165236+00:00", "provider_model_id_used": "claude-opus-4-20250514", "model_name": "Claude Opus 4", "organization_id": "anthropic" }, { "model_provider_id": 437, "model_id": "claude-opus-4-1-20250805", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 1500, "output_cents_per_million_tokens": 7500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 32000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-08-05T00:00:00.000000+00:00", "updated_at": "2025-08-05T00:00:00.000000+00:00", "provider_model_id_used": "claude-opus-4-1-20250805", "model_name": "Claude Opus 4.1", "organization_id": "anthropic" }, { "model_provider_id": 438, "model_id": "claude-sonnet-4-20250514", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 128000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.170880+00:00", "updated_at": "2025-07-19T19:49:17.170880+00:00", 
"provider_model_id_used": "claude-sonnet-4-20250514", "model_name": "Claude Sonnet 4", "organization_id": "anthropic" }, { "model_provider_id": 442, "model_id": "gemini-2.5-flash", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 30, "output_cents_per_million_tokens": 250, "quantization": null, "max_input_tokens": 1048576, "max_output_tokens": 65536, "throughput": 85.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.179386+00:00", "updated_at": "2025-07-19T19:49:17.179386+00:00", "provider_model_id_used": "gemini-2.5-flash", "model_name": "Gemini 2.5 Flash", "organization_id": "google" }, { "model_provider_id": 437, "model_id": "gemini-2.5-pro", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 125, "output_cents_per_million_tokens": 1000, "quantization": null, "max_input_tokens": 1048576, "max_output_tokens": 65536, "throughput": 85.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.168497+00:00", "updated_at": "2025-07-19T19:49:17.168497+00:00", "provider_model_id_used": "gemini-2.5-pro", "model_name": "Gemini 2.5 Pro", 
"organization_id": "google" }, { "model_provider_id": 440, "model_id": "gpt-4.1-mini-2025-04-14", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 40, "output_cents_per_million_tokens": 160, "quantization": null, "max_input_tokens": 1047576, "max_output_tokens": 32768, "throughput": 150.0, "latency": 5.0, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.174218+00:00", "updated_at": "2025-07-19T19:49:17.174218+00:00", "provider_model_id_used": "gpt-4.1-mini-2025-04-14", "model_name": "GPT-4.1 mini", "organization_id": "openai" }, { "model_provider_id": 439, "model_id": "grok-4", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 256000, "max_output_tokens": 8000, "throughput": 100.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.172505+00:00", "updated_at": "2025-07-19T19:49:17.172505+00:00", "provider_model_id_used": "grok-4", "model_name": "Grok-4", "organization_id": "xai" }, { "model_provider_id": 1231, "model_id": "gpt-oss-120b", "provider_id": 
"zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 15, "output_cents_per_million_tokens": 60, "quantization": null, "max_input_tokens": 131000, "max_output_tokens": 30000, "throughput": 500, "latency": 0.5, "feature_web_search": true, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": true, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-08-05T19:49:16.965756+00:00", "updated_at": "2025-08-05T19:49:16.965756+00:00", "provider_model_id_used": "gpt-oss-120b", "model_name": "OpenAI OSS 120B", "organization_id": "openai" }, { "model_provider_id": 1232, "model_id": "gpt-oss-20b", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 10, "output_cents_per_million_tokens": 50, "quantization": null, "max_input_tokens": 131000, "max_output_tokens": 30000, "throughput": 1000, "latency": 0.38, "feature_web_search": true, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": true, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-08-05T19:49:16.965756+00:00", "updated_at": "2025-08-05T19:49:16.965756+00:00", "provider_model_id_used": "gpt-oss-20b", "model_name": "OpenAI OSS 20B", "organization_id": "openai" }, { "model_provider_id": 1233, "model_id": "gpt-5-2025-08-07", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 125, "output_cents_per_million_tokens": 
1000, "quantization": null, "max_input_tokens": 400000, "max_output_tokens": 128000, "throughput": 100.0, "latency": 2.0, "feature_web_search": true, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": true, "feature_batch_inference": true, "feature_finetuning": true, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "provider_model_id_used": "gpt-5", "model_name": "GPT-5", "organization_id": "openai" }, { "model_provider_id": 1234, "model_id": "gpt-5-mini-2025-08-07", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 25, "output_cents_per_million_tokens": 200, "quantization": null, "max_input_tokens": 400000, "max_output_tokens": 128000, "throughput": 200.0, "latency": 1.0, "feature_web_search": true, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": true, "feature_batch_inference": true, "feature_finetuning": true, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "provider_model_id_used": "gpt-5-mini", "model_name": "GPT-5 mini", "organization_id": "openai" }, { "model_provider_id": 1235, "model_id": "gpt-5-nano-2025-08-07", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 5, "output_cents_per_million_tokens": 40, "quantization": null, "max_input_tokens": 400000, "max_output_tokens": 128000, "throughput": 500.0, "latency": 
0.3, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": true, "feature_batch_inference": true, "feature_finetuning": true, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-24T12:00:00.000000+00:00", "updated_at": "2025-07-24T12:00:00.000000+00:00", "provider_model_id_used": "gpt-5-nano", "model_name": "GPT-5 nano", "organization_id": "openai" }, { "model_provider_id": 1236, "model_id": "deepseek-v3.2-exp", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 27, "output_cents_per_million_tokens": 41, "quantization": "fp8", "max_input_tokens": 163840, "max_output_tokens": 65536, "throughput": 100.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "provider_model_id_used": "deepseek/deepseek-v3.2-exp", "model_name": "DeepSeek V3.2 Exp", "organization_id": "deepseek" }, { "model_provider_id": 1237, "model_id": "glm-4.5", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 60, "output_cents_per_million_tokens": 220, "quantization": "fp8", "max_input_tokens": 131072, "max_output_tokens": 98304, "throughput": 85.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": 
true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "provider_model_id_used": "zai-org/glm-4.5", "model_name": "GLM-4.5", "organization_id": "zai-org" }, { "model_provider_id": 1238, "model_id": "glm-4.5v", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 60, "output_cents_per_million_tokens": 220, "quantization": "fp8", "max_input_tokens": 131072, "max_output_tokens": 65536, "throughput": 85.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": true, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "provider_model_id_used": "zai-org/GLM-4.5V", "model_name": "GLM-4.5V", "organization_id": "zai-org" }, { "model_provider_id": 1239, "model_id": "kimi-k2-0905", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 60, "output_cents_per_million_tokens": 250, "quantization": "fp8", "max_input_tokens": 262144, "max_output_tokens": 262144, "throughput": null, "latency": null, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, 
"input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "provider_model_id_used": "moonshotai/kimi-k2-0905", "model_name": "Kimi K2 0905", "organization_id": "moonshotai" }, { "model_provider_id": 1241, "model_id": "deepseek-r1", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 600, "quantization": null, "max_input_tokens": 65536, "max_output_tokens": 65536, "throughput": 189.0, "latency": 0.067, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": false, "input_modality_audio": false, "input_modality_video": false, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-29T00:00:00.000000+00:00", "updated_at": "2025-09-29T00:00:00.000000+00:00", "provider_model_id_used": "deepseek-r1", "model_name": "DeepSeek R1 671B", "organization_id": "deepseek" }, { "model_provider_id": 1242, "model_id": "claude-sonnet-4-5-20250929", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 300, "output_cents_per_million_tokens": 1500, "quantization": null, "max_input_tokens": 200000, "max_output_tokens": 64000, "throughput": 42.0, "latency": 0.4, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, 
"input_modality_audio": true, "input_modality_video": true, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-07-19T19:49:17.084616+00:00", "updated_at": "2025-07-19T19:49:17.084616+00:00", "provider_model_id_used": "claude-sonnet-4-5-20250929", "model_name": "Claude Sonnet 4.5", "organization_id": "anthropic" }, { "model_provider_id": 1243, "model_id": "glm-4.6", "provider_id": "zeroeval", "deprecated_at": null, "input_cents_per_million_tokens": 60, "output_cents_per_million_tokens": 200, "quantization": "fp8", "max_input_tokens": 131072, "max_output_tokens": 65536, "throughput": 85.0, "latency": 0.7, "feature_web_search": false, "feature_function_calling": true, "feature_structured_output": true, "feature_code_execution": false, "feature_batch_inference": true, "feature_finetuning": false, "input_modality_text": true, "input_modality_image": true, "input_modality_audio": false, "input_modality_video": true, "output_modality_text": true, "output_modality_image": false, "output_modality_audio": false, "output_modality_video": false, "created_at": "2025-09-30T00:00:00.000000+00:00", "updated_at": "2025-09-30T00:00:00.000000+00:00", "provider_model_id_used": "zai-org/GLM-4.6", "model_name": "GLM-4.6", "organization_id": "zai-org" } ] ================================================ FILE: data/providers/zeroeval/provider.json ================================================ { "provider_id": "zeroeval", "name": "ZeroEval", "website": "https://zeroeval.com", "created_at": "2025-07-15T06:36:02.543462+00:00", "updated_at": "2025-07-15T06:36:02.543462+00:00" } ================================================ FILE: package.json ================================================ { "scripts": { "validate-schemas": "node scripts/validate-schemas.js" }, "devDependencies": { "glob": "^10.4.5", "tv4": "^1.3.0" } } ================================================ FILE: schemas/README.md 
================================================ # JSON Schemas for LLM Stats Data This directory contains JSON Schema definitions for all data types used in the LLM Stats project. These schemas define the structure, types, and validation rules for data stored in the hierarchical file system under `data/`. ## Schema Files ### Core Entity Schemas - **`organization.schema.json`** - Schema for AI/ML organizations (e.g., OpenAI, Anthropic) - **`model.schema.json`** - Schema for model metadata - **`license.schema.json`** - Schema for software licenses governing model usage - **`benchmark.schema.json`** - Schema for evaluation benchmark definitions - **`provider.schema.json`** - Schema for model inference providers (e.g., AWS Bedrock, Google Vertex) ### Relationship Schemas - **`benchmark-results.schema.json`** - Schema for model performance scores on benchmarks - **`provider-models.schema.json`** - Schema for provider-specific model configurations and pricing ## Data Structure The schemas correspond to data organized hierarchically: ``` data/ ├── organizations/ │ └── [org_id]/ │ ├── organization.json # Validates against organization.schema.json │ └── models/ │ └── [model_id]/ │ ├── model.json # Validates against model.schema.json │ └── benchmarks.json # Array validating against benchmark-results.schema.json ├── providers/ │ └── [provider_id]/ │ ├── provider.json # Validates against provider.schema.json │ └── models.json # Array validating against provider-models.schema.json ├── licenses/ │ └── [license_id].json # Validates against license.schema.json └── benchmarks/ └── [benchmark_id].json # Validates against benchmark.schema.json ``` ## Usage These schemas can be used for: 1. **Data Validation** - Ensure all data files conform to expected structure 2. **Documentation** - Understand what fields are available and their meanings 3. **Code Generation** - Generate TypeScript interfaces or other language types 4. 
**API Contracts** - Define expected request/response formats ## Validation Example To validate a data file against its schema using Python: ```python import json import jsonschema # Load schema with open('schemas/model.schema.json') as f: schema = json.load(f) # Load data with open('data/organizations/openai/models/gpt-4/model.json') as f: data = json.load(f) # Validate jsonschema.validate(instance=data, schema=schema) ``` ## Schema Features All schemas use JSON Schema Draft 7 and include: - **Descriptions** - Every field has a human-readable description - **Types** - Strict type definitions with null handling - **Patterns** - Regular expressions for ID formats - **Examples** - Real-world examples for clarity - **Enums** - Restricted value sets where applicable - **Format Validators** - For dates, URIs, etc. - **Required Fields** - Clearly defined required vs optional ## Contributing When adding new fields or modifying schemas: 1. Update the relevant schema file 2. Add clear descriptions and examples 3. Consider backward compatibility 4. Update this README if adding new schemas 5. Validate existing data against updated schemas ## Schema Versioning Currently, all schemas target JSON Schema Draft 7. Future versions may adopt newer drafts as tooling support improves. 
================================================ FILE: schemas/benchmark-results.schema.json ================================================ { "$schema": "http://json-schema.org/draft-07/schema#", "title": "ModelBenchmark", "description": "Schema for model performance scores on benchmarks", "type": "object", "properties": { "model_benchmark_id": { "type": "integer", "description": "Unique identifier for this model-benchmark result", "minimum": 1 }, "benchmark_id": { "type": "string", "description": "ID of the benchmark" }, "model_id": { "type": "string", "description": "ID of the model" }, "score": { "type": "number", "description": "Raw score achieved on the benchmark", "minimum": 0 }, "normalized_score": { "type": ["number", "null"], "description": "Score normalized to 0-1 range for cross-benchmark comparison", "minimum": 0, "maximum": 1 }, "is_self_reported": { "type": "boolean", "description": "Whether the score was self-reported by the model creator", "default": true }, "self_reported_source_link": { "type": ["string", "null"], "format": "uri", "description": "URL to the source of self-reported scores" }, "verified_by_llmstats": { "type": "boolean", "description": "Whether the score has been independently verified by llm-stats", "default": false }, "analysis_method": { "type": ["string", "null"], "description": "Method used for evaluation (e.g., '0-shot', '5-shot', 'CoT')", "examples": [ "0-shot", "5-shot", "few-shot", "chain-of-thought", "zero-shot CoT" ] }, "verification_provider_id": { "type": ["string", "null"], "description": "Provider used for independent verification" }, "verification_hardware": { "type": ["string", "null"], "description": "Hardware used for verification", "examples": ["H100 on Modal", "A100 on AWS", "4xA100 on GCP"] }, "verification_date": { "type": ["string", "null"], "format": "date", "description": "Date when the score was independently verified" }, "verification_notes": { "type": ["string", "null"], "description": "Additional 
notes about the verification process" }, "created_at": { "type": "string", "format": "date-time", "description": "Timestamp when the record was created" }, "updated_at": { "type": "string", "format": "date-time", "description": "Timestamp when the record was last updated" }, "benchmark_name": { "type": "string", "description": "Display name of the benchmark (denormalized for convenience)" } }, "required": [ "model_benchmark_id", "benchmark_id", "model_id", "score", "is_self_reported", "verified_by_llmstats", "created_at", "updated_at", "benchmark_name" ], "additionalProperties": false } ================================================ FILE: schemas/benchmark.schema.json ================================================ { "$schema": "http://json-schema.org/draft-07/schema#", "title": "Benchmark", "description": "Schema for AI/ML evaluation benchmark definitions", "type": "object", "properties": { "benchmark_id": { "type": "string", "description": "Unique identifier for the benchmark", "examples": [ "mmlu", "humaneval", "arc-c", "gsm8k", "mbpp-pass@1", "humanity's-last-exam" ] }, "name": { "type": "string", "description": "Display name of the benchmark", "examples": ["MMLU", "HumanEval", "ARC-Challenge", "GSM8K"] }, "parent_benchmark_id": { "type": ["string", "null"], "description": "ID of parent benchmark if this is a subset or variant" }, "categories": { "type": "array", "description": "Array of categories that this benchmark belongs to", "items": { "type": "string", "enum": [ "general", "code", "math", "reasoning", "language", "multimodal", "safety", "long_context", "roleplay", "agents", "factuality", "vision", "audio", "video", "text-to-image", "image-to-text", "text-to-speech", "speech-to-text", "text-to-video", "video-to-text", "legal", "healthcare", "finance", "chemistry", "economics", "coding", "creativity", "psychology", "games", "communication", "physics", "spatial_reasoning", "summarization", "frontend_development", "writing", "search" ] }, "minItems": 1, 
/**
 * Cross-file integrity checker for the LLM Stats data tree.
 *
 * Loads every organization, model, benchmark, license and provider file
 * under `dataDir`, then looks for duplicate IDs/names, dangling references
 * between entities, and unused ("orphaned") records. Problems are collected
 * in `errors` (make validation fail) and `warnings` (reported only) instead
 * of being thrown.
 */
class IntegrityValidator {
  /**
   * @param {string} [dataDir] - Root of the data tree. Defaults to the
   *   `data` directory that sits next to this file's parent directory.
   */
  constructor(dataDir) {
    this.dataDir = dataDir || path.join(__dirname, "..", "data");
    this.errors = [];
    this.warnings = [];

    // Collections to store all entities
    this.models = new Map();
    this.benchmarks = new Map();
    this.organizations = new Map();
    this.licenses = new Map();
    this.providers = new Map();

    // Maps to check for duplicates
    // Note: Model names can be duplicated (e.g., different versions), only IDs must be unique
    this.benchmarkNames = new Map();

    // Parsed benchmarks.json files, filled lazily by loadBenchmarkResults()
    // so each file is read (and a read failure reported) only once.
    this.benchmarkResultsCache = null;
  }

  /**
   * Read and parse one JSON file.
   *
   * @param {string} filePath - File to read.
   * @returns {*} The parsed value, or `null` when reading/parsing failed
   *   (in which case a message is appended to `errors`).
   */
  loadJSON(filePath) {
    try {
      const content = fs.readFileSync(filePath, "utf8");
      return JSON.parse(content);
    } catch (error) {
      this.errors.push(`Failed to load ${filePath}: ${error.message}`);
      return null;
    }
  }

  /**
   * Load every per-model `benchmarks.json` file once and memoize the result.
   *
   * Files that fail to parse are reported through `loadJSON`; files whose
   * top-level value is not an array are left out, as the individual checks
   * have always ignored them.
   *
   * @returns {Array<{file: string, results: Array}>} One entry per usable file.
   */
  loadBenchmarkResults() {
    if (this.benchmarkResultsCache === null) {
      const files = glob.sync(
        path.join(this.dataDir, "organizations/*/models/*/benchmarks.json")
      );
      const loaded = [];
      for (const file of files) {
        const results = this.loadJSON(file);
        if (results && Array.isArray(results)) {
          loaded.push({ file, results });
        }
      }
      this.benchmarkResultsCache = loaded;
    }
    return this.benchmarkResultsCache;
  }

  /**
   * Load all entity files into the in-memory maps, recording duplicate
   * model/benchmark IDs as errors and grouping benchmarks by display name
   * for the later duplicate-name check.
   *
   * @returns {Promise<void>}
   */
  async loadAllData() {
    console.log("\n📂 Loading all data files...\n");

    // Load organizations
    const orgFiles = glob.sync(
      path.join(this.dataDir, "organizations/*/organization.json")
    );
    for (const file of orgFiles) {
      const data = this.loadJSON(file);
      if (data) {
        this.organizations.set(data.organization_id, data);
      }
    }
    console.log(`✅ Loaded ${this.organizations.size} organizations`);

    // Load models
    const modelFiles = glob.sync(
      path.join(this.dataDir, "organizations/*/models/*/model.json")
    );
    for (const file of modelFiles) {
      const data = this.loadJSON(file);
      if (data) {
        // Check for duplicate model IDs
        if (this.models.has(data.model_id)) {
          const existing = this.models.get(data.model_id);
          this.errors.push(
            `❌ Duplicate model ID "${data.model_id}" found:\n` +
              ` - First occurrence: ${path.relative(this.dataDir, existing.file)}\n` +
              ` - Duplicate found: ${path.relative(this.dataDir, file)}`
          );
        }
        // The source file is kept alongside the data for later error messages.
        this.models.set(data.model_id, { ...data, file });
      }
    }
    console.log(`✅ Loaded ${this.models.size} models`);

    // Load benchmarks
    const benchmarkFiles = glob.sync(
      path.join(this.dataDir, "benchmarks/*.json")
    );
    for (const file of benchmarkFiles) {
      const data = this.loadJSON(file);
      if (data) {
        // Check for duplicate benchmark IDs
        if (this.benchmarks.has(data.benchmark_id)) {
          const existing = this.benchmarks.get(data.benchmark_id);
          this.errors.push(
            `❌ Duplicate benchmark ID "${data.benchmark_id}" found:\n` +
              ` - First occurrence: ${path.relative(this.dataDir, existing.file)}\n` +
              ` - Duplicate found: ${path.relative(this.dataDir, file)}`
          );
        }
        this.benchmarks.set(data.benchmark_id, { ...data, file });

        // Group by display name; checkDuplicates() reports groups larger than one.
        if (this.benchmarkNames.has(data.name)) {
          this.benchmarkNames
            .get(data.name)
            .push({ id: data.benchmark_id, file });
        } else {
          this.benchmarkNames.set(data.name, [{ id: data.benchmark_id, file }]);
        }
      }
    }
    console.log(`✅ Loaded ${this.benchmarks.size} benchmarks`);

    // Load licenses
    const licenseFiles = glob.sync(path.join(this.dataDir, "licenses/*.json"));
    for (const file of licenseFiles) {
      const data = this.loadJSON(file);
      if (data) {
        this.licenses.set(data.license_id, data);
      }
    }
    console.log(`✅ Loaded ${this.licenses.size} licenses`);

    // Load providers
    const providerFiles = glob.sync(
      path.join(this.dataDir, "providers/*/provider.json")
    );
    for (const file of providerFiles) {
      const data = this.loadJSON(file);
      if (data) {
        this.providers.set(data.provider_id, data);
      }
    }
    console.log(`✅ Loaded ${this.providers.size} providers`);
  }

  /**
   * Report benchmark display names shared by more than one benchmark.
   * Model names may legitimately repeat, so only benchmarks are checked;
   * duplicate IDs are already detected while loading.
   */
  checkDuplicates() {
    console.log("\n🔍 Checking for duplicate names...\n");

    let duplicatesFound = false;

    // Check duplicate benchmark names (benchmark names should be unique)
    for (const [name, instances] of this.benchmarkNames.entries()) {
      if (instances.length > 1) {
        duplicatesFound = true;
        this.errors.push(
          `❌ Duplicate benchmark name "${name}" found in ${instances.length} benchmarks:\n` +
            instances
              .map(
                (i) => ` - ${i.id} in ${path.relative(this.dataDir, i.file)}`
              )
              .join("\n")
        );
      }
    }

    if (!duplicatesFound) {
      console.log("✅ No duplicate benchmark names found");
    }
  }

  /**
   * Verify that every ID used as a reference points at a loaded entity:
   * model -> organization / license / base model / model family,
   * benchmark result -> model / benchmark / verification provider,
   * provider model -> model / provider, benchmark -> parent benchmark.
   * Missing model families and verification providers are warnings; all
   * other dangling references are errors.
   */
  checkReferences() {
    console.log("\n🔗 Checking references...\n");

    // Remember the counts on entry so the success message reflects only what
    // this phase found, not errors raised earlier (e.g. while loading).
    const errorsBefore = this.errors.length;
    const warningsBefore = this.warnings.length;

    // Check model references
    for (const [modelId, model] of this.models.entries()) {
      const relPath = path.relative(this.dataDir, model.file);

      // Check organization reference
      if (
        model.organization_id &&
        !this.organizations.has(model.organization_id)
      ) {
        this.errors.push(
          `❌ Model "${modelId}" references non-existent organization "${model.organization_id}"\n` +
            ` in ${relPath}`
        );
      }

      // Check license reference
      if (model.license_id && !this.licenses.has(model.license_id)) {
        this.errors.push(
          `❌ Model "${modelId}" references non-existent license "${model.license_id}"\n` +
            ` in ${relPath}`
        );
      }

      // Check fine-tuned from reference
      if (
        model.fine_tuned_from_model_id &&
        !this.models.has(model.fine_tuned_from_model_id)
      ) {
        this.errors.push(
          `❌ Model "${modelId}" references non-existent base model "${model.fine_tuned_from_model_id}"\n` +
            ` in ${relPath}`
        );
      }

      // Check model family reference (warning only)
      if (model.model_family_id && !this.models.has(model.model_family_id)) {
        this.warnings.push(
          `⚠️ Model "${modelId}" references model family "${model.model_family_id}" which doesn't exist as a model\n` +
            ` in ${relPath}`
        );
      }
    }

    // Check benchmark results references
    for (const { file, results } of this.loadBenchmarkResults()) {
      const relPath = path.relative(this.dataDir, file);

      for (let i = 0; i < results.length; i++) {
        const result = results[i];

        // Check model_id reference
        if (result.model_id && !this.models.has(result.model_id)) {
          this.errors.push(
            `❌ Benchmark result [${i}] references non-existent model "${result.model_id}"\n` +
              ` in ${relPath}`
          );
        }

        // Check benchmark_id reference
        if (
          result.benchmark_id &&
          !this.benchmarks.has(result.benchmark_id)
        ) {
          this.errors.push(
            `❌ Benchmark result [${i}] references non-existent benchmark "${result.benchmark_id}"\n` +
              ` in ${relPath}`
          );
        }

        // Check verification_provider_id reference (warning only)
        if (
          result.verification_provider_id &&
          !this.providers.has(result.verification_provider_id)
        ) {
          this.warnings.push(
            `⚠️ Benchmark result [${i}] references non-existent verification provider "${result.verification_provider_id}"\n` +
              ` in ${relPath}`
          );
        }
      }
    }

    // Check provider models references
    const providerModelFiles = glob.sync(
      path.join(this.dataDir, "providers/*/models.json")
    );
    for (const file of providerModelFiles) {
      const models = this.loadJSON(file);
      if (models && Array.isArray(models)) {
        const relPath = path.relative(this.dataDir, file);

        for (let i = 0; i < models.length; i++) {
          const providerModel = models[i];

          // Check model_id reference
          if (
            providerModel.model_id &&
            !this.models.has(providerModel.model_id)
          ) {
            this.errors.push(
              `❌ Provider model [${i}] references non-existent model "${providerModel.model_id}"\n` +
                ` in ${relPath}`
            );
          }

          // Check provider_id reference
          if (
            providerModel.provider_id &&
            !this.providers.has(providerModel.provider_id)
          ) {
            this.errors.push(
              `❌ Provider model [${i}] references non-existent provider "${providerModel.provider_id}"\n` +
                ` in ${relPath}`
            );
          }
        }
      }
    }

    // Check benchmark parent references
    for (const [benchmarkId, benchmark] of this.benchmarks.entries()) {
      if (
        benchmark.parent_benchmark_id &&
        !this.benchmarks.has(benchmark.parent_benchmark_id)
      ) {
        const relPath = path.relative(this.dataDir, benchmark.file);
        this.errors.push(
          `❌ Benchmark "${benchmarkId}" references non-existent parent benchmark "${benchmark.parent_benchmark_id}"\n` +
            ` in ${relPath}`
        );
      }
    }

    if (
      this.errors.length === errorsBefore &&
      this.warnings.length === warningsBefore
    ) {
      console.log("✅ All references are valid");
    }
  }

  /**
   * Count entities nothing refers to and record one summary warning per
   * kind: models with no benchmark results, benchmarks no result uses, and
   * licenses no model uses.
   */
  checkOrphans() {
    console.log("\n👻 Checking for orphaned data...\n");

    // One pass over the (cached) result files feeds both the model and the
    // benchmark usage sets.
    const modelsWithBenchmarks = new Set();
    const usedBenchmarks = new Set();
    for (const { results } of this.loadBenchmarkResults()) {
      for (const result of results) {
        modelsWithBenchmarks.add(result.model_id);
        usedBenchmarks.add(result.benchmark_id);
      }
    }

    // Check for models without benchmark results
    let modelsWithoutBenchmarks = 0;
    for (const modelId of this.models.keys()) {
      if (!modelsWithBenchmarks.has(modelId)) {
        modelsWithoutBenchmarks++;
      }
    }

    if (modelsWithoutBenchmarks > 0) {
      this.warnings.push(
        `⚠️ ${modelsWithoutBenchmarks} models have no benchmark results`
      );
    }

    // Check for unused benchmarks
    let unusedBenchmarks = 0;
    for (const benchmarkId of this.benchmarks.keys()) {
      if (!usedBenchmarks.has(benchmarkId)) {
        unusedBenchmarks++;
      }
    }

    if (unusedBenchmarks > 0) {
      this.warnings.push(
        `⚠️ ${unusedBenchmarks} benchmarks are not used by any model`
      );
    }

    // Check for unused licenses
    const usedLicenses = new Set();
    for (const model of this.models.values()) {
      if (model.license_id) {
        usedLicenses.add(model.license_id);
      }
    }

    let unusedLicenses = 0;
    for (const licenseId of this.licenses.keys()) {
      if (!usedLicenses.has(licenseId)) {
        unusedLicenses++;
      }
    }

    if (unusedLicenses > 0) {
      this.warnings.push(
        `⚠️ ${unusedLicenses} licenses are not used by any model`
      );
    }
  }

  /**
   * Run the full pipeline (load, duplicates, references, orphans) and print
   * a summary of everything collected.
   *
   * @returns {Promise<boolean>} `true` when no errors were recorded;
   *   warnings alone do not fail validation.
   */
  async validate() {
    console.log("🔍 Running Data Integrity Validation...\n");
    console.log(`Data directory: ${this.dataDir}\n`);

    await this.loadAllData();
    this.checkDuplicates();
    this.checkReferences();
    this.checkOrphans();

    // Print summary
    console.log("\n" + "=".repeat(60));
    console.log("📊 Validation Summary");
    console.log("=".repeat(60));

    if (this.errors.length > 0) {
      console.log(`\n❌ Found ${this.errors.length} errors:\n`);
      this.errors.forEach((error) => console.log(error));
    }

    if (this.warnings.length > 0) {
      console.log(`\n⚠️ Found ${this.warnings.length} warnings:\n`);
      this.warnings.forEach((warning) => console.log(warning));
    }

    if (this.errors.length === 0 && this.warnings.length === 0) {
      console.log("\n✅ All integrity checks passed! 🎉");
      return true;
    }

    console.log("\n" + "=".repeat(60));

    return this.errors.length === 0;
  }
}
"object", "properties": { "model_id": { "type": "string", "description": "Unique identifier for the model", "examples": ["gpt-4", "claude-3-opus", "llama-3.1-405b"] }, "name": { "type": "string", "description": "Display name of the model", "examples": ["GPT-4", "Claude 3 Opus", "Llama 3.1 405B"] }, "organization_id": { "type": "string", "description": "ID of the organization that created the model" }, "model_family_id": { "type": ["string", "null"], "description": "ID of the model family this model belongs to", "examples": ["gpt-4", "claude-3", "llama-3-1"] }, "fine_tuned_from_model_id": { "type": ["string", "null"], "description": "ID of the base model this was fine-tuned from" }, "description": { "type": ["string", "null"], "description": "Detailed description of the model's capabilities and use cases" }, "release_date": { "type": ["string", "null"], "format": "date", "description": "Date when the model was released (YYYY-MM-DD)", "examples": ["2024-11-20", "2023-03-14"] }, "announcement_date": { "type": ["string", "null"], "format": "date", "description": "Date when the model was first announced (YYYY-MM-DD)" }, "license_id": { "type": ["string", "null"], "description": "ID of the license governing the model's use" }, "multimodal": { "type": "boolean", "description": "Whether the model supports multiple input/output modalities", "default": false }, "knowledge_cutoff": { "type": ["string", "null"], "format": "date", "description": "Date up to which the model has training data (YYYY-MM-DD)" }, "param_count": { "type": ["number", "null"], "description": "Number of parameters in the model (in billions)", "minimum": 0, "examples": [175, 405, 1.8] }, "training_tokens": { "type": ["number", "null"], "description": "Number of tokens the model was trained on (in trillions)", "minimum": 0 }, "available_in_zeroeval": { "type": "boolean", "description": "Whether the model is available for evaluation in ZeroEval", "default": true }, "source_api_ref": { "type": ["string", 
"null"], "format": "uri", "description": "URL to the official API documentation" }, "source_playground": { "type": ["string", "null"], "format": "uri", "description": "URL to an interactive playground or demo" }, "source_paper": { "type": ["string", "null"], "format": "uri", "description": "URL to the research paper or technical report" }, "source_scorecard_blog_link": { "type": ["string", "null"], "format": "uri", "description": "URL to scorecard or evaluation blog post" }, "source_repo_link": { "type": ["string", "null"], "format": "uri", "description": "URL to the model's code repository" }, "source_weights_link": { "type": ["string", "null"], "format": "uri", "description": "URL to download model weights" }, "created_at": { "type": "string", "format": "date-time", "description": "Timestamp when the record was created" }, "updated_at": { "type": "string", "format": "date-time", "description": "Timestamp when the record was last updated" } }, "required": [ "model_id", "name", "organization_id", "multimodal", "available_in_zeroeval", "created_at", "updated_at" ], "additionalProperties": false } ================================================ FILE: schemas/organization.schema.json ================================================ { "$schema": "http://json-schema.org/draft-07/schema#", "title": "Organization", "description": "Schema for AI/ML organization data", "type": "object", "properties": { "organization_id": { "type": "string", "description": "Unique identifier for the organization", "examples": ["openai", "anthropic", "google", "amazon"] }, "name": { "type": "string", "description": "Display name of the organization", "examples": ["OpenAI", "Anthropic", "Google", "Amazon"] }, "website": { "type": "string", "format": "uri", "description": "Official website URL of the organization", "examples": ["https://openai.com", "https://anthropic.com"] }, "description": { "type": ["string", "null"], "description": "Brief description of the organization and its focus 
areas", "examples": ["Cloud and AI services", "AI safety and research company"] }, "country": { "type": ["string", "null"], "description": "Country where the organization is headquartered (ISO 3166-1 alpha-2 code)", "examples": ["US", "GB", "CN"] }, "created_at": { "type": "string", "format": "date-time", "description": "Timestamp when the record was created in the database" }, "updated_at": { "type": "string", "format": "date-time", "description": "Timestamp when the record was last updated in the database" } }, "required": [ "organization_id", "name", "website", "created_at", "updated_at" ], "additionalProperties": false } ================================================ FILE: schemas/provider-models.schema.json ================================================ { "$schema": "http://json-schema.org/draft-07/schema#", "title": "ProviderModel", "description": "Schema for provider-specific model configurations and pricing", "type": "object", "properties": { "model_provider_id": { "type": "integer", "description": "Unique identifier for this provider-model configuration", "minimum": 1 }, "model_id": { "type": "string", "description": "ID of the model" }, "provider_id": { "type": "string", "description": "ID of the provider offering this model" }, "provider_model_id_used": { "type": ["string", "null"], "description": "Model ID as used by the provider's API", "examples": ["gpt-4-turbo", "claude-3-opus-20240229"] }, "deprecated_at": { "type": ["string", "null"], "format": "date-time", "description": "Timestamp when this model configuration was deprecated" }, "input_cents_per_million_tokens": { "type": ["number", "null"], "description": "Cost in cents per million input tokens", "minimum": 0, "examples": [1000, 300, 80] }, "output_cents_per_million_tokens": { "type": ["number", "null"], "description": "Cost in cents per million output tokens", "minimum": 0, "examples": [3000, 1500, 400] }, "quantization": { "type": ["string", "null"], "description": "Quantization method 
applied to the model", "examples": ["int8", "int4", "fp16", "bf16"] }, "max_input_tokens": { "type": ["integer", "null"], "description": "Maximum number of input tokens supported", "minimum": 1, "examples": [128000, 200000, 32000] }, "max_output_tokens": { "type": ["integer", "null"], "description": "Maximum number of output tokens supported", "minimum": 1, "examples": [4096, 8192, 200000] }, "throughput": { "type": ["number", "null"], "description": "Tokens per second throughput", "minimum": 0, "examples": [42.0, 150.5, 200.0] }, "latency": { "type": ["number", "null"], "description": "Time to first token in seconds", "minimum": 0, "examples": [0.4, 0.2, 1.5] }, "feature_web_search": { "type": ["boolean", "null"], "description": "Whether web search is available", "default": false }, "feature_function_calling": { "type": ["boolean", "null"], "description": "Whether function/tool calling is supported", "default": false }, "feature_structured_output": { "type": ["boolean", "null"], "description": "Whether structured output (JSON mode) is supported", "default": false }, "feature_code_execution": { "type": ["boolean", "null"], "description": "Whether code execution is supported", "default": false }, "feature_batch_inference": { "type": ["boolean", "null"], "description": "Whether batch inference is available", "default": false }, "feature_finetuning": { "type": ["boolean", "null"], "description": "Whether fine-tuning is available", "default": false }, "input_modality_text": { "type": ["boolean", "null"], "description": "Whether text input is supported", "default": true }, "input_modality_image": { "type": ["boolean", "null"], "description": "Whether image input is supported", "default": false }, "input_modality_audio": { "type": ["boolean", "null"], "description": "Whether audio input is supported", "default": false }, "input_modality_video": { "type": ["boolean", "null"], "description": "Whether video input is supported", "default": false }, "output_modality_text": { 
"type": ["boolean", "null"], "description": "Whether text output is supported", "default": true }, "output_modality_image": { "type": ["boolean", "null"], "description": "Whether image output is supported", "default": false }, "output_modality_audio": { "type": ["boolean", "null"], "description": "Whether audio output is supported", "default": false }, "output_modality_video": { "type": ["boolean", "null"], "description": "Whether video output is supported", "default": false }, "created_at": { "type": "string", "format": "date-time", "description": "Timestamp when the record was created" }, "updated_at": { "type": "string", "format": "date-time", "description": "Timestamp when the record was last updated" }, "model_name": { "type": "string", "description": "Display name of the model (denormalized for convenience)" }, "organization_id": { "type": "string", "description": "ID of the organization that created the model (denormalized)" } }, "required": [ "model_provider_id", "model_id", "provider_id", "created_at", "updated_at", "model_name", "organization_id" ], "additionalProperties": false } ================================================ FILE: schemas/provider.schema.json ================================================ { "$schema": "http://json-schema.org/draft-07/schema#", "title": "Provider", "description": "Schema for AI model inference providers", "type": "object", "properties": { "provider_id": { "type": "string", "description": "Unique identifier for the provider", "examples": ["openai", "anthropic", "google", "aws-bedrock", "azure"] }, "name": { "type": "string", "description": "Display name of the provider", "examples": [ "OpenAI", "Anthropic", "Google", "AWS Bedrock", "Azure OpenAI" ] }, "website": { "type": "string", "format": "uri", "description": "Official website or API documentation URL", "examples": ["https://openai.com/api", "https://docs.anthropic.com"] }, "created_at": { "type": "string", "format": "date-time", "description": "Timestamp when the 
/**
 * Validate every file matching a glob pattern against one JSON schema.
 *
 * The schema is read from `<schemaName>.schema.json` next to this script and
 * `filePattern` is resolved relative to this script's parent directory.
 * Results are printed per file; nothing is thrown.
 *
 * @param {string} schemaName - Schema base name, e.g. "model".
 * @param {string} filePattern - Glob pattern for the data files to check.
 * @param {boolean} [isArray=false] - When true each file must contain an
 *   array and every element is validated against the schema individually.
 * @returns {boolean} `true` when all matched files are valid (or no file
 *   matched), `false` when the schema could not be read or any file failed.
 */
function validateSchema(schemaName, filePattern, isArray = false) {
  console.log(`\nValidating ${schemaName}...`);

  const schemaPath = path.join(__dirname, `${schemaName}.schema.json`);
  let schema;

  try {
    schema = JSON.parse(fs.readFileSync(schemaPath, "utf8"));
  } catch (error) {
    console.error(`Error reading schema file: ${schemaPath}`);
    console.error(error);
    return false;
  }

  const files = glob.sync(path.join(__dirname, "..", filePattern));

  if (files.length === 0) {
    // A pattern that matches nothing is reported but not treated as a failure.
    console.warn(`⚠️ No files found matching pattern: ${filePattern}`);
    return true;
  }

  let isValid = true;

  for (const file of files) {
    try {
      const data = JSON.parse(fs.readFileSync(file, "utf8"));

      // If expecting an array, validate each item
      if (isArray) {
        if (!Array.isArray(data)) {
          // typeof null is "object"; name it explicitly so the message is accurate.
          const actualType = data === null ? "null" : typeof data;
          console.error(
            `❌ Invalid: ${file} - Expected array but got ${actualType}`
          );
          isValid = false;
          continue;
        }

        let allItemsValid = true;
        data.forEach((item, index) => {
          const result = tv4.validateMultiple(item, schema);
          if (!result.valid) {
            console.error(`❌ Invalid item [${index}] in: ${file}`);
            result.errors.forEach((error) =>
              console.error(` - ${error.message} at ${error.dataPath}`)
            );
            allItemsValid = false;
          }
        });

        if (allItemsValid) {
          console.log(`✅ Valid: ${file} (${data.length} items)`);
        } else {
          isValid = false;
        }
      } else {
        // Single object validation
        const result = tv4.validateMultiple(data, schema);

        if (result.valid) {
          console.log(`✅ Valid: ${file}`);
        } else {
          console.error(`❌ Invalid: ${file}`);
          result.errors.forEach((error) =>
            console.error(` - ${error.message} at ${error.dataPath}`)
          );
          isValid = false;
        }
      }
    } catch (error) {
      // Unreadable or malformed JSON counts as a validation failure.
      console.error(`Error processing file: ${file}`);
      console.error(error);
      isValid = false;
    }
  }

  return isValid;
}

console.log("🔍 Validating LLM Stats Data Structure...\n");
console.log("=".repeat(60));
console.log("Phase 1: Schema Validation");
console.log("=".repeat(60));

// Validate all data types
const validations = [
  // Core entities
  {
    schema: "organization",
    pattern: "data/organizations/*/organization.json",
  },
  {
    schema: "model",
    pattern: "data/organizations/*/models/*/model.json",
  },
  { schema: "license", pattern: "data/licenses/*.json" },
  { schema: "benchmark", pattern: "data/benchmarks/*.json" },
  { schema: "provider", pattern: "data/providers/*/provider.json" },

  // Arrays
  {
    schema: "benchmark-results",
    pattern: "data/organizations/*/models/*/benchmarks.json",
    isArray: true,
  },
  {
    schema: "provider-models",
    pattern: "data/providers/*/models.json",
    isArray: true,
  },
];

// Every schema is validated even after a failure so all problems are listed.
let allValid = true;
for (const { schema, pattern, isArray } of validations) {
  const isValid = validateSchema(schema, pattern, isArray);
  allValid = allValid && isValid;
}

if (allValid) {
  console.log("\n✅ All schemas are valid! 🎉");

  // Phase 2 only runs on schema-valid data.
  console.log("\n" + "=".repeat(60));
  console.log("Phase 2: Data Integrity Validation");
  console.log("=".repeat(60));

  const IntegrityValidator = require("./integrity-validator.js");
  const integrityValidator = new IntegrityValidator();

  integrityValidator
    .validate()
    .then((integrityValid) => {
      if (integrityValid) {
        console.log("\n🎉 All validations passed successfully!");
        process.exit(0);
      } else {
        console.error("\n❌ Data integrity validation failed.");
        process.exit(1);
      }
    })
    .catch((error) => {
      // An exception inside the integrity run must fail the process explicitly
      // rather than depend on the runtime's unhandled-rejection policy.
      console.error("\n❌ Data integrity validation failed.");
      console.error(error);
      process.exit(1);
    });
} else {
  console.error("\n❌ Schema validation failed.");
  process.exit(1);
}