Repository: JonathanChavezTamales/LLMStats
Branch: main
Commit: 872b75f63b8d
Files: 778
Total size: 2.2 MB

Directory structure:
gitextract_261_qksq/

├── .github/
│   ├── pull_request_template.md
│   └── workflows/
│       └── schema-validation.yml
├── .gitignore
├── .vscode/
│   └── settings.json
├── CONTRIBUTING.md
├── LICENSE.md
├── README.md
├── data/
│   ├── .github/
│   │   └── CODEOWNERS
│   ├── benchmarks/
│   │   ├── aa-index.json
│   │   ├── acebench.json
│   │   ├── activitynet.json
│   │   ├── agieval.json
│   │   ├── ai2-reasoning-challenge-(arc).json
│   │   ├── ai2d.json
│   │   ├── aider-polyglot-edit.json
│   │   ├── aider-polyglot.json
│   │   ├── aider.json
│   │   ├── aime-2024.json
│   │   ├── aime-2025.json
│   │   ├── aime.json
│   │   ├── aitz-em.json
│   │   ├── alignbench.json
│   │   ├── alpacaeval-2.0.json
│   │   ├── amc-2022-23.json
│   │   ├── android-control-high-em.json
│   │   ├── android-control-low-em.json
│   │   ├── androidworld-sr.json
│   │   ├── api-bank.json
│   │   ├── arc-agi-v2.json
│   │   ├── arc-agi.json
│   │   ├── arc-c.json
│   │   ├── arc-e.json
│   │   ├── arc.json
│   │   ├── arena-hard-v2.json
│   │   ├── arena-hard.json
│   │   ├── attaq.json
│   │   ├── autologi.json
│   │   ├── bbh.json
│   │   ├── bfcl-v2.json
│   │   ├── bfcl-v3-multiturn.json
│   │   ├── bfcl-v3.json
│   │   ├── bfcl.json
│   │   ├── big-bench-extra-hard.json
│   │   ├── big-bench-hard.json
│   │   ├── big-bench.json
│   │   ├── bigcodebench-full.json
│   │   ├── bigcodebench-hard.json
│   │   ├── bigcodebench.json
│   │   ├── bird-sql-(dev).json
│   │   ├── blink.json
│   │   ├── boolq.json
│   │   ├── browsecomp-long-128k.json
│   │   ├── browsecomp-long-256k.json
│   │   ├── browsecomp-zh.json
│   │   ├── browsecomp.json
│   │   ├── c-eval.json
│   │   ├── cbnsl.json
│   │   ├── cc-ocr.json
│   │   ├── cfeval.json
│   │   ├── charadessta.json
│   │   ├── chartqa.json
│   │   ├── charxiv-d.json
│   │   ├── charxiv-r.json
│   │   ├── chexpert-cxr.json
│   │   ├── cluewsc.json
│   │   ├── cmmlu.json
│   │   ├── cnmo-2024.json
│   │   ├── codeforces.json
│   │   ├── codegolf-v2.2.json
│   │   ├── collie.json
│   │   ├── common-voice-15.json
│   │   ├── commonsenseqa.json
│   │   ├── complexfuncbench.json
│   │   ├── covost2-en-zh.json
│   │   ├── covost2.json
│   │   ├── crag.json
│   │   ├── creative-writing-v3.json
│   │   ├── crperelation.json
│   │   ├── crux-o.json
│   │   ├── cruxeval-input-cot.json
│   │   ├── cruxeval-o.json
│   │   ├── cruxeval-output-cot.json
│   │   ├── csimpleqa.json
│   │   ├── cybersecurity-ctfs.json
│   │   ├── dermmcqa.json
│   │   ├── docvqa.json
│   │   ├── docvqatest.json
│   │   ├── drop.json
│   │   ├── ds-arena-code.json
│   │   ├── ds-fim-eval.json
│   │   ├── eclektic.json
│   │   ├── egoschema.json
│   │   ├── erqa.json
│   │   ├── evalplus.json
│   │   ├── facts-grounding.json
│   │   ├── factscore.json
│   │   ├── finqa.json
│   │   ├── flenqa.json
│   │   ├── fleurs.json
│   │   ├── frames.json
│   │   ├── french-mmlu.json
│   │   ├── frontiermath.json
│   │   ├── functionalmath.json
│   │   ├── giantsteps-tempo.json
│   │   ├── global-mmlu-lite.json
│   │   ├── global-mmlu.json
│   │   ├── gorilla-benchmark-api-bench.json
│   │   ├── govreport.json
│   │   ├── gpqa-biology.json
│   │   ├── gpqa-chemistry.json
│   │   ├── gpqa-physics.json
│   │   ├── gpqa.json
│   │   ├── graphwalks-bfs-%3C128k.json
│   │   ├── graphwalks-bfs-%3E128k.json
│   │   ├── graphwalks-parents-%3C128k.json
│   │   ├── graphwalks-parents-%3E128k.json
│   │   ├── groundui-1k.json
│   │   ├── gsm-8k-(cot).json
│   │   ├── gsm8k-chat.json
│   │   ├── gsm8k.json
│   │   ├── hallusion-bench.json
│   │   ├── healthbench-hard.json
│   │   ├── healthbench.json
│   │   ├── hellaswag.json
│   │   ├── hiddenmath.json
│   │   ├── hle.json
│   │   ├── hmmt-2025.json
│   │   ├── hmmt25.json
│   │   ├── humaneval+.json
│   │   ├── humaneval-average.json
│   │   ├── humaneval-er.json
│   │   ├── humaneval-mul.json
│   │   ├── humaneval-plus.json
│   │   ├── humaneval.json
│   │   ├── humanevalfim-average.json
│   │   ├── humanity's-last-exam.json
│   │   ├── if.json
│   │   ├── ifeval.json
│   │   ├── include.json
│   │   ├── infinitebench-en.mc.json
│   │   ├── infinitebench-en.qa.json
│   │   ├── infographicsqa.json
│   │   ├── infovqa.json
│   │   ├── infovqatest.json
│   │   ├── instruct-humaneval.json
│   │   ├── intergps.json
│   │   ├── internal-api-instruction-following-(hard).json
│   │   ├── lbpp-(v2).json
│   │   ├── livebench-20241125.json
│   │   ├── livebench.json
│   │   ├── livecodebench(01-09).json
│   │   ├── livecodebench-v5-24.12-25.2.json
│   │   ├── livecodebench-v5.json
│   │   ├── livecodebench-v6.json
│   │   ├── livecodebench.json
│   │   ├── longbench-v2.json
│   │   ├── longfact-concepts.json
│   │   ├── longfact-objects.json
│   │   ├── longvideobench.json
│   │   ├── lsat.json
│   │   ├── lvbench.json
│   │   ├── math-(cot).json
│   │   ├── math-500.json
│   │   ├── math.json
│   │   ├── mathvision.json
│   │   ├── mathvista-mini.json
│   │   ├── mathvista.json
│   │   ├── mbpp+.json
│   │   ├── mbpp-++-base-version.json
│   │   ├── mbpp-evalplus-(base).json
│   │   ├── mbpp-evalplus.json
│   │   ├── mbpp-pass@1.json
│   │   ├── mbpp-plus.json
│   │   ├── mbpp.json
│   │   ├── medxpertqa.json
│   │   ├── mega-mlqa.json
│   │   ├── mega-tydi-qa.json
│   │   ├── mega-udpos.json
│   │   ├── mega-xcopa.json
│   │   ├── mega-xstorycloze.json
│   │   ├── meld.json
│   │   ├── mgsm.json
│   │   ├── mimic-cxr.json
│   │   ├── mlvu-m.json
│   │   ├── mlvu.json
│   │   ├── mm-if-eval.json
│   │   ├── mm-mind2web.json
│   │   ├── mm-mt-bench.json
│   │   ├── mmau-music.json
│   │   ├── mmau-sound.json
│   │   ├── mmau-speech.json
│   │   ├── mmau.json
│   │   ├── mmbench-test.json
│   │   ├── mmbench-v1.1.json
│   │   ├── mmbench-video.json
│   │   ├── mmbench.json
│   │   ├── mme-realworld.json
│   │   ├── mme.json
│   │   ├── mmlu-(cot).json
│   │   ├── mmlu-base.json
│   │   ├── mmlu-chat.json
│   │   ├── mmlu-french.json
│   │   ├── mmlu-pro.json
│   │   ├── mmlu-prox.json
│   │   ├── mmlu-redux-2.0.json
│   │   ├── mmlu-redux.json
│   │   ├── mmlu-stem.json
│   │   ├── mmlu.json
│   │   ├── mmmlu.json
│   │   ├── mmmu-(val).json
│   │   ├── mmmu-(validation).json
│   │   ├── mmmu-pro.json
│   │   ├── mmmu.json
│   │   ├── mmmuval.json
│   │   ├── mmstar.json
│   │   ├── mmt-bench.json
│   │   ├── mmvet.json
│   │   ├── mmvetgpt4turbo.json
│   │   ├── mobileminiwob++-sr.json
│   │   ├── mrcr-1m-(pointwise).json
│   │   ├── mrcr-1m.json
│   │   ├── mrcr-v2-(8-needle).json
│   │   ├── mrcr-v2.json
│   │   ├── mrcr.json
│   │   ├── mt-bench.json
│   │   ├── mtvqa.json
│   │   ├── muirbench.json
│   │   ├── multi-if.json
│   │   ├── multi-swe-bench.json
│   │   ├── multichallenge-(o3-mini-grader).json
│   │   ├── multichallenge.json
│   │   ├── multilf.json
│   │   ├── multilingual-mgsm-(cot).json
│   │   ├── multilingual-mmlu.json
│   │   ├── multipl-e-humaneval.json
│   │   ├── multipl-e-mbpp.json
│   │   ├── multipl-e.json
│   │   ├── musiccaps.json
│   │   ├── musr.json
│   │   ├── mvbench.json
│   │   ├── natural-questions.json
│   │   ├── natural2code.json
│   │   ├── nexus.json
│   │   ├── nih-multi-needle.json
│   │   ├── nmos.json
│   │   ├── nq.json
│   │   ├── ocrbench-v2-(en).json
│   │   ├── ocrbench-v2-(zh).json
│   │   ├── ocrbench-v2.json
│   │   ├── ocrbench.json
│   │   ├── odinw.json
│   │   ├── ojbench.json
│   │   ├── olympiadbench.json
│   │   ├── omnibench-music.json
│   │   ├── omnibench.json
│   │   ├── omnimath.json
│   │   ├── open-rewrite.json
│   │   ├── openai-mmlu.json
│   │   ├── openai-mrcr%3A-2-needle-128k.json
│   │   ├── openai-mrcr%3A-2-needle-1m.json
│   │   ├── openai-mrcr%3A-2-needle-256k.json
│   │   ├── openbookqa.json
│   │   ├── osworld-extended.json
│   │   ├── osworld-screenshot-only.json
│   │   ├── osworld.json
│   │   ├── pathmcqa.json
│   │   ├── perceptiontest.json
│   │   ├── phibench.json
│   │   ├── physicsfinals.json
│   │   ├── piqa.json
│   │   ├── pointgrounding.json
│   │   ├── polymath-en.json
│   │   ├── polymath.json
│   │   ├── pope.json
│   │   ├── popqa.json
│   │   ├── qasper.json
│   │   ├── qmsum.json
│   │   ├── realworldqa.json
│   │   ├── repobench.json
│   │   ├── repoqa.json
│   │   ├── ruler.json
│   │   ├── sat-math.json
│   │   ├── scale-multichallenge.json
│   │   ├── scicode.json
│   │   ├── scienceqa-visual.json
│   │   ├── scienceqa.json
│   │   ├── screenspot-pro.json
│   │   ├── screenspot.json
│   │   ├── simpleqa.json
│   │   ├── slakevqa.json
│   │   ├── social-iqa.json
│   │   ├── spider.json
│   │   ├── squality.json
│   │   ├── stem.json
│   │   ├── summscreenfd.json
│   │   ├── superglue.json
│   │   ├── supergpqa.json
│   │   ├── swe-bench-multilingual.json
│   │   ├── swe-bench-verified-(agentic-coding).json
│   │   ├── swe-bench-verified-(agentless).json
│   │   ├── swe-bench-verified-(multiple-attempts).json
│   │   ├── swe-bench-verified.json
│   │   ├── swe-dev.json
│   │   ├── swe-lancer-(ic-diamond-subset).json
│   │   ├── swe-lancer.json
│   │   ├── tau-bench-airline.json
│   │   ├── tau-bench-retail.json
│   │   ├── tau-bench.json
│   │   ├── tau2-airline.json
│   │   ├── tau2-retail.json
│   │   ├── tau2-telecom.json
│   │   ├── tempcompass.json
│   │   ├── terminal-bench.json
│   │   ├── terminus.json
│   │   ├── textvqa.json
│   │   ├── theoremqa.json
│   │   ├── tldr9+-(test).json
│   │   ├── translation-en-to-set1-comet22.json
│   │   ├── translation-en-to-set1-spbleu.json
│   │   ├── translation-set1-to-en-comet22.json
│   │   ├── translation-set1-to-en-spbleu.json
│   │   ├── triviaqa.json
│   │   ├── truthfulqa.json
│   │   ├── tydiqa.json
│   │   ├── uniform-bar-exam.json
│   │   ├── usamo25.json
│   │   ├── vatex.json
│   │   ├── vcr-en-easy.json
│   │   ├── vibe-eval.json
│   │   ├── video-mme-(long,-no-subtitles).json
│   │   ├── video-mme.json
│   │   ├── video-mmew-sub.json
│   │   ├── videomme-w-o-sub..json
│   │   ├── videomme-w-sub..json
│   │   ├── videommmu.json
│   │   ├── visualwebbench.json
│   │   ├── vocalsound.json
│   │   ├── voicebench-avg.json
│   │   ├── vqa-rad.json
│   │   ├── vqav2-(test).json
│   │   ├── vqav2-(val).json
│   │   ├── vqav2.json
│   │   ├── wild-bench.json
│   │   ├── winogrande.json
│   │   ├── wmt23.json
│   │   ├── wmt24++.json
│   │   ├── writingbench.json
│   │   ├── xlsum-english.json
│   │   ├── xstest.json
│   │   └── zebralogic.json
│   ├── licenses/
│   │   ├── apache_2_0.json
│   │   ├── cc_by_nc.json
│   │   ├── creative_commons_attribution_4_0_license.json
│   │   ├── deepseek.json
│   │   ├── gemma.json
│   │   ├── health_ai_developer_foundations_terms_of_use.json
│   │   ├── jamba_open_model_license.json
│   │   ├── llama3_2.json
│   │   ├── llama_3_1_community_license.json
│   │   ├── llama_3_2_community_license.json
│   │   ├── llama_3_3_community_license_agreement.json
│   │   ├── llama_4_community_license_agreement.json
│   │   ├── mistral_research_license.json
│   │   ├── mistral_research_license_(mrl)_for_research;_mistral_commercial_license_for_commercial_use.json
│   │   ├── mit.json
│   │   ├── mit_+_model_license_(commercial_use_allowed).json
│   │   ├── mit_license.json
│   │   ├── mnpl_0_1.json
│   │   ├── modified_mit_license.json
│   │   ├── nvidia_open_model_license_agreement.json
│   │   ├── proprietary.json
│   │   ├── qwen.json
│   │   ├── tongyi_qianwen.json
│   │   └── unknown.json
│   ├── organizations/
│   │   ├── ai21/
│   │   │   ├── models/
│   │   │   │   ├── jamba-1.5-large/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── jamba-1.5-mini/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── amazon/
│   │   │   ├── models/
│   │   │   │   ├── nova-lite/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── nova-micro/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── nova-pro/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── anthropic/
│   │   │   ├── models/
│   │   │   │   ├── claude-3-5-haiku-20241022/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-3-5-sonnet-20240620/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-3-5-sonnet-20241022/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-3-7-sonnet-20250219/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-3-haiku-20240307/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-3-opus-20240229/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-3-sonnet-20240229/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-haiku-4-5-20251015/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-opus-4-1-20250805/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-opus-4-20250514/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── claude-sonnet-4-20250514/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── claude-sonnet-4-5-20250929/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── cohere/
│   │   │   ├── models/
│   │   │   │   └── command-r-plus-04-2024/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── deepseek/
│   │   │   ├── models/
│   │   │   │   ├── deepseek-r1/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-r1-0528/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-r1-distill-llama-70b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-r1-distill-llama-8b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-r1-distill-qwen-1.5b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-r1-distill-qwen-14b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-r1-distill-qwen-32b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-r1-distill-qwen-7b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-r1-zero/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-v2.5/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-v3/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-v3-0324/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-v3.1/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-v3.2-exp/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-vl2/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── deepseek-vl2-small/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── deepseek-vl2-tiny/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── google/
│   │   │   ├── models/
│   │   │   │   ├── gemini-1.0-pro/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-1.5-flash/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-1.5-flash-8b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-1.5-pro/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-2.0-flash/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-2.0-flash-lite/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-2.0-flash-thinking/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-2.5-flash/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-2.5-flash-lite/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-2.5-pro/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-2.5-pro-preview-06-05/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemini-diffusion/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-2-27b-it/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-2-9b-it/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3-12b-it/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3-1b-it/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3-27b-it/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3-4b-it/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3n-e2b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3n-e2b-it/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3n-e2b-it-litert-preview/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3n-e4b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3n-e4b-it/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gemma-3n-e4b-it-litert-preview/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── medgemma-4b-it/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── ibm/
│   │   │   ├── models/
│   │   │   │   ├── granite-3.3-8b-base/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── granite-3.3-8b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── granite-4.0-tiny-preview/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── meta/
│   │   │   ├── models/
│   │   │   │   ├── llama-3.1-405b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.1-70b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.1-8b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.2-11b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.2-3b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.2-90b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.3-70b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-4-maverick/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── llama-4-scout/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── microsoft/
│   │   │   ├── models/
│   │   │   │   ├── phi-3.5-mini-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── phi-3.5-moe-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── phi-3.5-vision-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── phi-4/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── phi-4-mini/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── phi-4-mini-reasoning/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── phi-4-multimodal-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── phi-4-reasoning/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── phi-4-reasoning-plus/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── mistral/
│   │   │   ├── models/
│   │   │   │   ├── codestral-22b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── devstral-medium-2507/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── devstral-small-2507/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── magistral-medium/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── magistral-small-2506/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── ministral-8b-instruct-2410/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── mistral-large-2-2407/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── mistral-nemo-instruct-2407/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── mistral-small-2409/
│   │   │   │   │   └── model.json
│   │   │   │   ├── mistral-small-24b-base-2501/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── mistral-small-24b-instruct-2501/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── mistral-small-3.1-24b-base-2503/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── mistral-small-3.1-24b-instruct-2503/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── mistral-small-3.2-24b-instruct-2506/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── pixtral-12b-2409/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── pixtral-large/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── moonshotai/
│   │   │   ├── models/
│   │   │   │   ├── kimi-k1.5/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── kimi-k2-0905/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── kimi-k2-base/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── kimi-k2-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── kimi-k2-instruct-0905/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── nvidia/
│   │   │   ├── models/
│   │   │   │   ├── llama-3.1-nemotron-70b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.1-nemotron-nano-8b-v1/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.1-nemotron-ultra-253b-v1/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── llama-3.3-nemotron-super-49b-v1/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── nemotron-nano-9b-v2/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── openai/
│   │   │   ├── models/
│   │   │   │   ├── gpt-3.5-turbo-0125/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4-0613/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4-turbo-2024-04-09/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4.1-2025-04-14/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4.1-mini-2025-04-14/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4.1-nano-2025-04-14/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4.5/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4o-2024-05-13/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4o-2024-08-06/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-4o-mini-2024-07-18/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-5-2025-08-07/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-5-codex-2025-09-15/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-5-mini-2025-08-07/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-5-nano-2025-08-07/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-oss-120b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── gpt-oss-20b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── o1-2024-12-17/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── o1-mini/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── o1-preview/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── o1-pro/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── o3-2025-04-16/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── o3-mini/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── o3-pro-2025-06-10/
│   │   │   │   │   └── model.json
│   │   │   │   └── o4-mini/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── qwen/
│   │   │   ├── models/
│   │   │   │   ├── qvq-72b-preview/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen-2.5-14b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen-2.5-32b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen-2.5-72b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen-2.5-7b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen-2.5-coder-32b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen-2.5-coder-7b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen2-72b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen2-7b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen2-vl-72b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen2.5-omni-7b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen2.5-vl-32b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen2.5-vl-72b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen2.5-vl-7b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen3-235b-a22b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen3-235b-a22b-instruct-2507/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen3-235b-a22b-thinking-2507/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen3-30b-a3b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen3-32b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen3-next-80b-a3b-base/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen3-next-80b-a3b-instruct/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwen3-next-80b-a3b-thinking/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── qwq-32b/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── qwq-32b-preview/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   ├── unknown/
│   │   │   └── organization.json
│   │   ├── xai/
│   │   │   ├── models/
│   │   │   │   ├── grok-1.5/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── grok-1.5v/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── grok-2/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── grok-2-mini/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── grok-3/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── grok-3-mini/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── grok-4/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── grok-4-fast/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   ├── grok-4-heavy/
│   │   │   │   │   ├── benchmarks.json
│   │   │   │   │   └── model.json
│   │   │   │   └── grok-code-fast-1/
│   │   │   │       ├── benchmarks.json
│   │   │   │       └── model.json
│   │   │   └── organization.json
│   │   └── zai-org/
│   │       ├── models/
│   │       │   ├── glm-4.5/
│   │       │   │   ├── benchmarks.json
│   │       │   │   └── model.json
│   │       │   ├── glm-4.5-air/
│   │       │   │   ├── benchmarks.json
│   │       │   │   └── model.json
│   │       │   ├── glm-4.5v/
│   │       │   │   ├── benchmarks.json
│   │       │   │   └── model.json
│   │       │   └── glm-4.6/
│   │       │       ├── benchmarks.json
│   │       │       └── model.json
│   │       └── organization.json
│   └── providers/
│       ├── anthropic/
│       │   ├── models.json
│       │   └── provider.json
│       ├── azure/
│       │   ├── models.json
│       │   └── provider.json
│       ├── bedrock/
│       │   ├── models.json
│       │   └── provider.json
│       ├── cerebras/
│       │   ├── models.json
│       │   └── provider.json
│       ├── cohere/
│       │   ├── models.json
│       │   └── provider.json
│       ├── deepinfra/
│       │   ├── models.json
│       │   └── provider.json
│       ├── deepseek/
│       │   ├── models.json
│       │   └── provider.json
│       ├── fireworks/
│       │   ├── models.json
│       │   └── provider.json
│       ├── google/
│       │   ├── models.json
│       │   └── provider.json
│       ├── groq/
│       │   ├── models.json
│       │   └── provider.json
│       ├── hyperbolic/
│       │   ├── models.json
│       │   └── provider.json
│       ├── lambda/
│       │   ├── models.json
│       │   └── provider.json
│       ├── mistral/
│       │   ├── models.json
│       │   └── provider.json
│       ├── novita/
│       │   ├── models.json
│       │   └── provider.json
│       ├── openai/
│       │   ├── models.json
│       │   └── provider.json
│       ├── replicate/
│       │   ├── models.json
│       │   └── provider.json
│       ├── sambanova/
│       │   ├── models.json
│       │   └── provider.json
│       ├── together/
│       │   ├── models.json
│       │   └── provider.json
│       ├── xai/
│       │   ├── models.json
│       │   └── provider.json
│       └── zeroeval/
│           ├── models.json
│           └── provider.json
├── package.json
└── schemas/
    ├── README.md
    ├── benchmark-results.schema.json
    ├── benchmark.schema.json
    ├── integrity-validator.js
    ├── license.schema.json
    ├── model.schema.json
    ├── organization.schema.json
    ├── provider-models.schema.json
    ├── provider.schema.json
    └── validator.js

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/pull_request_template.md
================================================
## Description

<!-- Briefly describe your changes and add links to the relevant resources -->

References:

<!-- Add links to the relevant resources -->

## Type of Change

<!-- Mark the appropriate option with an [x] -->

- [ ] Model Update/Addition
- [ ] Qualitative Metrics (Benchmark Results) Update/Addition
- [ ] Provider Update/Addition
- [ ] Other (please specify)

## Checklist

- [ ] I've read the [CONTRIBUTING.md](../CONTRIBUTING.md) guidelines
- [ ] My changes are accurate and properly referenced


================================================
FILE: .github/workflows/schema-validation.yml
================================================
name: Schema Validation

on:
  pull_request:
    branches: [main]

jobs:
  validate:
    name: Validate Schema
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Setup Node.js
        uses: actions/setup-node@v3
        with:
          node-version: "16"
          cache: "npm"

      - name: Install dependencies
        run: npm ci

      - name: Run schema validation
        run: node schemas/validator.js


================================================
FILE: .gitignore
================================================
/node_modules


================================================
FILE: .vscode/settings.json
================================================
{
  "json.schemas": [
    {
      "fileMatch": ["/models/*/model.json"],
      "url": "../schemas/models-schema.json"
    },
    {
      "fileMatch": ["/models/*/qualitativemetrics.json"],
      "url": "../schemas/qualitativemetrics-schema.json"
    },
    {
      "fileMatch": ["/providers/*/provider.json"],
      "url": "../schemas/providers-schema.json"
    },
    {
      "fileMatch": ["/providers/*/providermodels.json"],
      "url": "../schemas/providermodels-schema.json"
    }
  ]
}

================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to LLM Stats

Thank you for your interest in contributing. This guide outlines the process for updating and adding information to the LLM Stats database.

## Table of Contents

- [Overview](#overview)
- [Data Structure](#data-structure)
- [General Guidelines](#general-guidelines)
- [Organizations](#organizations)
- [Models](#models)
- [Benchmark Results](#benchmark-results)
- [Benchmarks](#benchmarks)
- [Providers](#providers)
- [Licenses](#licenses)
- [Validation](#validation)
- [Submitting Your Contribution](#submitting-your-contribution)

## Overview

All data is organized in the `data/data/` directory with a hierarchical structure. Each entity type has its own JSON schema definition in `schemas/` that validates the data structure.

## Data Structure

```
data/
├── data/
│   ├── organizations/
│   │   └── [organization_id]/
│   │       ├── organization.json
│   │       └── models/
│   │           └── [model_id]/
│   │               ├── model.json
│   │               └── benchmarks.json
│   ├── providers/
│   │   └── [provider_id]/
│   │       ├── provider.json
│   │       └── models.json
│   ├── licenses/
│   │   └── [license_id].json
│   └── benchmarks/
│       └── [benchmark_id].json
└── schemas/
    ├── organization.schema.json
    ├── model.schema.json
    ├── benchmark-results.schema.json
    ├── benchmark.schema.json
    ├── provider.schema.json
    ├── provider-models.schema.json
    └── license.schema.json
```

## General Guidelines

1. **Accuracy First**: Ensure all data is accurate and sourced from authoritative references
2. **Follow Structure**: Adhere to the existing file structure and naming conventions
3. **Consistent Formatting**: Use consistent JSON formatting with 2-space indentation
4. **One Change per PR**: Submit one pull request per logical change (e.g., one model, one provider)
5. **Schema Validation**: All data files must validate against their respective JSON schemas
6. **Required Fields**: Pay attention to required vs optional fields in schemas
7. **Timestamps**: Use ISO 8601 format for dates (YYYY-MM-DD or full timestamp)

## Organizations

Organizations represent the entities that create and release models (e.g., OpenAI, Anthropic, Meta).

### Location

`data/data/organizations/[organization_id]/organization.json`

### Adding a New Organization

1. Create a new directory: `data/data/organizations/[organization_id]/`
2. Create `organization.json` with the following structure:

```json
{
  "organization_id": "organization-name",
  "name": "Organization Display Name",
  "website": "https://organization.com",
  "description": "Brief description of the organization",
  "country": "US",
  "created_at": "2025-10-02T00:00:00.000000+00:00",
  "updated_at": "2025-10-02T00:00:00.000000+00:00"
}
```

3. Validate against `schemas/organization.schema.json`
4. Create a `models/` subdirectory for future models

### Updating an Existing Organization

1. Navigate to `data/data/organizations/[organization_id]/organization.json`
2. Update the relevant fields
3. Update the `updated_at` timestamp
4. Validate against the schema

## Models

Models are stored within their respective organization directories.

### Location

`data/data/organizations/[organization_id]/models/[model_id]/`

### Adding a New Model

1. Ensure the organization exists in `data/data/organizations/`
2. Ensure the license exists in `data/data/licenses/`
3. Create a new directory: `data/data/organizations/[organization_id]/models/[model_id]/`
4. Create two files in this directory:

#### `model.json`

```json
{
  "model_id": "model-name-version",
  "name": "Model Display Name",
  "organization_id": "organization-name",
  "fine_tuned_from_model_id": null,
  "description": "Detailed description of the model's capabilities",
  "release_date": "2024-10-22",
  "announcement_date": "2024-10-22",
  "license_id": "proprietary",
  "multimodal": false,
  "knowledge_cutoff": "2024-04-01",
  "param_count": 7000000000,
  "training_tokens": 15000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://...",
  "source_playground": "https://...",
  "source_paper": "https://...",
  "source_scorecard_blog_link": "https://...",
  "source_repo_link": "https://github.com/...",
  "source_weights_link": "https://huggingface.co/...",
  "created_at": "2025-10-02T00:00:00.000000+00:00",
  "updated_at": "2025-10-02T00:00:00.000000+00:00",
  "model_family_id": null
}
```

**Required Fields**: `model_id`, `name`, `organization_id`, `release_date`, `license_id`, `multimodal`

**Optional Fields**: Set to `null` if not applicable

#### `benchmarks.json`

Start with an empty array if no benchmark results are available yet:

```json
[]
```

5. Validate both files against their respective schemas

### Updating an Existing Model

1. Navigate to `data/data/organizations/[organization_id]/models/[model_id]/model.json`
2. Update the relevant fields
3. Update the `updated_at` timestamp
4. Validate against `schemas/model.schema.json`

## Benchmark Results

Benchmark results are stored in the `benchmarks.json` file within each model directory.

### Location

`data/data/organizations/[organization_id]/models/[model_id]/benchmarks.json`

### Adding Benchmark Results

1. Ensure the benchmark exists in `data/data/benchmarks/`
2. Ensure the model exists
3. Add a new entry to the `benchmarks.json` array:

```json
[
  {
    "benchmark_id": "mmlu",
    "score": 85.5,
    "score_unit": "percentage",
    "source_link": "https://example.com/results",
    "created_at": "2025-10-02T00:00:00.000000+00:00",
    "updated_at": "2025-10-02T00:00:00.000000+00:00"
  }
]
```

4. Validate against `schemas/benchmark-results.schema.json`

### Updating Benchmark Results

1. Locate the specific result in the array
2. Update the `score` and/or `source_link`
3. Update the `updated_at` timestamp
4. Ensure the `source_link` is reliable and authoritative

## Benchmarks

Benchmarks define the evaluation tests used to measure model performance.

### Location

`data/data/benchmarks/[benchmark_id].json`

### Adding a New Benchmark

1. Create a new file: `data/data/benchmarks/[benchmark_id].json`
2. Follow this structure:

```json
{
  "benchmark_id": "benchmark-name",
  "name": "Benchmark Display Name",
  "description": "Description of what this benchmark measures",
  "category": "reasoning",
  "source_link": "https://...",
  "created_at": "2025-10-02T00:00:00.000000+00:00",
  "updated_at": "2025-10-02T00:00:00.000000+00:00"
}
```

3. Validate against `schemas/benchmark.schema.json`

## Providers

Providers are services that offer access to models (e.g., OpenAI API, AWS Bedrock, Google Vertex AI).

### Location

`data/data/providers/[provider_id]/`

### Adding a New Provider

1. Create a new directory: `data/data/providers/[provider_id]/`
2. Create two files:

#### `provider.json`

```json
{
  "provider_id": "provider-name",
  "name": "Provider Display Name",
  "website": "https://provider.com",
  "created_at": "2025-10-02T00:00:00.000000+00:00",
  "updated_at": "2025-10-02T00:00:00.000000+00:00"
}
```

#### `models.json`

Start with an empty array:

```json
[]
```

3. Validate both files against their respective schemas

### Updating Provider Information

1. Navigate to `data/data/providers/[provider_id]/provider.json`
2. Update the relevant fields
3. Update the `updated_at` timestamp

### Adding Provider Models

Provider models specify pricing and availability of models through specific providers.

1. Open `data/data/providers/[provider_id]/models.json`
2. Add a new entry to the array:

```json
[
  {
    "provider_model_id": "provider-specific-id",
    "model_id": "actual-model-id",
    "provider_id": "provider-name",
    "input_price_per_million": 3.0,
    "output_price_per_million": 15.0,
    "context_window": 200000,
    "max_output_tokens": 4096,
    "available": true,
    "created_at": "2025-10-02T00:00:00.000000+00:00",
    "updated_at": "2025-10-02T00:00:00.000000+00:00"
  }
]
```

3. Ensure the model exists in `data/data/organizations/[org]/models/[model_id]/`
4. Validate against `schemas/provider-models.schema.json`

## Licenses

Licenses define the terms under which models can be used.

### Location

`data/data/licenses/[license_id].json`

### Adding a New License

1. Create a new file: `data/data/licenses/[license_id].json`
2. Follow this structure:

```json
{
  "license_id": "license-name",
  "name": "License Display Name",
  "url": "https://...",
  "commercial_use": true,
  "created_at": "2025-10-02T00:00:00.000000+00:00",
  "updated_at": "2025-10-02T00:00:00.000000+00:00"
}
```

3. Validate against `schemas/license.schema.json`

## Validation

Before submitting your contribution:

### Manual Validation

Run the validator script from the `data/` directory:

```bash
cd data
node schemas/validator.js
```

This will check all JSON files against their respective schemas.

### What the Validator Checks

- JSON syntax correctness
- Required fields are present
- Field types match schema definitions
- ID references exist (e.g., organization_id, license_id)
- Date formats are valid
- URLs are properly formatted

### Common Validation Errors

1. **Missing Required Fields**: Ensure all required fields are present
2. **Invalid Date Format**: Use ISO 8601 format (YYYY-MM-DD or full timestamp)
3. **Invalid References**: Ensure referenced IDs exist (organization_id, license_id, etc.)
4. **Type Mismatch**: Ensure numbers are numbers, strings are strings, etc.
5. **Trailing Commas**: Remove trailing commas in JSON arrays/objects

## Submitting Your Contribution

1. **Fork the Repository**: Create your own fork of the project
2. **Create a Branch**: Use a descriptive branch name (e.g., `add-gpt5-model`, `update-claude-pricing`)
3. **Make Changes**: Follow the guidelines above
4. **Validate Locally**: Run `node schemas/validator.js` to ensure your changes are valid
5. **Commit Changes**: Write clear, descriptive commit messages
6. **Submit a Pull Request**:
   - Provide a clear title and description
   - List what was added or changed
   - Include links to authoritative sources
   - Reference any related issues

### Pull Request Template

```markdown
## Description

Brief description of what this PR adds or changes

## Changes

- Added/Updated model: [Model Name]
- Added/Updated organization: [Organization Name]
- Added benchmark results for: [Benchmark Name]

## Sources

- [Source 1]: https://...
- [Source 2]: https://...

## Validation

- [ ] Ran `node schemas/validator.js` successfully
- [ ] All files follow the correct structure
- [ ] All references (organization_id, license_id) are valid
```

### Example Pull Request

For reference, see this [example pull request](https://github.com/JonathanChavezTamales/llm-leaderboard/pull/1).

## Questions?

If you have questions or need clarification, please:

1. Check the schema files in `schemas/` for detailed field definitions
2. Look at existing data files as examples
3. Open an issue on GitHub

Thank you for contributing to LLM Stats!


================================================
FILE: LICENSE.md
================================================
Creative Commons Attribution 4.0 International License

Copyright (c) 2024 jc

This work is licensed under the Creative Commons Attribution 4.0 International License.
To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/
or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.

You are free to:

- Share — copy and redistribute the material in any medium or format
- Adapt — remix, transform, and build upon the material for any purpose, even commercially

Under the following terms:

- Attribution — You must give appropriate credit, provide a link to the license, and indicate
  if changes were made. You may do so in any reasonable manner, but not in any way that
  suggests the licensor endorses you or your use.

No additional restrictions — You may not apply legal terms or technological measures that
legally restrict others from doing anything the license permits.

Notices:

- You do not have to comply with the license for elements of the material in the public domain
  or where your use is permitted by an applicable exception or limitation.
- No warranties are given. The license may not give you all of the permissions necessary for
  your intended use. For example, other rights such as publicity, privacy, or moral rights
  may limit how you use the material.


================================================
FILE: README.md
================================================
# DEPRECATED - Updates and contributions

This repository is now deprecated and won't be getting any new updates. For contributions and corrections of the data seen in [LLM Stats](https://llm-stats.com/), please create a post with the tag "Issue" in the [official community section](https://llm-stats.com/posts) of the website.

For model- and/or benchmark-specific corrections, please create an Issue under the "Discussion" tab of the model/benchmark, as seen in the example below.

<img width="1156" height="575" alt="Screenshot 2025-10-24 at 1 43 52 p m" src="https://github.com/user-attachments/assets/b78f2cf3-f3ff-4a51-bba4-d8643865d16b" />

---

<img width="1208" alt="image" src="https://github.com/user-attachments/assets/835f1e1b-73e6-405a-b7ad-096d5f5f567a" />

# LLM-Stats.com

[![GitHub stars](https://img.shields.io/github/stars/JonathanChavezTamales/llm-leaderboard?style=social)](https://github.com/JonathanChavezTamales/llm-leaderboard/stargazers)
[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](CONTRIBUTING.md)
[![Discord](https://img.shields.io/badge/Discord-Join%20Us-7289da?logo=discord&logoColor=white)](https://discord.com/invite/RxGUBvE42d)
[![Issues](https://img.shields.io/github/issues/JonathanChavezTamales/llm-leaderboard)](https://github.com/JonathanChavezTamales/llm-leaderboard/issues)

A community-driven repository of LLM data and benchmarks. Compare and explore language models through our interactive dashboard at [llm-stats.com](https://llm-stats.com).

## Found an issue or have a feature request?

[Open an issue here](https://github.com/JonathanChavezTamales/llm-leaderboard/issues). Thank you!

# Data

## 🔍 What's Inside

Our repository contains detailed information on hundreds of LLMs:

- Model parameters, context window sizes, licensing details, capabilities, and more
- Provider pricing and configurations
- Performance metrics (throughput, latency)
- Standardized benchmark results
- Organization and license information

## 📁 Data Structure

All data is organized in the `data/` directory:

- `data/models/` - Model metadata and configurations
- `data/providers/` - Provider information
- `data/provider_models/` - Provider-specific model pricing and features
- `data/benchmarks/` - Benchmark definitions
- `data/model_benchmarks/` - Model benchmark scores
- `data/organizations/` - Organization information
- `data/licenses/` - License definitions

## 🤝 How to Contribute

We welcome community contributions to keep our data accurate and up-to-date:

1. **Update Model Data**

   - Browse the [`data/`](data/) directory structure
   - Submit a PR following our [contribution guidelines](CONTRIBUTING.md)
   - Check [`schemas/`](schemas/) for JSON Schema validation

## 📈 Data Quality

Accuracy is our priority. To ensure reliable information:

- All benchmark data requires verifiable source links
- Community review process for all changes
- Multiple source citations encouraged
- Regular validation of submitted data

There's no guarantee that the data is 100% accurate, but we do our best to ensure it's as accurate as possible.

## 🌟 Community

- Join our [Discord](https://discord.gg/RxGUBvE42d) for discussions

## Leaderboard

| Name                                     | Release Date | Input Context | Output Context | GPQA  | MMLU  | MMLU-Pro | MATH  | HumanEval | MMMU  | LiveCodeBench |
| ---------------------------------------- | ------------ | ------------- | -------------- | ----- | ----- | -------- | ----- | --------- | ----- | ------------- |
| GPT-5                                    | 2025-08-07   | N/A           | N/A            | 0.857 | 0.925 | N/A      | 0.847 | 0.934     | 0.842 | N/A           |
| o1                                       | 2024-12-17   | N/A           | N/A            | 0.780 | 0.918 | N/A      | 0.964 | 0.881     | 0.776 | N/A           |
| GPT-4.5                                  | 2025-02-27   | N/A           | N/A            | 0.695 | 0.908 | N/A      | N/A   | 0.880     | 0.752 | N/A           |
| o1-preview                               | 2024-09-12   | N/A           | N/A            | 0.733 | 0.908 | N/A      | 0.855 | N/A       | N/A   | N/A           |
| Claude 3.5 Sonnet                        | 2024-10-22   | N/A           | N/A            | 0.672 | 0.904 | 0.776    | 0.783 | 0.937     | 0.683 | N/A           |
| Claude 3.5 Sonnet                        | 2024-06-21   | N/A           | N/A            | 0.594 | 0.904 | 0.761    | 0.711 | 0.920     | N/A   | N/A           |
| Kimi K2 0905                             | 2025-09-05   | N/A           | N/A            | 0.758 | 0.902 | 0.825    | 0.891 | 0.945     | N/A   | N/A           |
| GPT-4.1                                  | 2025-04-14   | N/A           | N/A            | 0.663 | 0.902 | N/A      | N/A   | N/A       | 0.748 | N/A           |
| Kimi K2 Instruct                         | 2025-07-11   | N/A           | N/A            | 0.751 | 0.895 | 0.811    | N/A   | 0.933     | N/A   | N/A           |
| GPT-4o                                   | 2024-05-13   | N/A           | N/A            | 0.536 | 0.887 | 0.726    | 0.766 | 0.902     | N/A   | N/A           |
| DeepSeek-V3                              | 2024-12-25   | N/A           | N/A            | 0.591 | 0.885 | 0.759    | N/A   | N/A       | N/A   | 0.376         |
| Qwen3 235B A22B                          | 2025-04-29   | N/A           | N/A            | 0.475 | 0.878 | 0.682    | 0.718 | N/A       | N/A   | 0.707         |
| Kimi K2 Base                             | 2025-07-11   | N/A           | N/A            | 0.481 | 0.878 | 0.692    | 0.702 | N/A       | N/A   | N/A           |
| Grok-2                                   | 2024-08-13   | N/A           | N/A            | 0.560 | 0.875 | 0.755    | 0.761 | 0.884     | 0.661 | N/A           |
| GPT-4.1 mini                             | 2025-04-14   | N/A           | N/A            | 0.650 | 0.875 | N/A      | N/A   | N/A       | 0.727 | N/A           |
| Kimi-k1.5                                | 2025-01-20   | N/A           | N/A            | N/A   | 0.874 | N/A      | N/A   | N/A       | 0.700 | N/A           |
| Llama 3.1 405B Instruct                  | 2024-07-23   | N/A           | N/A            | 0.507 | 0.873 | 0.733    | 0.738 | 0.890     | N/A   | N/A           |
| o3-mini                                  | 2025-01-30   | N/A           | N/A            | 0.772 | 0.869 | N/A      | 0.979 | N/A       | N/A   | N/A           |
| Claude 3 Opus                            | 2024-02-29   | N/A           | N/A            | 0.504 | 0.868 | 0.685    | 0.601 | 0.849     | N/A   | N/A           |
| GPT-4 Turbo                              | 2024-04-09   | N/A           | N/A            | 0.480 | 0.865 | N/A      | 0.726 | 0.871     | N/A   | N/A           |
| GPT-4                                    | 2023-06-13   | N/A           | N/A            | 0.357 | 0.864 | N/A      | 0.420 | 0.670     | N/A   | N/A           |
| Grok-2 mini                              | 2024-08-13   | N/A           | N/A            | 0.510 | 0.862 | 0.720    | 0.730 | 0.857     | 0.632 | N/A           |
| Llama 3.2 90B Instruct                   | 2024-09-25   | N/A           | N/A            | 0.467 | 0.860 | N/A      | 0.680 | N/A       | 0.603 | N/A           |
| Llama 3.3 70B Instruct                   | 2024-12-06   | N/A           | N/A            | 0.505 | 0.860 | 0.689    | 0.770 | 0.884     | N/A   | N/A           |
| Nova Pro                                 | 2024-11-20   | N/A           | N/A            | 0.469 | 0.859 | N/A      | 0.766 | 0.890     | 0.617 | N/A           |
| Gemini 1.5 Pro                           | 2024-05-01   | N/A           | N/A            | 0.591 | 0.859 | 0.758    | 0.865 | 0.841     | 0.659 | N/A           |
| GPT-4o                                   | 2024-08-06   | N/A           | N/A            | 0.701 | 0.857 | 0.747    | N/A   | N/A       | 0.722 | N/A           |
| Llama 4 Maverick                         | 2025-04-05   | N/A           | N/A            | 0.698 | 0.855 | 0.805    | 0.612 | N/A       | 0.734 | 0.434         |
| o1-mini                                  | 2024-09-12   | N/A           | N/A            | 0.600 | 0.852 | N/A      | N/A   | 0.924     | N/A   | N/A           |
| Phi 4                                    | 2024-12-12   | N/A           | N/A            | 0.561 | 0.848 | 0.704    | 0.804 | 0.826     | N/A   | N/A           |
| Mistral Large 2                          | 2024-07-24   | N/A           | N/A            | N/A   | 0.840 | N/A      | N/A   | 0.920     | N/A   | N/A           |
| Llama 3.1 70B Instruct                   | 2024-07-23   | N/A           | N/A            | 0.417 | 0.836 | 0.664    | N/A   | 0.805     | N/A   | N/A           |
| Qwen2.5 32B Instruct                     | 2024-09-19   | N/A           | N/A            | 0.495 | 0.833 | 0.690    | 0.831 | 0.884     | N/A   | N/A           |
| Qwen2 72B Instruct                       | 2024-07-23   | N/A           | N/A            | 0.424 | 0.823 | 0.644    | 0.597 | 0.860     | N/A   | N/A           |
| GPT-4o mini                              | 2024-07-18   | N/A           | N/A            | 0.402 | 0.820 | N/A      | 0.702 | 0.872     | 0.594 | N/A           |
| Grok-1.5                                 | 2024-03-28   | N/A           | N/A            | 0.359 | 0.813 | 0.510    | 0.506 | 0.741     | 0.536 | N/A           |
| Jamba 1.5 Large                          | 2024-08-22   | N/A           | N/A            | 0.369 | 0.812 | 0.535    | N/A   | N/A       | N/A   | N/A           |
| Mistral Small 3.1 24B Base               | 2025-03-17   | N/A           | N/A            | 0.375 | 0.810 | 0.560    | N/A   | N/A       | 0.593 | N/A           |
| Mistral Small 3 24B Base                 | 2025-01-30   | N/A           | N/A            | 0.344 | 0.807 | 0.544    | 0.460 | N/A       | N/A   | N/A           |
| Mistral Small 3.1 24B Instruct           | 2025-03-17   | N/A           | N/A            | 0.460 | 0.806 | 0.668    | 0.693 | 0.884     | 0.593 | N/A           |
| Nova Lite                                | 2024-11-20   | N/A           | N/A            | 0.420 | 0.805 | N/A      | 0.733 | 0.854     | 0.562 | N/A           |
| Mistral Small 3.2 24B Instruct           | 2025-06-20   | N/A           | N/A            | 0.461 | 0.805 | 0.691    | 0.694 | N/A       | 0.625 | N/A           |
| DeepSeek-V2.5                            | 2024-05-08   | N/A           | N/A            | N/A   | 0.804 | N/A      | 0.747 | 0.890     | N/A   | N/A           |
| Llama 3.1 Nemotron 70B Instruct          | 2024-10-01   | N/A           | N/A            | N/A   | 0.802 | N/A      | N/A   | N/A       | N/A   | N/A           |
| GPT-4.1 nano                             | 2025-04-14   | N/A           | N/A            | 0.503 | 0.801 | N/A      | N/A   | N/A       | 0.554 | N/A           |
| Qwen2.5 14B Instruct                     | 2024-09-19   | N/A           | N/A            | 0.455 | 0.797 | 0.637    | 0.800 | 0.835     | N/A   | N/A           |
| Llama 4 Scout                            | 2025-04-05   | N/A           | N/A            | 0.572 | 0.796 | 0.743    | 0.503 | N/A       | 0.694 | 0.328         |
| Claude 3 Sonnet                          | 2024-02-29   | N/A           | N/A            | 0.404 | 0.790 | 0.568    | 0.431 | 0.730     | N/A   | N/A           |
| Gemini 1.5 Flash                         | 2024-05-01   | N/A           | N/A            | 0.510 | 0.789 | 0.673    | 0.779 | 0.743     | 0.623 | N/A           |
| Phi-3.5-MoE-instruct                     | 2024-08-23   | N/A           | N/A            | 0.368 | 0.789 | 0.453    | 0.595 | 0.707     | N/A   | N/A           |
| Qwen2.5 VL 32B Instruct                  | 2025-02-28   | N/A           | N/A            | 0.460 | 0.784 | 0.688    | 0.822 | 0.915     | 0.700 | N/A           |
| Nova Micro                               | 2024-11-20   | N/A           | N/A            | 0.400 | 0.776 | N/A      | 0.693 | 0.811     | N/A   | N/A           |
| Command R+                               | 2024-08-30   | N/A           | N/A            | N/A   | 0.757 | N/A      | N/A   | N/A       | N/A   | N/A           |
| Gemma 2 27B                              | 2024-06-27   | N/A           | N/A            | N/A   | 0.752 | N/A      | 0.423 | 0.518     | N/A   | N/A           |
| Claude 3 Haiku                           | 2024-03-13   | N/A           | N/A            | 0.333 | 0.752 | N/A      | 0.389 | 0.759     | N/A   | N/A           |
| Qwen2.5-Coder 32B Instruct               | 2024-09-19   | N/A           | N/A            | N/A   | 0.751 | 0.504    | 0.572 | 0.927     | N/A   | 0.314         |
| Llama 3.2 11B Instruct                   | 2024-09-25   | N/A           | N/A            | 0.328 | 0.730 | N/A      | 0.519 | N/A       | 0.507 | N/A           |
| Gemini 1.0 Pro                           | 2024-02-15   | N/A           | N/A            | 0.279 | 0.718 | N/A      | 0.326 | N/A       | 0.479 | N/A           |
| Gemma 2 9B                               | 2024-06-27   | N/A           | N/A            | N/A   | 0.713 | N/A      | 0.366 | 0.402     | N/A   | N/A           |
| Qwen2 7B Instruct                        | 2024-07-23   | N/A           | N/A            | 0.253 | 0.705 | 0.441    | 0.496 | 0.799     | N/A   | 0.266         |
| GPT-3.5 Turbo                            | 2023-03-21   | N/A           | N/A            | 0.308 | 0.698 | N/A      | 0.431 | 0.680     | 0.000 | N/A           |
| Jamba 1.5 Mini                           | 2024-08-22   | N/A           | N/A            | 0.323 | 0.697 | 0.425    | N/A   | N/A       | N/A   | N/A           |
| Llama 3.1 8B Instruct                    | 2024-07-23   | N/A           | N/A            | 0.304 | 0.694 | 0.483    | N/A   | 0.726     | N/A   | N/A           |
| Pixtral-12B                              | 2024-09-17   | N/A           | N/A            | N/A   | 0.692 | N/A      | 0.481 | 0.720     | 0.525 | N/A           |
| Phi-3.5-mini-instruct                    | 2024-08-23   | N/A           | N/A            | 0.304 | 0.690 | 0.474    | 0.485 | 0.628     | N/A   | N/A           |
| Mistral NeMo Instruct                    | 2024-07-18   | N/A           | N/A            | N/A   | 0.680 | N/A      | N/A   | N/A       | N/A   | N/A           |
| Qwen2.5-Coder 7B Instruct                | 2024-09-19   | N/A           | N/A            | N/A   | 0.676 | 0.401    | 0.466 | 0.884     | N/A   | 0.182         |
| Phi 4 Mini                               | 2025-02-01   | N/A           | N/A            | 0.252 | 0.673 | 0.528    | 0.640 | N/A       | N/A   | N/A           |
| Granite 3.3 8B Instruct                  | 2025-04-16   | N/A           | N/A            | N/A   | 0.655 | N/A      | N/A   | 0.897     | N/A   | N/A           |
| Ministral 8B Instruct                    | 2024-10-16   | N/A           | N/A            | N/A   | 0.650 | N/A      | 0.545 | 0.348     | N/A   | N/A           |
| Gemma 3n E4B Instructed LiteRT Preview   | 2025-05-20   | N/A           | N/A            | 0.237 | 0.649 | 0.506    | N/A   | 0.750     | N/A   | 0.132         |
| Gemma 3n E4B Instructed                  | 2025-06-26   | N/A           | N/A            | 0.237 | 0.649 | 0.506    | N/A   | 0.750     | N/A   | 0.132         |
| Granite 3.3 8B Base                      | 2025-04-16   | N/A           | N/A            | N/A   | 0.639 | N/A      | N/A   | 0.897     | N/A   | N/A           |
| Llama 3.2 3B Instruct                    | 2024-09-25   | N/A           | N/A            | 0.328 | 0.634 | N/A      | 0.480 | N/A       | N/A   | N/A           |
| IBM Granite 4.0 Tiny Preview             | 2025-05-02   | N/A           | N/A            | N/A   | 0.604 | N/A      | N/A   | 0.824     | N/A   | N/A           |
| Gemma 3n E2B Instructed LiteRT (Preview) | 2025-05-20   | N/A           | N/A            | 0.248 | 0.601 | 0.405    | N/A   | 0.665     | N/A   | 0.132         |
| Gemma 3n E2B Instructed                  | 2025-06-26   | N/A           | N/A            | 0.248 | 0.601 | 0.405    | N/A   | 0.665     | N/A   | 0.132         |
| Kimi K2-Instruct-0905                    | 2025-09-05   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| Gemma 3n E4B                             | 2025-06-26   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| Gemma 3 12B                              | 2025-03-12   | N/A           | N/A            | 0.409 | N/A   | 0.606    | 0.838 | 0.854     | N/A   | 0.246         |
| Gemini 2.5 Pro                           | 2025-05-20   | N/A           | N/A            | 0.830 | N/A   | N/A      | N/A   | N/A       | 0.796 | N/A           |
| Gemini 2.0 Flash-Lite                    | 2025-02-05   | N/A           | N/A            | 0.515 | N/A   | 0.716    | 0.868 | N/A       | 0.680 | N/A           |
| Gemini 2.5 Flash-Lite                    | 2025-06-17   | N/A           | N/A            | 0.646 | N/A   | N/A      | N/A   | N/A       | 0.729 | 0.337         |
| Gemini 2.5 Pro Preview 06-05             | 2025-06-05   | N/A           | N/A            | 0.864 | N/A   | N/A      | N/A   | N/A       | 0.820 | 0.690         |
| Gemini 2.5 Flash                         | 2025-05-20   | N/A           | N/A            | 0.828 | N/A   | N/A      | N/A   | N/A       | 0.797 | N/A           |
| Gemini 2.0 Flash Thinking                | 2025-01-21   | N/A           | N/A            | 0.742 | N/A   | N/A      | N/A   | N/A       | 0.754 | N/A           |
| Gemma 3n E2B                             | 2025-06-26   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| MedGemma 4B IT                           | 2025-05-20   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| Gemma 3 4B                               | 2025-03-12   | N/A           | N/A            | 0.308 | N/A   | 0.436    | 0.756 | 0.713     | N/A   | 0.126         |
| Gemma 3 27B                              | 2025-03-12   | N/A           | N/A            | 0.424 | N/A   | 0.675    | 0.890 | 0.878     | N/A   | 0.297         |
| Gemma 3 1B                               | 2025-03-12   | N/A           | N/A            | 0.192 | N/A   | 0.147    | 0.480 | 0.415     | N/A   | 0.019         |
| Gemini 1.5 Flash 8B                      | 2024-03-15   | N/A           | N/A            | 0.384 | N/A   | 0.587    | 0.587 | N/A       | 0.537 | N/A           |
| Gemini Diffusion                         | 2025-05-20   | N/A           | N/A            | 0.404 | N/A   | N/A      | N/A   | 0.896     | N/A   | 0.309         |
| Gemini 2.0 Flash                         | 2024-12-01   | N/A           | N/A            | 0.621 | N/A   | 0.764    | 0.897 | N/A       | 0.707 | 0.351         |
| Phi 4 Mini Reasoning                     | 2025-04-30   | N/A           | N/A            | 0.520 | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| Phi-3.5-vision-instruct                  | 2024-08-23   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | 0.430 | N/A           |
| Phi 4 Reasoning Plus                     | 2025-04-30   | N/A           | N/A            | 0.689 | N/A   | 0.760    | N/A   | N/A       | N/A   | 0.531         |
| Phi-4-multimodal-instruct                | 2025-02-01   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | 0.551 | N/A           |
| Phi 4 Reasoning                          | 2025-04-30   | N/A           | N/A            | 0.658 | N/A   | 0.743    | N/A   | N/A       | N/A   | 0.538         |
| Qwen3-235B-A22B-Instruct-2507            | 2025-07-22   | N/A           | N/A            | 0.775 | N/A   | 0.830    | N/A   | N/A       | N/A   | N/A           |
| QwQ-32B                                  | 2025-03-05   | N/A           | N/A            | 0.652 | N/A   | N/A      | N/A   | N/A       | N/A   | 0.634         |
| Qwen3-235B-A22B-Thinking-2507            | 2025-07-25   | N/A           | N/A            | 0.811 | N/A   | 0.844    | N/A   | N/A       | N/A   | N/A           |
| QwQ-32B-Preview                          | 2024-11-28   | N/A           | N/A            | 0.652 | N/A   | N/A      | N/A   | N/A       | N/A   | 0.500         |
| Qwen3-Next-80B-A3B-Thinking              | 2025-09-10   | N/A           | N/A            | 0.772 | N/A   | 0.827    | N/A   | N/A       | N/A   | N/A           |
| Qwen2-VL-72B-Instruct                    | 2024-08-29   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| Qwen3 32B                                | 2025-04-29   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | N/A   | 0.657         |
| Qwen2.5 72B Instruct                     | 2024-09-19   | N/A           | N/A            | 0.490 | N/A   | 0.711    | 0.831 | 0.866     | N/A   | 0.555         |
| Qwen3 30B A3B                            | 2025-04-29   | N/A           | N/A            | 0.658 | N/A   | N/A      | N/A   | N/A       | N/A   | 0.626         |
| Qwen2.5 VL 7B Instruct                   | 2025-01-26   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | 0.586 | N/A           |
| Qwen3-Next-80B-A3B-Base                  | 2025-09-10   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| QvQ-72B-Preview                          | 2024-12-25   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | 0.703 | N/A           |
| Qwen2.5-Omni-7B                          | 2025-03-27   | N/A           | N/A            | 0.308 | N/A   | 0.470    | 0.715 | 0.787     | 0.592 | N/A           |
| Qwen2.5 7B Instruct                      | 2024-09-19   | N/A           | N/A            | 0.364 | N/A   | 0.563    | 0.755 | 0.848     | N/A   | 0.287         |
| Qwen3-Next-80B-A3B-Instruct              | 2025-09-10   | N/A           | N/A            | 0.729 | N/A   | 0.806    | N/A   | N/A       | N/A   | N/A           |
| Qwen2.5 VL 72B Instruct                  | 2025-01-26   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | 0.702 | N/A           |
| DeepSeek-R1-0528                         | 2025-05-28   | N/A           | N/A            | N/A   | N/A   | 0.850    | N/A   | N/A       | N/A   | 0.733         |
| DeepSeek VL2                             | 2024-12-13   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | 0.511 | N/A           |
| DeepSeek VL2 Tiny                        | 2024-12-13   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | 0.407 | N/A           |
| DeepSeek R1 Zero                         | 2025-01-20   | N/A           | N/A            | 0.733 | N/A   | N/A      | N/A   | N/A       | N/A   | 0.500         |
| DeepSeek VL2 Small                       | 2024-12-13   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | 0.480 | N/A           |
| DeepSeek R1 Distill Qwen 7B              | 2025-01-20   | N/A           | N/A            | 0.491 | N/A   | N/A      | N/A   | N/A       | N/A   | 0.376         |
| DeepSeek R1 Distill Qwen 1.5B            | 2025-01-20   | N/A           | N/A            | 0.338 | N/A   | N/A      | N/A   | N/A       | N/A   | 0.169         |
| DeepSeek-R1                              | 2025-01-20   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| DeepSeek R1 Distill Llama 8B             | 2025-01-20   | N/A           | N/A            | 0.490 | N/A   | N/A      | N/A   | N/A       | N/A   | 0.396         |
| DeepSeek R1 Distill Llama 70B            | 2025-01-20   | N/A           | N/A            | 0.652 | N/A   | N/A      | N/A   | N/A       | N/A   | 0.575         |
| DeepSeek R1 Distill Qwen 14B             | 2025-01-20   | N/A           | N/A            | 0.591 | N/A   | N/A      | N/A   | N/A       | N/A   | 0.531         |
| DeepSeek R1 Distill Qwen 32B             | 2025-01-20   | N/A           | N/A            | 0.621 | N/A   | N/A      | N/A   | N/A       | N/A   | 0.572         |
| DeepSeek-V3.1                            | 2025-01-10   | N/A           | N/A            | N/A   | N/A   | 0.837    | N/A   | N/A       | N/A   | 0.564         |
| DeepSeek-V3.2-Exp                        | 2025-09-29   | N/A           | N/A            | N/A   | N/A   | 0.850    | N/A   | N/A       | N/A   | 0.741         |
| DeepSeek-V3 0324                         | 2025-03-25   | N/A           | N/A            | 0.684 | N/A   | 0.812    | N/A   | N/A       | N/A   | 0.492         |
| Grok-3 Mini                              | 2025-02-17   | N/A           | N/A            | 0.840 | N/A   | N/A      | N/A   | N/A       | N/A   | 0.804         |
| Grok-4 Heavy                             | 2025-07-09   | N/A           | N/A            | 0.884 | N/A   | N/A      | N/A   | N/A       | N/A   | 0.794         |
| Grok-4                                   | 2025-07-09   | N/A           | N/A            | 0.875 | N/A   | N/A      | N/A   | N/A       | N/A   | 0.790         |
| Grok-3                                   | 2025-02-17   | N/A           | N/A            | 0.846 | N/A   | N/A      | N/A   | N/A       | 0.780 | 0.794         |
| Grok-1.5V                                | 2024-04-12   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | 0.536 | N/A           |
| GLM-4.5V                                 | 2025-08-11   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| GLM-4.5-Air                              | 2025-07-28   | N/A           | N/A            | 0.750 | N/A   | 0.814    | N/A   | N/A       | N/A   | 0.707         |
| GLM-4.5                                  | 2025-07-28   | N/A           | N/A            | 0.791 | N/A   | 0.846    | N/A   | N/A       | N/A   | 0.729         |
| Llama-3.3 Nemotron Super 49B v1          | 2025-03-18   | N/A           | N/A            | 0.667 | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| Llama 3.1 Nemotron Nano 8B V1            | 2025-03-18   | N/A           | N/A            | 0.541 | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| Llama 3.1 Nemotron Ultra 253B v1         | 2025-04-07   | N/A           | N/A            | 0.760 | N/A   | N/A      | N/A   | N/A       | N/A   | 0.663         |
| Claude Opus 4.1                          | 2025-08-05   | N/A           | N/A            | 0.809 | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| Claude Sonnet 4.5                        | 2025-09-29   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| Claude 3.5 Haiku                         | 2024-10-22   | N/A           | N/A            | 0.416 | N/A   | 0.650    | 0.694 | 0.881     | N/A   | N/A           |
| Claude 3.7 Sonnet                        | 2025-02-24   | N/A           | N/A            | 0.848 | N/A   | N/A      | N/A   | N/A       | 0.750 | N/A           |
| Claude Sonnet 4                          | 2025-05-22   | N/A           | N/A            | 0.754 | N/A   | N/A      | N/A   | N/A       | 0.744 | N/A           |
| Claude Opus 4                            | 2025-05-22   | N/A           | N/A            | 0.796 | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| Magistral Small 2506                     | 2025-06-10   | N/A           | N/A            | 0.682 | N/A   | N/A      | N/A   | N/A       | N/A   | 0.513         |
| Magistral Medium                         | 2025-06-10   | N/A           | N/A            | 0.708 | N/A   | N/A      | N/A   | N/A       | N/A   | 0.503         |
| Devstral Medium                          | 2025-07-10   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| Pixtral Large                            | 2024-11-18   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | 0.640 | N/A           |
| Mistral Small 3 24B Instruct             | 2025-01-30   | N/A           | N/A            | 0.453 | N/A   | 0.663    | 0.706 | 0.848     | N/A   | N/A           |
| Devstral Small 1.1                       | 2025-07-11   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| Codestral-22B                            | 2024-05-29   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | 0.811     | N/A   | N/A           |
| Mistral Small                            | 2024-09-17   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| GPT OSS 120B                             | 2025-08-05   | N/A           | N/A            | 0.801 | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| o3                                       | 2025-04-16   | N/A           | N/A            | 0.833 | N/A   | N/A      | N/A   | N/A       | 0.829 | N/A           |
| GPT OSS 20B                              | 2025-08-05   | N/A           | N/A            | 0.715 | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| o4-mini                                  | 2025-04-16   | N/A           | N/A            | 0.814 | N/A   | N/A      | N/A   | N/A       | 0.816 | N/A           |
| o3-pro                                   | 2025-06-10   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| o1-pro                                   | 2024-12-17   | N/A           | N/A            | 0.790 | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| GPT-5 nano                               | 2025-08-07   | N/A           | N/A            | 0.712 | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| GPT-5 mini                               | 2025-08-07   | N/A           | N/A            | 0.823 | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |
| GPT-5 Codex                              | 2025-09-15   | N/A           | N/A            | N/A   | N/A   | N/A      | N/A   | N/A       | N/A   | N/A           |

<div align="center">
Built with 💙 by the AI community, for the AI community.<br>
Star this repo if you find it useful!
</div>


================================================
FILE: data/.github/CODEOWNERS
================================================
* @JonathanChavezTamales @sebastiancrossa


================================================
FILE: data/benchmarks/aa-index.json
================================================
{
  "benchmark_id": "aa-index",
  "name": "AA-Index",
  "parent_benchmark_id": null,
  "categories": ["general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "No official academic documentation found for this benchmark. Extensive research through ArXiv, IEEE/ACL/NeurIPS papers, and university research sites yielded no peer-reviewed sources for an 'aa-index' benchmark. This entry requires verification from official academic sources.",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-28T00:00:00.000000+00:00",
  "updated_at": "2025-07-28T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/acebench.json
================================================
{
  "benchmark_id": "acebench",
  "name": "ACEBench",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "ACEBench is a comprehensive benchmark for evaluating Large Language Models' tool usage capabilities across three primary evaluation types: Normal (basic tool usage scenarios), Special (tool usage with ambiguous or incomplete instructions), and Agent (multi-agent interactions simulating real-world dialogues). The benchmark covers 4,538 APIs across 8 major domains and 68 sub-domains including technology, finance, entertainment, society, health, culture, and environment, supporting both English and Chinese languages.",
  "paper_link": "https://arxiv.org/abs/2501.12851",
  "implementation_link": "https://github.com/ACEBench/ACEBench",
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-30T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/activitynet.json
================================================
{
  "benchmark_id": "activitynet",
  "name": "ActivityNet",
  "parent_benchmark_id": null,
  "categories": ["vision", "video"],
  "modality": "video",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A large-scale video benchmark for human activity understanding. Provides samples from 203 activity classes with an average of 137 untrimmed videos per class and 1.41 activity instances per video, for a total of 849 video hours. The benchmark covers a wide range of complex human activities that are of interest to people in their daily living and can be used to compare algorithms for three scenarios: untrimmed video classification, trimmed activity classification, and activity detection.",
  "paper_link": "https://openaccess.thecvf.com/content_cvpr_2015/html/Heilbron_ActivityNet_A_Large-Scale_2015_CVPR_paper.html",
  "implementation_link": "https://github.com/activitynet/ActivityNet",
  "verified": false,
  "created_at": "2025-07-19T19:56:15.378371+00:00",
  "updated_at": "2025-07-19T19:56:15.378371+00:00"
}


================================================
FILE: data/benchmarks/agieval.json
================================================
{
  "benchmark_id": "agieval",
  "name": "AGIEval",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general", "math"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A human-centric benchmark for evaluating foundation models on standardized exams including college entrance exams (Gaokao, SAT), law school admission tests (LSAT), math competitions, lawyer qualification tests, and civil service exams. Contains 20 tasks (18 multiple-choice, 2 cloze) designed to assess understanding, knowledge, reasoning, and calculation abilities in real-world academic and professional contexts.",
  "paper_link": "https://arxiv.org/abs/2304.06364",
  "implementation_link": "https://github.com/ruixiangcui/AGIEval",
  "verified": false,
  "created_at": "2025-07-19T19:56:13.970928+00:00",
  "updated_at": "2025-07-19T19:56:13.970928+00:00"
}


================================================
FILE: data/benchmarks/ai2-reasoning-challenge-(arc).json
================================================
{
  "benchmark_id": "ai2-reasoning-challenge-(arc)",
  "name": "AI2 Reasoning Challenge (ARC)",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A dataset of 7,787 genuine grade-school level, multiple-choice science questions assembled to encourage research in advanced question-answering. The dataset is partitioned into a Challenge Set and Easy Set, where the Challenge Set contains only questions answered incorrectly by both retrieval-based and word co-occurrence algorithms. Covers multiple scientific domains including biology, physics, earth science, and chemistry, requiring scientific reasoning, causal understanding, and conceptual knowledge beyond simple fact retrieval. Includes a supporting corpus of over 14 million science sentences.",
  "paper_link": "https://arxiv.org/abs/1803.05457",
  "implementation_link": "https://github.com/allenai/ARC-Solvers",
  "verified": false,
  "created_at": "2025-07-19T19:56:15.419158+00:00",
  "updated_at": "2025-07-19T19:56:15.419158+00:00"
}


================================================
FILE: data/benchmarks/ai2d.json
================================================
{
  "benchmark_id": "ai2d",
  "name": "AI2D",
  "parent_benchmark_id": null,
  "categories": ["vision", "reasoning", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "AI2D is a dataset of 4,903 illustrative diagrams from grade school natural sciences (such as food webs, human physiology, and life cycles) with over 15,000 multiple choice questions and answers. The benchmark evaluates diagram understanding and visual reasoning capabilities, requiring models to interpret diagrammatic elements, relationships, and structure to answer questions about scientific concepts represented in visual form.",
  "paper_link": "https://arxiv.org/abs/1603.07396",
  "implementation_link": "https://allenai.org/data/diagrams",
  "verified": false,
  "created_at": "2025-07-19T19:56:13.618926+00:00",
  "updated_at": "2025-07-19T19:56:13.618926+00:00"
}


================================================
FILE: data/benchmarks/aider-polyglot-edit.json
================================================
{
  "benchmark_id": "aider-polyglot-edit",
  "name": "Aider-Polyglot Edit",
  "parent_benchmark_id": null,
  "categories": ["general", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A challenging multi-language coding benchmark that evaluates models' code editing abilities across C++, Go, Java, JavaScript, Python, and Rust. Contains 225 of Exercism's most difficult programming problems, selected as problems that were solved by 3 or fewer out of 7 top coding models. The benchmark focuses on code editing tasks and measures both correctness of solutions and proper edit format usage. Designed to re-calibrate evaluation scales so top models score between 5-50%.",
  "paper_link": null,
  "implementation_link": "https://github.com/Aider-AI/polyglot-benchmark",
  "verified": false,
  "created_at": "2025-07-19T19:56:13.789839+00:00",
  "updated_at": "2025-09-30T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/aider-polyglot.json
================================================
{
  "benchmark_id": "aider-polyglot",
  "name": "Aider-Polyglot",
  "parent_benchmark_id": null,
  "categories": ["general", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A coding benchmark that evaluates LLMs on 225 challenging Exercism programming exercises across C++, Go, Java, JavaScript, Python, and Rust. Models receive two attempts to solve each problem, with test error feedback provided after the first attempt if it fails. The benchmark measures both initial problem-solving ability and capacity to edit code based on error feedback, providing an end-to-end evaluation of code generation and editing capabilities across multiple programming languages.",
  "paper_link": null,
  "implementation_link": "https://github.com/Aider-AI/polyglot-benchmark",
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-30T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/aider.json
================================================
{
  "benchmark_id": "aider",
  "name": "Aider",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Aider is a comprehensive code editing benchmark based on 133 practice exercises from Exercism's Python repository, designed to evaluate AI models' ability to translate natural language coding requests into executable code that passes unit tests. The benchmark measures end-to-end code editing capabilities, including GPT's ability to edit existing code and format code changes for automated saving to local files. The Aider Polyglot variant extends this evaluation across 225 challenging exercises spanning C++, Go, Java, JavaScript, Python, and Rust, making it a standard benchmark for assessing multilingual code editing performance in AI research.",
  "paper_link": null,
  "implementation_link": "https://github.com/Aider-AI/aider",
  "verified": false,
  "created_at": "2025-07-19T19:56:14.566857+00:00",
  "updated_at": "2025-07-19T19:56:14.566857+00:00"
}


================================================
FILE: data/benchmarks/aime-2024.json
================================================
{
  "benchmark_id": "aime-2024",
  "name": "AIME 2024",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "American Invitational Mathematics Examination 2024, consisting of 30 challenging mathematical reasoning problems from AIME I and AIME II competitions. Each problem requires an integer answer between 0-999 and tests advanced mathematical reasoning across algebra, geometry, combinatorics, and number theory. Used as a benchmark for evaluating mathematical reasoning capabilities in large language models at Olympiad-level difficulty.",
  "paper_link": "https://arxiv.org/html/2503.21380v2",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:11.941652+00:00",
  "updated_at": "2025-09-30T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/aime-2025.json
================================================
{
  "benchmark_id": "aime-2025",
  "name": "AIME 2025",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "All 30 problems from the 2025 American Invitational Mathematics Examination (AIME I and AIME II), testing olympiad-level mathematical reasoning with integer answers from 000-999. Used as an AI benchmark to evaluate large language models' ability to solve complex mathematical problems requiring multi-step logical deductions and structured symbolic reasoning.",
  "paper_link": "https://arxiv.org/abs/2503.21380",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-05T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/aime.json
================================================
{
  "benchmark_id": "aime",
  "name": "AIME",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "American Invitational Mathematics Examination (AIME) benchmark for evaluating mathematical reasoning capabilities of large language models. Contains 30 challenging mathematical problems from AIME 2024 competition that require multi-step reasoning and advanced mathematical insight. Each problem has an integer answer between 000-999.",
  "paper_link": "https://arxiv.org/html/2503.21380v2",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.057279+00:00",
  "updated_at": "2025-07-19T19:56:14.057279+00:00"
}

================================================
FILE: data/benchmarks/aitz-em.json
================================================
{
  "benchmark_id": "aitz-em",
  "name": "AITZ_EM",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Android-In-The-Zoo (AitZ) benchmark for evaluating autonomous GUI agents on smartphones. Contains 18,643 screen-action pairs with chain-of-action-thought annotations spanning over 70 Android apps. Designed to connect perception (screen layouts and UI elements) with cognition (action decision-making) for natural language-triggered smartphone task completion.",
  "paper_link": "https://arxiv.org/abs/2403.02713",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.785085+00:00",
  "updated_at": "2025-07-19T19:56:14.785085+00:00"
}

================================================
FILE: data/benchmarks/alignbench.json
================================================
{
  "benchmark_id": "alignbench",
  "name": "AlignBench",
  "parent_benchmark_id": null,
  "categories": ["general", "language", "math", "reasoning", "roleplay"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "AlignBench is a comprehensive multi-dimensional benchmark for evaluating Chinese alignment of Large Language Models. It contains 8 main categories: Fundamental Language Ability, Advanced Chinese Understanding, Open-ended Questions, Writing Ability, Logical Reasoning, Mathematics, Task-oriented Role Play, and Professional Knowledge. The benchmark includes 683 real-scenario rooted queries with human-verified references and uses a rule-calibrated multi-dimensional LLM-as-Judge approach with Chain-of-Thought for evaluation.",
  "paper_link": "https://arxiv.org/abs/2311.18743",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.542033+00:00",
  "updated_at": "2025-07-19T19:56:14.542033+00:00"
}

================================================
FILE: data/benchmarks/alpacaeval-2.0.json
================================================
{
  "benchmark_id": "alpacaeval-2.0",
  "name": "AlpacaEval 2.0",
  "parent_benchmark_id": null,
  "categories": ["general", "creativity", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "AlpacaEval 2.0 is a length-controlled automatic evaluator for instruction-following language models that uses GPT-4 Turbo to assess model responses against a baseline. It evaluates models on 805 diverse instruction-following tasks including creative writing, classification, programming, and general knowledge questions. The benchmark achieves 0.98 Spearman correlation with ChatBot Arena while being fast (< 3 minutes) and affordable (< $10 in OpenAI credits). It addresses length bias in automatic evaluation through length-controlled win-rates and uses weighted scoring based on response quality.",
  "paper_link": "https://arxiv.org/abs/2404.04475",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.038178+00:00",
  "updated_at": "2025-07-19T19:56:15.038178+00:00"
}

================================================
FILE: data/benchmarks/amc-2022-23.json
================================================
{
  "benchmark_id": "amc-2022-23",
  "name": "AMC_2022_23",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "American Mathematics Competition problems from the 2022-23 academic year, consisting of multiple-choice mathematics competition problems designed for high school students. These problems require advanced mathematical reasoning, problem-solving strategies, and mathematical knowledge covering topics like algebra, geometry, number theory, and combinatorics. The benchmark is derived from the official AMC competitions sponsored by the Mathematical Association of America.",
  "paper_link": "https://arxiv.org/abs/2103.03874",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.992903+00:00",
  "updated_at": "2025-07-19T19:56:13.992903+00:00"
}

================================================
FILE: data/benchmarks/android-control-high-em.json
================================================
{
  "benchmark_id": "android-control-high-em",
  "name": "Android Control High_EM",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Android device control benchmark using high exact match evaluation metric for assessing agent performance on mobile interface tasks",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.792498+00:00",
  "updated_at": "2025-07-19T19:56:14.792498+00:00"
}

================================================
FILE: data/benchmarks/android-control-low-em.json
================================================
{
  "benchmark_id": "android-control-low-em",
  "name": "Android Control Low_EM",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Android control benchmark evaluating autonomous agents on mobile device interaction tasks with low exact match scoring criteria",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.800337+00:00",
  "updated_at": "2025-07-19T19:56:14.800337+00:00"
}

================================================
FILE: data/benchmarks/androidworld-sr.json
================================================
{
  "benchmark_id": "androidworld-sr",
  "name": "AndroidWorld_SR",
  "parent_benchmark_id": null,
  "categories": ["general", "multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "AndroidWorld Success Rate (SR) benchmark - A dynamic benchmarking environment for autonomous agents operating on Android devices. Evaluates agents on 116 programmatic tasks across 20 real-world Android apps using multimodal inputs (screen screenshots, accessibility trees, and natural language instructions). Measures success rate of agents completing tasks like sending messages, creating calendar events, and navigating mobile interfaces. Published at ICLR 2025. Best current performance: 30.6% success rate (M3A agent) vs 80.0% human performance.",
  "paper_link": "https://arxiv.org/abs/2405.14573",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.808659+00:00",
  "updated_at": "2025-07-19T19:56:14.808659+00:00"
}

================================================
FILE: data/benchmarks/api-bank.json
================================================
{
  "benchmark_id": "api-bank",
  "name": "API-Bank",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A comprehensive benchmark for tool-augmented LLMs that evaluates API planning, retrieval, and calling capabilities. Contains 314 tool-use dialogues with 753 API calls across 73 API tools, designed to assess how effectively LLMs can utilize external tools and overcome obstacles in tool leveraging.",
  "paper_link": "https://arxiv.org/abs/2304.08244",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.374447+00:00",
  "updated_at": "2025-07-19T19:56:14.374447+00:00"
}

================================================
FILE: data/benchmarks/arc-agi-v2.json
================================================
{
  "benchmark_id": "arc-agi-v2",
  "name": "ARC-AGI v2",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "vision", "spatial_reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "ARC-AGI-2 is an upgraded benchmark for measuring abstract reasoning and problem-solving abilities in AI systems through visual grid transformation tasks. It evaluates fluid intelligence via input-output grid pairs (1x1 to 30x30) using colored cells (0-9), requiring models to identify underlying transformation rules from demonstration examples and apply them to test cases. Designed to be easy for humans but challenging for AI, focusing on core cognitive abilities like spatial reasoning, pattern recognition, and compositional generalization.",
  "paper_link": "https://arxiv.org/abs/2505.11831",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.916360+00:00",
  "updated_at": "2025-07-19T19:56:13.916360+00:00"
}

================================================
FILE: data/benchmarks/arc-agi.json
================================================
{
  "benchmark_id": "arc-agi",
  "name": "ARC-AGI",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "vision", "spatial_reasoning"],
  "modality": "image",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "The Abstraction and Reasoning Corpus for Artificial General Intelligence (ARC-AGI) is a benchmark designed to test general intelligence and abstract reasoning capabilities through visual grid-based transformation tasks. Each task consists of 2-5 demonstration pairs showing input grids transformed into output grids according to underlying rules, with test-takers required to infer these rules and apply them to novel test inputs. The benchmark uses colored grids (up to 30x30) with 10 discrete colors/symbols, designed to measure human-like general fluid intelligence and skill-acquisition efficiency with minimal prior knowledge.",
  "paper_link": "https://arxiv.org/abs/1911.01547",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.187761+00:00",
  "updated_at": "2025-07-19T19:56:15.187761+00:00"
}

================================================
FILE: data/benchmarks/arc-c.json
================================================
{
  "benchmark_id": "arc-c",
  "name": "ARC-C",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "The AI2 Reasoning Challenge (ARC) Challenge Set is a multiple-choice question-answering benchmark containing grade-school level science questions that require advanced reasoning capabilities. ARC-C specifically contains questions that were answered incorrectly by both retrieval-based and word co-occurrence algorithms, making it a particularly challenging subset designed to test commonsense reasoning abilities in AI systems.",
  "paper_link": "https://arxiv.org/abs/1803.05457",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:11.052939+00:00",
  "updated_at": "2025-07-19T19:56:11.052939+00:00"
}

================================================
FILE: data/benchmarks/arc-e.json
================================================
{
  "benchmark_id": "arc-e",
  "name": "ARC-E",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "ARC-E (AI2 Reasoning Challenge - Easy Set) is a subset of grade-school level, multiple-choice science questions that requires knowledge and reasoning capabilities. Part of the AI2 Reasoning Challenge dataset containing 5,197 questions that test scientific reasoning and factual knowledge. The Easy Set contains questions that are answerable by retrieval-based and word co-occurrence algorithms, making them more accessible than the Challenge Set.",
  "paper_link": "https://arxiv.org/abs/1803.05457",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.192662+00:00",
  "updated_at": "2025-07-19T19:56:13.192662+00:00"
}

================================================
FILE: data/benchmarks/arc.json
================================================
{
  "benchmark_id": "arc",
  "name": "Arc",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "The Abstraction and Reasoning Corpus (ARC) is a benchmark designed to measure human-like general fluid intelligence through grid-based reasoning tasks. It consists of 800 tasks (400 training, 400 evaluation) where each task presents input-output grids that require understanding abstract patterns and transformations. Test-takers must produce exactly correct output grids for all test inputs in a task to solve it, with 3 trials allowed per test input. ARC aims to enable fair comparisons of general intelligence between AI systems and humans using priors designed to be as close as possible to innate human priors.",
  "paper_link": "https://arxiv.org/abs/1911.01547",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.967150+00:00",
  "updated_at": "2025-07-19T19:56:13.967150+00:00"
}

================================================
FILE: data/benchmarks/arena-hard-v2.json
================================================
{
  "benchmark_id": "arena-hard-v2",
  "name": "Arena-Hard v2",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning", "creativity"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Arena-Hard-Auto v2 is a challenging benchmark consisting of 500 carefully curated prompts sourced from Chatbot Arena and WildChat-1M, designed to evaluate large language models on real-world user queries. The benchmark covers diverse domains including open-ended software engineering problems, mathematics, creative writing, and technical problem-solving. It uses LLM-as-a-Judge for automatic evaluation, achieving 98.6% correlation with human preference rankings while providing 3x higher separation of model performances compared to MT-Bench. The benchmark emphasizes prompt specificity, complexity, and domain knowledge to better distinguish between model capabilities.",
  "paper_link": "https://arxiv.org/abs/2406.11939",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-08-03T22:06:11.411643+00:00",
  "updated_at": "2025-08-03T22:06:11.411643+00:00"
}

================================================
FILE: data/benchmarks/arena-hard.json
================================================
{
  "benchmark_id": "arena-hard",
  "name": "Arena Hard",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning", "creativity"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Arena-Hard-Auto is an automatic evaluation benchmark for instruction-tuned LLMs consisting of 500 challenging real-world prompts curated by BenchBuilder. It includes open-ended software engineering problems, mathematical questions, and creative writing tasks. The benchmark uses LLM-as-a-Judge methodology with GPT-4.1 and Gemini-2.5 as automatic judges to approximate human preference. Arena-Hard achieves 98.6% correlation with human preference rankings and provides 3x higher separation of model performances compared to MT-Bench, making it highly effective for distinguishing between models of similar quality.",
  "paper_link": "https://arxiv.org/abs/2406.11939",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.079874+00:00",
  "updated_at": "2025-07-19T19:56:14.079874+00:00"
}

================================================
FILE: data/benchmarks/attaq.json
================================================
{
  "benchmark_id": "attaq",
  "name": "AttaQ",
  "parent_benchmark_id": null,
  "categories": ["safety"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "AttaQ is a unique dataset containing adversarial examples in the form of questions designed to provoke harmful or inappropriate responses from large language models. The benchmark evaluates safety vulnerabilities by using specialized clustering techniques that analyze both the semantic similarity of input attacks and the harmfulness of model responses, facilitating targeted improvements to model safety mechanisms.",
  "paper_link": "https://arxiv.org/abs/2311.04124",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.079764+00:00",
  "updated_at": "2025-07-19T19:56:15.079764+00:00"
}

================================================
FILE: data/benchmarks/autologi.json
================================================
{
  "benchmark_id": "autologi",
  "name": "AutoLogi",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "AutoLogi is an automated method for synthesizing open-ended logic puzzles to evaluate reasoning abilities of Large Language Models. The benchmark addresses limitations of existing multiple-choice reasoning evaluations by featuring program-based verification and controllable difficulty levels. It includes 1,575 English and 883 Chinese puzzles, enabling more reliable evaluation that better distinguishes models' reasoning capabilities across languages.",
  "paper_link": "https://arxiv.org/abs/2502.16906",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-05T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/bbh.json
================================================
{
  "benchmark_id": "bbh",
  "name": "BBH",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "math", "language"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Big-Bench Hard (BBH) is a suite of 23 challenging tasks selected from BIG-Bench for which prior language model evaluations did not outperform the average human-rater. These tasks require multi-step reasoning across diverse domains including arithmetic, logical reasoning, reading comprehension, and commonsense reasoning. The benchmark was designed to test capabilities believed to be beyond current language models and focuses on evaluating complex reasoning skills including temporal understanding, spatial reasoning, causal understanding, and deductive logical reasoning.",
  "paper_link": "https://arxiv.org/abs/2210.09261",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.031859+00:00",
  "updated_at": "2025-07-19T19:56:13.031859+00:00"
}

================================================
FILE: data/benchmarks/bfcl-v2.json
================================================
{
  "benchmark_id": "bfcl-v2",
  "name": "BFCL v2",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "Berkeley Function Calling Leaderboard (BFCL) v2 is a comprehensive benchmark for evaluating large language models' function calling capabilities. It features 2,251 question-function-answer pairs with enterprise and OSS-contributed functions, addressing data contamination and bias through live, user-contributed scenarios. The benchmark evaluates AST accuracy, executable accuracy, irrelevance detection, and relevance detection across multiple programming languages (Python, Java, JavaScript) and includes complex real-world function calling scenarios with multi-lingual prompts.",
  "paper_link": "https://arxiv.org/abs/2305.15334",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.444045+00:00",
  "updated_at": "2025-07-19T19:56:14.444045+00:00"
}

================================================
FILE: data/benchmarks/bfcl-v3-multiturn.json
================================================
{
  "benchmark_id": "bfcl-v3-multiturn",
  "name": "BFCL_v3_MultiTurn",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Berkeley Function Calling Leaderboard (BFCL) V3 MultiTurn benchmark that evaluates large language models' ability to handle multi-turn and multi-step function calling scenarios. The benchmark introduces complex interactions requiring models to manage sequential function calls, handle conversational context across multiple turns, and make dynamic decisions about when and how to use available functions. BFCL V3 uses state-based evaluation by verifying the actual state of API systems after function execution, providing more realistic assessment of function calling capabilities in agentic applications.",
  "paper_link": "https://openreview.net/forum?id=2GmDdhBdDk",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.962161+00:00",
  "updated_at": "2025-07-19T19:56:14.962161+00:00"
}

================================================
FILE: data/benchmarks/bfcl-v3.json
================================================
{
  "benchmark_id": "bfcl-v3",
  "name": "BFCL-v3",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Berkeley Function Calling Leaderboard v3 (BFCL-v3) is an advanced benchmark that evaluates large language models' function calling capabilities through multi-turn and multi-step interactions. It introduces extended conversational exchanges where models must retain contextual information across turns and execute multiple internal function calls for complex user requests. The benchmark includes 1000 test cases across domains like vehicle control, trading bots, travel booking, and file system management, using state-based evaluation to verify both system state changes and execution path correctness.",
  "paper_link": "https://openreview.net/forum?id=2GmDdhBdDk",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-08-03T22:06:11.216985+00:00",
  "updated_at": "2025-08-03T22:06:11.216985+00:00"
}

================================================
FILE: data/benchmarks/bfcl.json
================================================
{
  "benchmark_id": "bfcl",
  "name": "BFCL",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "The Berkeley Function Calling Leaderboard (BFCL) is the first comprehensive and executable function call evaluation dedicated to assessing Large Language Models' ability to invoke functions. It evaluates serial and parallel function calls across multiple programming languages (Python, Java, JavaScript, REST API) using a novel Abstract Syntax Tree (AST) evaluation method. The benchmark consists of over 2,000 question-function-answer pairs covering diverse application domains and complex use cases including multiple function calls, parallel function calls, and multi-turn interactions.",
  "paper_link": "https://openreview.net/pdf?id=2GmDdhBdDk",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.763704+00:00",
  "updated_at": "2025-07-19T19:56:12.763704+00:00"
}

================================================
FILE: data/benchmarks/big-bench-extra-hard.json
================================================
{
  "benchmark_id": "big-bench-extra-hard",
  "name": "BIG-Bench Extra Hard",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general", "language"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "BIG-Bench Extra Hard (BBEH) is a challenging benchmark that replaces each task in BIG-Bench Hard with a novel task that probes similar reasoning capabilities but exhibits significantly increased difficulty. The benchmark contains 23 tasks testing diverse reasoning skills including many-hop reasoning, causal understanding, spatial reasoning, temporal arithmetic, geometric reasoning, linguistic reasoning, logic puzzles, and humor understanding. Designed to address saturation on existing benchmarks where state-of-the-art models achieve near-perfect scores, BBEH shows substantial room for improvement with best models achieving only 9.8-44.8% average accuracy.",
  "paper_link": "https://arxiv.org/abs/2502.19187",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.279517+00:00",
  "updated_at": "2025-07-19T19:56:13.279517+00:00"
}

================================================
FILE: data/benchmarks/big-bench-hard.json
================================================
{
  "benchmark_id": "big-bench-hard",
  "name": "BIG-Bench Hard",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "math", "language"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "BIG-Bench Hard (BBH) is a subset of 23 challenging BIG-Bench tasks selected because prior language model evaluations did not outperform average human-rater performance. The benchmark contains 6,511 evaluation examples testing various forms of multi-step reasoning including arithmetic, logical reasoning (Boolean expressions, logical deduction), geometric reasoning, temporal reasoning, and language understanding. Tasks require capabilities such as causal judgment, object counting, navigation, pattern recognition, and complex problem solving.",
  "paper_link": "https://arxiv.org/abs/2210.09261",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.222809+00:00",
  "updated_at": "2025-07-19T19:56:13.222809+00:00"
}

================================================
FILE: data/benchmarks/big-bench.json
================================================
{
  "benchmark_id": "big-bench",
  "name": "BIG-Bench",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "math", "language"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "Beyond the Imitation Game Benchmark (BIG-bench) is a collaborative benchmark consisting of 204+ tasks designed to probe large language models and extrapolate their future capabilities. It covers diverse domains including linguistics, mathematics, common-sense reasoning, biology, physics, social bias, software development, and more. The benchmark focuses on tasks believed to be beyond current language model capabilities and includes both English and non-English tasks across multiple languages.",
  "paper_link": "https://arxiv.org/abs/2206.04615",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.926457+00:00",
  "updated_at": "2025-07-19T19:56:13.926457+00:00"
}

================================================
FILE: data/benchmarks/bigcodebench-full.json
================================================
{
  "benchmark_id": "bigcodebench-full",
  "name": "BigCodeBench-Full",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A comprehensive benchmark that evaluates large language models' ability to solve complex, practical programming tasks via code generation. Contains 1,140 fine-grained tasks across 7 domains using function calls from 139 libraries. Challenges LLMs to invoke multiple function calls as tools and handle complex instructions for realistic software engineering and general-purpose reasoning tasks.",
  "paper_link": "https://arxiv.org/abs/2406.15877",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.508830+00:00",
  "updated_at": "2025-07-19T19:56:14.508830+00:00"
}

================================================
FILE: data/benchmarks/bigcodebench-hard.json
================================================
{
  "benchmark_id": "bigcodebench-hard",
  "name": "BigCodeBench-Hard",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "BigCodeBench-Hard is a subset of 148 challenging programming tasks from BigCodeBench, designed to evaluate large language models' ability to solve complex, real-world programming problems. These tasks require diverse function calls from multiple libraries across 7 domains including computation, networking, data analysis, and visualization. The benchmark tests compositional reasoning and the ability to implement complex instructions that span 139 libraries with an average of 2.8 libraries per task.",
  "paper_link": "https://arxiv.org/abs/2406.15877",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.512684+00:00",
  "updated_at": "2025-07-19T19:56:14.512684+00:00"
}

================================================
FILE: data/benchmarks/bigcodebench.json
================================================
{
  "benchmark_id": "bigcodebench",
  "name": "BigCodeBench",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained programming tasks. Evaluates code generation with diverse function calls and complex instructions, featuring two variants: Complete (code completion based on comprehensive docstrings) and Instruct (generating code from natural language instructions).",
  "paper_link": "https://arxiv.org/abs/2406.15877",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.048433+00:00",
  "updated_at": "2025-07-19T19:56:14.048433+00:00"
}

================================================
FILE: data/benchmarks/bird-sql-(dev).json
================================================
{
  "benchmark_id": "bird-sql-(dev)",
  "name": "Bird-SQL (dev)",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "BIRD (BIg Bench for LaRge-scale Database Grounded Text-to-SQLs) is a comprehensive text-to-SQL benchmark containing 12,751 question-SQL pairs across 95 databases (33.4 GB total) spanning 37+ professional domains. It evaluates large language models' ability to convert natural language to executable SQL queries in real-world scenarios with complex database schemas and dirty data.",
  "paper_link": "https://arxiv.org/abs/2305.03111",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.410905+00:00",
  "updated_at": "2025-07-19T19:56:13.410905+00:00"
}

================================================
FILE: data/benchmarks/blink.json
================================================
{
  "benchmark_id": "blink",
  "name": "BLINK",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "BLINK: Multimodal Large Language Models Can See but Not Perceive. A benchmark for multimodal language models focusing on core visual perception abilities. Reformats 14 classic computer vision tasks into 3,807 multiple-choice questions paired with single or multiple images and visual prompting. Tasks include relative depth estimation, visual correspondence, forensics detection, multi-view reasoning, counting, object localization, and spatial reasoning that humans can solve 'within a blink'.",
  "paper_link": "https://arxiv.org/abs/2404.12390",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.326398+00:00",
  "updated_at": "2025-07-19T19:56:14.326398+00:00"
}

================================================
FILE: data/benchmarks/boolq.json
================================================
{
  "benchmark_id": "boolq",
  "name": "BoolQ",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "BoolQ is a reading comprehension dataset for yes/no questions containing 15,942 naturally occurring examples. Each example consists of a question, passage, and boolean answer, where questions are generated in unprompted and unconstrained settings. The dataset challenges models with complex, non-factoid information requiring entailment-like inference to solve.",
  "paper_link": "https://arxiv.org/abs/1905.10044",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.117325+00:00",
  "updated_at": "2025-07-19T19:56:13.117325+00:00"
}

================================================
FILE: data/benchmarks/browsecomp-long-128k.json
================================================
{
  "benchmark_id": "browsecomp-long-128k",
  "name": "BrowseComp Long Context 128k",
  "parent_benchmark_id": "browsecomp",
  "categories": ["reasoning", "search"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A challenging benchmark for evaluating web browsing agents' ability to persistently navigate the internet and find hard-to-locate, entangled information. Comprises 1,266 questions requiring strategic reasoning, creative search, and interpretation of retrieved content, with short and easily verifiable answers.",
  "paper_link": "https://arxiv.org/abs/2504.12516",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-24T12:00:00.000000+00:00",
  "updated_at": "2025-07-24T12:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/browsecomp-long-256k.json
================================================
{
  "benchmark_id": "browsecomp-long-256k",
  "name": "BrowseComp Long Context 256k",
  "parent_benchmark_id": "browsecomp",
  "categories": ["reasoning", "search"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "BrowseComp is a benchmark for measuring the ability of agents to browse the web, comprising 1,266 questions that require persistently navigating the internet in search of hard-to-find, entangled information. Despite the difficulty of the questions, BrowseComp is simple and easy-to-use, as predicted answers are short and easily verifiable against reference answers. The benchmark focuses on questions where answers are obscure, time-invariant, and well-supported by evidence scattered across the open web.",
  "paper_link": "https://arxiv.org/abs/2504.12516",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-24T12:00:00.000000+00:00",
  "updated_at": "2025-07-24T12:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/browsecomp-zh.json
================================================
{
  "benchmark_id": "browsecomp-zh",
  "name": "BrowseComp-zh",
  "parent_benchmark_id": "browsecomp",
  "categories": ["reasoning", "search"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "zh",
  "description": "A high-difficulty benchmark purpose-built to comprehensively evaluate LLM agents on the Chinese web, consisting of 289 multi-hop questions spanning 11 diverse domains including Film & TV, Technology, Medicine, and History. Questions are reverse-engineered from short, objective, and easily verifiable answers, requiring sophisticated reasoning and information reconciliation beyond basic retrieval. The benchmark addresses linguistic, infrastructural, and censorship-related complexities in Chinese web environments.",
  "paper_link": "https://arxiv.org/abs/2504.19314",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-15T00:00:00.000000+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/browsecomp.json
================================================
{
  "benchmark_id": "browsecomp",
  "name": "BrowseComp",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "search"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "BrowseComp is a benchmark comprising 1,266 questions that challenge AI agents to persistently navigate the internet in search of hard-to-find, entangled information. The benchmark measures agents' ability to exercise persistence in information gathering, demonstrate creativity in web navigation, and find concise, verifiable answers. Despite the difficulty of the questions, BrowseComp is simple and easy-to-use, as predicted answers are short and easily verifiable against reference answers.",
  "paper_link": "https://arxiv.org/abs/2504.12516",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-28T00:00:00.000000+00:00",
  "updated_at": "2025-07-28T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/c-eval.json
================================================
{
  "benchmark_id": "c-eval",
  "name": "C-Eval",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "C-Eval is a comprehensive Chinese evaluation suite designed to assess advanced knowledge and reasoning abilities of foundation models in a Chinese context. It comprises 13,948 multiple-choice questions across 52 diverse disciplines spanning humanities, science, and engineering, with four difficulty levels: middle school, high school, college, and professional. The benchmark includes C-Eval Hard, a subset of very challenging subjects requiring advanced reasoning abilities.",
  "paper_link": "https://arxiv.org/abs/2305.08322",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:11.917478+00:00",
  "updated_at": "2025-07-19T19:56:11.917478+00:00"
}

================================================
FILE: data/benchmarks/cbnsl.json
================================================
{
  "benchmark_id": "cbnsl",
  "name": "CBNSL",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Curriculum Learning of Bayesian Network Structures (CBNSL) benchmark for evaluating algorithms that learn Bayesian network structures from data using curriculum learning techniques. The benchmark uses networks from the bnlearn repository and evaluates structure learning performance using BDeu scoring metrics.",
  "paper_link": "http://proceedings.mlr.press/v45/Zhao15a.pdf",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.590999+00:00",
  "updated_at": "2025-07-19T19:56:12.590999+00:00"
}

================================================
FILE: data/benchmarks/cc-ocr.json
================================================
{
  "benchmark_id": "cc-ocr",
  "name": "CC-OCR",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "text-to-image"],
  "modality": "multimodal",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "A comprehensive OCR benchmark for evaluating Large Multimodal Models (LMMs) in literacy. Comprises four OCR-centric tracks: multi-scene text reading, multilingual text reading, document parsing, and key information extraction. Contains 39 subsets with 7,058 fully annotated images, 41% sourced from real applications. Tests capabilities including text grounding, multi-orientation text recognition, and detecting hallucination/repetition across diverse visual challenges.",
  "paper_link": "https://arxiv.org/abs/2412.02210",
  "implementation_link": "https://github.com/AlibabaResearch/AdvancedLiterateMachinery",
  "verified": false,
  "created_at": "2025-07-19T19:56:14.652986+00:00",
  "updated_at": "2025-07-19T19:56:14.652986+00:00"
}

================================================
FILE: data/benchmarks/cfeval.json
================================================
{
  "benchmark_id": "cfeval",
  "name": "CFEval",
  "parent_benchmark_id": null,
  "categories": ["code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 10000.0,
  "language": "en",
  "description": "CFEval benchmark for evaluating code generation and problem-solving capabilities",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-15T00:00:00.000000+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/charadessta.json
================================================
{
  "benchmark_id": "charadessta",
  "name": "CharadesSTA",
  "parent_benchmark_id": null,
  "categories": ["video", "language", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Charades-STA is a benchmark dataset for temporal activity localization via language queries, extending the Charades dataset with sentence temporal annotations. It contains 12,408 training and 3,720 testing segment-sentence pairs from videos with natural language descriptions and precise temporal boundaries for localizing activities based on language queries.",
  "paper_link": "https://arxiv.org/abs/1705.02101",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.760027+00:00",
  "updated_at": "2025-07-19T19:56:14.760027+00:00"
}


================================================
FILE: data/benchmarks/chartqa.json
================================================
{
  "benchmark_id": "chartqa",
  "name": "ChartQA",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "vision", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "ChartQA is a large-scale benchmark comprising 9.6K human-written questions and 23.1K questions generated from human-written chart summaries, designed to evaluate models' abilities in visual and logical reasoning over charts.",
  "paper_link": "https://arxiv.org/abs/2203.10244",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.783541+00:00",
  "updated_at": "2025-07-19T19:56:12.783541+00:00"
}


================================================
FILE: data/benchmarks/charxiv-d.json
================================================
{
  "benchmark_id": "charxiv-d",
  "name": "CharXiv-D",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "vision", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "CharXiv-D is the descriptive questions subset of the CharXiv benchmark, designed to assess multimodal large language models' ability to extract basic information from scientific charts. It contains descriptive questions covering information extraction, enumeration, pattern recognition, and counting across 2,323 diverse charts from arXiv papers, all curated and verified by human experts.",
  "paper_link": "https://arxiv.org/abs/2406.18521",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.325204+00:00",
  "updated_at": "2025-07-19T19:56:15.325204+00:00"
}


================================================
FILE: data/benchmarks/charxiv-r.json
================================================
{
  "benchmark_id": "charxiv-r",
  "name": "CharXiv-R",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "vision", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "CharXiv-R is the reasoning component of the CharXiv benchmark, focusing on complex reasoning questions that require synthesizing information across visual chart elements. It evaluates multimodal large language models on their ability to understand and reason about scientific charts from arXiv papers through various reasoning tasks.",
  "paper_link": "https://arxiv.org/abs/2406.18521",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.191553+00:00",
  "updated_at": "2025-07-19T19:56:15.191553+00:00"
}


================================================
FILE: data/benchmarks/chexpert-cxr.json
================================================
{
  "benchmark_id": "chexpert-cxr",
  "name": "CheXpert CXR",
  "parent_benchmark_id": null,
  "categories": ["healthcare", "vision"],
  "modality": "image",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "CheXpert is a large dataset of 224,316 chest radiographs from 65,240 patients for automated chest X-ray interpretation. The dataset includes uncertainty labels for 14 medical observations extracted from radiology reports. It serves as a benchmark for developing and evaluating automated chest radiograph interpretation models.",
  "paper_link": "https://arxiv.org/abs/1901.07031",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.021015+00:00",
  "updated_at": "2025-07-19T19:56:14.021015+00:00"
}


================================================
FILE: data/benchmarks/cluewsc.json
================================================
{
  "benchmark_id": "cluewsc",
  "name": "CLUEWSC",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "CLUEWSC2020 is the Chinese version of the Winograd Schema Challenge, part of the CLUE benchmark. It focuses on pronoun disambiguation and coreference resolution, requiring models to determine which noun a pronoun refers to in a sentence. The dataset contains 1,244 training samples and 304 development samples extracted from contemporary Chinese literature.",
  "paper_link": "https://arxiv.org/abs/2004.05986",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.233189+00:00",
  "updated_at": "2025-07-19T19:56:12.233189+00:00"
}


================================================
FILE: data/benchmarks/cmmlu.json
================================================
{
  "benchmark_id": "cmmlu",
  "name": "CMMLU",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning", "general"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "CMMLU (Chinese Massive Multitask Language Understanding) is a comprehensive Chinese benchmark that evaluates the knowledge and reasoning capabilities of large language models across 67 different subject topics. The benchmark covers natural sciences, social sciences, engineering, and humanities with multiple-choice questions ranging from basic to advanced professional levels.",
  "paper_link": "https://arxiv.org/abs/2306.09212",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.941108+00:00",
  "updated_at": "2025-07-19T19:56:14.941108+00:00"
}


================================================
FILE: data/benchmarks/cnmo-2024.json
================================================
{
  "benchmark_id": "cnmo-2024",
  "name": "CNMO 2024",
  "parent_benchmark_id": null,
  "categories": ["math"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "China Mathematical Olympiad 2024 - A challenging mathematics competition.",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-05T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/codeforces.json
================================================
{
  "benchmark_id": "codeforces",
  "name": "CodeForces",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 3000.0,
  "language": "en",
  "description": "A competitive programming benchmark using problems from the CodeForces platform. The benchmark evaluates code generation capabilities of LLMs on algorithmic problems with difficulty ratings ranging from 800 to 2400. Problems cover diverse algorithmic categories including dynamic programming, graph algorithms, data structures, and mathematical problems with standardized evaluation through direct platform submission.",
  "paper_link": "https://arxiv.org/abs/2501.01257",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.624663+00:00",
  "updated_at": "2025-07-19T19:56:14.624663+00:00"
}


================================================
FILE: data/benchmarks/codegolf-v2.2.json
================================================
{
  "benchmark_id": "codegolf-v2.2",
  "name": "Codegolf v2.2",
  "parent_benchmark_id": null,
  "categories": ["code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Codegolf v2.2 benchmark",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.778275+00:00",
  "updated_at": "2025-07-19T19:56:13.778275+00:00"
}

================================================
FILE: data/benchmarks/collie.json
================================================
{
  "benchmark_id": "collie",
  "name": "COLLIE",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning", "writing"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "COLLIE is a grammar-based framework for systematic construction of constrained text generation tasks. It allows specification of rich, compositional constraints across diverse generation levels and modeling challenges including language understanding, logical reasoning, and semantic planning. The COLLIE-v1 dataset contains 2,080 instances across 13 constraint structures.",
  "paper_link": "https://arxiv.org/abs/2307.08689",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.250323+00:00",
  "updated_at": "2025-07-19T19:56:15.250323+00:00"
}


================================================
FILE: data/benchmarks/common-voice-15.json
================================================
{
  "benchmark_id": "common-voice-15",
  "name": "Common Voice 15",
  "parent_benchmark_id": null,
  "categories": ["audio", "speech-to-text", "language"],
  "modality": "audio",
  "multilingual": true,
  "max_score": 100.0,
  "language": "en",
  "description": "Common Voice is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Version 15.0 contains 28,750 recorded hours across 114 languages, consisting of crowdsourced voice recordings with corresponding transcriptions.",
  "paper_link": "https://arxiv.org/abs/1912.06670",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.830793+00:00",
  "updated_at": "2025-07-19T19:56:14.830793+00:00"
}


================================================
FILE: data/benchmarks/commonsenseqa.json
================================================
{
  "benchmark_id": "commonsenseqa",
  "name": "CommonSenseQA",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "language"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "CommonSenseQA is a multiple-choice question answering dataset that requires different types of commonsense knowledge to predict correct answers. It contains 12,102 questions with one correct answer and four distractors, designed to test semantic reasoning and conceptual relationships. Questions are created based on ConceptNet concepts and require prior world knowledge for accurate reasoning.",
  "paper_link": "https://arxiv.org/abs/1811.00937",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.129679+00:00",
  "updated_at": "2025-07-19T19:56:15.129679+00:00"
}


================================================
FILE: data/benchmarks/complexfuncbench.json
================================================
{
  "benchmark_id": "complexfuncbench",
  "name": "ComplexFuncBench",
  "parent_benchmark_id": null,
  "categories": ["long_context", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "ComplexFuncBench is a benchmark designed to evaluate large language models' capabilities in handling complex function calling scenarios. It encompasses multi-step and constrained function calling tasks that require long-parameter filling, parameter value reasoning, and managing contexts up to 128k tokens. The benchmark includes 1,000 samples across five real-world scenarios.",
  "paper_link": "https://arxiv.org/abs/2501.10132",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.336577+00:00",
  "updated_at": "2025-07-19T19:56:15.336577+00:00"
}


================================================
FILE: data/benchmarks/covost2-en-zh.json
================================================
{
  "benchmark_id": "covost2-en-zh",
  "name": "CoVoST2 en-zh",
  "parent_benchmark_id": null,
  "categories": ["audio", "speech-to-text", "language"],
  "modality": "audio",
  "multilingual": true,
  "max_score": 100.0,
  "language": "en",
  "description": "CoVoST 2 English-to-Chinese subset is part of the large-scale multilingual speech translation corpus derived from Common Voice. This subset focuses specifically on English to Chinese speech translation tasks within the broader CoVoST 2 dataset.",
  "paper_link": "https://arxiv.org/abs/2007.10310",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.825578+00:00",
  "updated_at": "2025-07-19T19:56:14.825578+00:00"
}


================================================
FILE: data/benchmarks/covost2.json
================================================
{
  "benchmark_id": "covost2",
  "name": "CoVoST2",
  "parent_benchmark_id": null,
  "categories": ["audio", "speech-to-text", "language"],
  "modality": "audio",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "CoVoST 2 is a large-scale multilingual speech translation corpus derived from Common Voice, covering translations from 21 languages into English and from English into 15 languages. The dataset contains 2,880 hours of speech with 78K speakers for speech translation research.",
  "paper_link": "https://arxiv.org/abs/2007.10310",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.958237+00:00",
  "updated_at": "2025-07-19T19:56:13.958237+00:00"
}


================================================
FILE: data/benchmarks/crag.json
================================================
{
  "benchmark_id": "crag",
  "name": "CRAG",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "search"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "CRAG (Comprehensive RAG Benchmark) is a factual question answering benchmark consisting of 4,409 question-answer pairs across 5 domains (finance, sports, music, movie, open domain) and 8 question categories. The benchmark includes mock APIs to simulate web and Knowledge Graph search, designed to represent the diverse and dynamic nature of real-world QA tasks with temporal dynamism ranging from years to seconds. It evaluates retrieval-augmented generation systems for trustworthy question answering.",
  "paper_link": "https://arxiv.org/abs/2406.04744",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.741280+00:00",
  "updated_at": "2025-07-19T19:56:12.741280+00:00"
}

================================================
FILE: data/benchmarks/creative-writing-v3.json
================================================
{
  "benchmark_id": "creative-writing-v3",
  "name": "Creative Writing v3",
  "parent_benchmark_id": null,
  "categories": ["creativity", "writing"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "EQ-Bench Creative Writing v3 is an LLM-judged creative writing benchmark that evaluates models across 32 writing prompts with 3 iterations per prompt. Uses a hybrid scoring system combining rubric assessment and Elo ratings through pairwise comparisons. Challenges models in areas like humor, romance, spatial awareness, and unique perspectives to assess emotional intelligence and creative writing capabilities.",
  "paper_link": "https://arxiv.org/abs/2312.06281",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-08-03T22:06:11.157942+00:00",
  "updated_at": "2025-08-03T22:06:11.157942+00:00"
}

================================================
FILE: data/benchmarks/crperelation.json
================================================
{
  "benchmark_id": "crperelation",
  "name": "CRPErelation",
  "parent_benchmark_id": null,
  "categories": ["healthcare", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Clinical reasoning problems evaluation benchmark for assessing diagnostic reasoning and medical knowledge application capabilities.",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.834739+00:00",
  "updated_at": "2025-07-19T19:56:14.834739+00:00"
}

================================================
FILE: data/benchmarks/crux-o.json
================================================
{
  "benchmark_id": "crux-o",
  "name": "CRUX-O",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 100.0,
  "language": "en",
  "description": "CRUXEval-O (output prediction) is part of the CRUXEval benchmark consisting of 800 Python functions (3-13 lines) designed to evaluate AI models' capabilities in code reasoning, understanding, and execution. The benchmark tests models' ability to predict correct function outputs given function code and inputs, focusing on short problems that a good human programmer should be able to solve in a minute.",
  "paper_link": "https://arxiv.org/abs/2401.03065",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.635245+00:00",
  "updated_at": "2025-07-19T19:56:14.635245+00:00"
}

================================================
FILE: data/benchmarks/cruxeval-input-cot.json
================================================
{
  "benchmark_id": "cruxeval-input-cot",
  "name": "CRUXEval-Input-CoT",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "CRUXEval input prediction task with Chain of Thought (CoT) prompting. Part of the CRUXEval benchmark for code reasoning, understanding, and execution evaluation. Given a Python function and its expected output, the task is to predict the appropriate input using chain-of-thought reasoning. Consists of 800 Python functions (3-13 lines) designed to evaluate code comprehension and reasoning capabilities.",
  "paper_link": "https://arxiv.org/abs/2401.03065",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.551746+00:00",
  "updated_at": "2025-07-19T19:56:14.551746+00:00"
}

================================================
FILE: data/benchmarks/cruxeval-o.json
================================================
{
  "benchmark_id": "cruxeval-o",
  "name": "CruxEval-O",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "CruxEval-O is the output prediction task of the CRUXEval benchmark, designed to evaluate code reasoning, understanding, and execution capabilities. It consists of 800 Python functions (3-13 lines) where models must predict the output given a function and input. The benchmark tests fundamental code execution reasoning abilities and goes beyond simple code generation to assess deeper understanding of program behavior.",
  "paper_link": "https://arxiv.org/abs/2401.03065",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.146592+00:00",
  "updated_at": "2025-07-19T19:56:15.146592+00:00"
}

================================================
FILE: data/benchmarks/cruxeval-output-cot.json
================================================
{
  "benchmark_id": "cruxeval-output-cot",
  "name": "CRUXEval-Output-CoT",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "CRUXEval-O (output prediction) with Chain-of-Thought prompting. Part of the CRUXEval benchmark consisting of 800 Python functions (3-13 lines) designed to evaluate code reasoning, understanding, and execution capabilities. The output prediction task requires models to predict the output of a given Python function with specific inputs, evaluated using chain-of-thought reasoning methodology.",
  "paper_link": "https://arxiv.org/abs/2401.03065",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.555432+00:00",
  "updated_at": "2025-07-19T19:56:14.555432+00:00"
}

================================================
FILE: data/benchmarks/csimpleqa.json
================================================
{
  "benchmark_id": "csimpleqa",
  "name": "CSimpleQA",
  "parent_benchmark_id": null,
  "categories": ["general", "language"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "Chinese SimpleQA is the first comprehensive Chinese benchmark to evaluate the factuality ability of language models to answer short questions. It contains 3,000 high-quality questions spanning 6 major topics with 99 diverse subtopics, designed to assess Chinese factual knowledge across humanities, science, engineering, culture, and society.",
  "paper_link": "https://arxiv.org/abs/2411.07140",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:11.931358+00:00",
  "updated_at": "2025-07-19T19:56:11.931358+00:00"
}

================================================
FILE: data/benchmarks/cybersecurity-ctfs.json
================================================
{
  "benchmark_id": "cybersecurity-ctfs",
  "name": "Cybersecurity CTFs",
  "parent_benchmark_id": null,
  "categories": ["safety"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Cybersecurity Capture the Flag (CTF) benchmark for evaluating LLMs in offensive security challenges. Contains diverse cybersecurity tasks including cryptography, web exploitation, binary analysis, and forensics to assess AI capabilities in cybersecurity problem-solving.",
  "paper_link": "https://arxiv.org/abs/2406.05590",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.387055+00:00",
  "updated_at": "2025-07-19T19:56:15.387055+00:00"
}

================================================
FILE: data/benchmarks/dermmcqa.json
================================================
{
  "benchmark_id": "dermmcqa",
  "name": "DermMCQA",
  "parent_benchmark_id": null,
  "categories": ["healthcare"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Dermatology multiple choice question assessment benchmark for evaluating medical knowledge and diagnostic reasoning in dermatological conditions and treatments.",
  "paper_link": "https://arxiv.org/abs/2309.06961",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.024498+00:00",
  "updated_at": "2025-07-19T19:56:14.024498+00:00"
}

================================================
FILE: data/benchmarks/docvqa.json
================================================
{
  "benchmark_id": "docvqa",
  "name": "DocVQA",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A dataset for Visual Question Answering on document images containing 50,000 questions defined on 12,000+ document images. The benchmark tests AI's ability to understand document structure and content, requiring models to comprehend document layout and perform information retrieval to answer questions about document images.",
  "paper_link": "https://arxiv.org/abs/2007.00398",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.825214+00:00",
  "updated_at": "2025-07-19T19:56:12.825214+00:00"
}

================================================
FILE: data/benchmarks/docvqatest.json
================================================
{
  "benchmark_id": "docvqatest",
  "name": "DocVQAtest",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "DocVQA is a Visual Question Answering benchmark on document images containing 50,000 questions defined on 12,000+ document images. The benchmark focuses on understanding document structure and content to answer questions about various document types including letters, memos, notes, and reports from the UCSF Industry Documents Library.",
  "paper_link": "https://arxiv.org/abs/2007.00398",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.579372+00:00",
  "updated_at": "2025-07-19T19:56:14.579372+00:00"
}

================================================
FILE: data/benchmarks/drop.json
================================================
{
  "benchmark_id": "drop",
  "name": "DROP",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "math"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "DROP (Discrete Reasoning Over Paragraphs) is a reading comprehension benchmark requiring discrete reasoning over paragraph content. It contains crowdsourced, adversarially-created questions that require resolving references and performing discrete operations like addition, counting, or sorting, demanding comprehensive paragraph understanding beyond paraphrase-and-entity-typing shortcuts.",
  "paper_link": "https://arxiv.org/abs/1903.00161",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.981569+00:00",
  "updated_at": "2025-07-19T19:56:12.981569+00:00"
}

================================================
FILE: data/benchmarks/ds-arena-code.json
================================================
{
  "benchmark_id": "ds-arena-code",
  "name": "DS-Arena-Code",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Data Science Arena Code benchmark for evaluating LLMs on realistic data science code generation tasks. Tests capabilities in complex data processing, analysis, and programming across popular Python libraries used in data science workflows.",
  "paper_link": "https://arxiv.org/abs/2505.15621",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.057744+00:00",
  "updated_at": "2025-07-19T19:56:15.057744+00:00"
}

================================================
FILE: data/benchmarks/ds-fim-eval.json
================================================
{
  "benchmark_id": "ds-fim-eval",
  "name": "DS-FIM-Eval",
  "parent_benchmark_id": null,
  "categories": ["general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "DeepSeek's internal Fill-in-the-Middle evaluation dataset for measuring code completion performance improvements in data science contexts",
  "paper_link": "https://arxiv.org/abs/2406.11931",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.053854+00:00",
  "updated_at": "2025-07-19T19:56:15.053854+00:00"
}

================================================
FILE: data/benchmarks/eclektic.json
================================================
{
  "benchmark_id": "eclektic",
  "name": "ECLeKTic",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "A multilingual closed-book question answering dataset that evaluates cross-lingual knowledge transfer in large language models across 12 languages, using knowledge-seeking questions based on Wikipedia articles that exist only in one language",
  "paper_link": "https://arxiv.org/abs/2502.21228",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.561292+00:00",
  "updated_at": "2025-07-19T19:56:13.561292+00:00"
}

================================================
FILE: data/benchmarks/egoschema.json
================================================
{
  "benchmark_id": "egoschema",
  "name": "EgoSchema",
  "parent_benchmark_id": null,
  "categories": ["vision", "reasoning", "long_context"],
  "modality": "video",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A diagnostic benchmark for very long-form video language understanding consisting of over 5,000 human-curated multiple-choice questions based on 3-minute video clips from Ego4D, covering a broad range of natural human activities and behaviors",
  "paper_link": "https://arxiv.org/abs/2308.09126",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.915240+00:00",
  "updated_at": "2025-07-19T19:56:12.915240+00:00"
}

================================================
FILE: data/benchmarks/erqa.json
================================================
{
  "benchmark_id": "erqa",
  "name": "ERQA",
  "parent_benchmark_id": null,
  "categories": ["vision", "reasoning", "spatial_reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Embodied Reasoning Question Answering benchmark consisting of 400 multiple-choice visual questions across spatial reasoning, trajectory reasoning, action reasoning, state estimation, and multi-view reasoning for evaluating AI capabilities in physical world interactions",
  "paper_link": "https://arxiv.org/abs/2503.20020",
  "implementation_link": "https://github.com/embodiedreasoning/ERQA",
  "verified": false,
  "created_at": "2025-07-24T12:00:00.000000+00:00",
  "updated_at": "2025-07-24T12:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/evalplus.json
================================================
{
  "benchmark_id": "evalplus",
  "name": "EvalPlus",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 100.0,
  "language": "en",
  "description": "A rigorous code synthesis evaluation framework that augments existing datasets with extensive test cases generated by LLM- and mutation-based strategies to better assess functional correctness of generated code, including HumanEval+ with 80x more test cases",
  "paper_link": "https://arxiv.org/abs/2305.01210",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:11.793176+00:00",
  "updated_at": "2025-07-19T19:56:11.793176+00:00"
}


================================================
FILE: data/benchmarks/facts-grounding.json
================================================
{
  "benchmark_id": "facts-grounding",
  "name": "FACTS Grounding",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A benchmark evaluating language models' ability to generate factually accurate and well-grounded responses based on long-form input context, comprising 1,719 examples with documents up to 32k tokens requiring detailed responses that are fully grounded in provided documents",
  "paper_link": "https://arxiv.org/abs/2501.03200",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.260285+00:00",
  "updated_at": "2025-07-19T19:56:13.260285+00:00"
}

================================================
FILE: data/benchmarks/factscore.json
================================================
{
  "benchmark_id": "factscore",
  "name": "FActScore",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A fine-grained atomic evaluation metric for factual precision in long-form text generation that breaks generated text into atomic facts and computes the percentage supported by reliable knowledge sources, with automated assessment using retrieval and language models",
  "paper_link": "https://arxiv.org/abs/2305.14251",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-24T12:00:00.000000+00:00",
  "updated_at": "2025-07-24T12:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/finqa.json
================================================
{
  "benchmark_id": "finqa",
  "name": "FinQA",
  "parent_benchmark_id": null,
  "categories": ["finance", "math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A large-scale dataset for numerical reasoning over financial data with question-answering pairs written by financial experts, featuring complex numerical reasoning and understanding of heterogeneous representations with annotated gold reasoning programs for full explainability",
  "paper_link": "https://arxiv.org/abs/2109.00122",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.734486+00:00",
  "updated_at": "2025-07-19T19:56:12.734486+00:00"
}

================================================
FILE: data/benchmarks/flenqa.json
================================================
{
  "benchmark_id": "flenqa",
  "name": "FlenQA",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "long_context"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Flexible Length Question Answering dataset for evaluating the impact of input length on reasoning performance of language models, featuring True/False questions embedded in contexts of varying lengths (250-3000 tokens) across three reasoning tasks: Monotone Relations, People In Rooms, and simplified Ruletaker",
  "paper_link": "https://arxiv.org/abs/2402.14848",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.277205+00:00",
  "updated_at": "2025-07-19T19:56:14.277205+00:00"
}

================================================
FILE: data/benchmarks/fleurs.json
================================================
{
  "benchmark_id": "fleurs",
  "name": "FLEURS",
  "parent_benchmark_id": null,
  "categories": ["language", "speech-to-text"],
  "modality": "audio",
  "multilingual": true,
  "max_score": 100.0,
  "language": "en",
  "description": "Few-shot Learning Evaluation of Universal Representations of Speech - a parallel speech dataset in 102 languages built on FLoRes-101 with approximately 12 hours of speech supervision per language for tasks including ASR, speech language identification, translation and retrieval",
  "paper_link": "https://arxiv.org/abs/2205.12446",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.943695+00:00",
  "updated_at": "2025-07-19T19:56:13.943695+00:00"
}

================================================
FILE: data/benchmarks/frames.json
================================================
{
  "benchmark_id": "frames",
  "name": "FRAMES",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "search"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Factuality, Retrieval, And reasoning MEasurement Set - a unified evaluation dataset of 824 challenging multi-hop questions for testing retrieval-augmented generation systems across factuality, retrieval accuracy, and reasoning capabilities, requiring integration of 2-15 Wikipedia articles per question",
  "paper_link": "https://arxiv.org/abs/2409.12941",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.954436+00:00",
  "updated_at": "2025-07-19T19:56:14.954436+00:00"
}

================================================
FILE: data/benchmarks/french-mmlu.json
================================================
{
  "benchmark_id": "french-mmlu",
  "name": "French MMLU",
  "parent_benchmark_id": null,
  "categories": ["general", "language", "reasoning"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "French version of MMLU-Pro, a multilingual benchmark for evaluating language models' cross-lingual reasoning capabilities across 14 diverse domains including mathematics, physics, chemistry, law, engineering, psychology, and health.",
  "paper_link": "https://arxiv.org/abs/2503.10497",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.134340+00:00",
  "updated_at": "2025-07-19T19:56:15.134340+00:00"
}

================================================
FILE: data/benchmarks/frontiermath.json
================================================
{
  "benchmark_id": "frontiermath",
  "name": "FrontierMath",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A benchmark of hundreds of original, exceptionally challenging mathematics problems crafted and vetted by expert mathematicians, covering most major branches of modern mathematics from number theory and real analysis to algebraic geometry and category theory.",
  "paper_link": "https://arxiv.org/abs/2411.04872",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.179213+00:00",
  "updated_at": "2025-07-19T19:56:15.179213+00:00"
}

================================================
FILE: data/benchmarks/functionalmath.json
================================================
{
  "benchmark_id": "functionalmath",
  "name": "FunctionalMATH",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A functional variant of the MATH benchmark that tests language models' ability to generalize reasoning patterns across different problem instances, revealing the reasoning gap between static and functional performance.",
  "paper_link": "https://arxiv.org/abs/2402.19450",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.987516+00:00",
  "updated_at": "2025-07-19T19:56:13.987516+00:00"
}

================================================
FILE: data/benchmarks/giantsteps-tempo.json
================================================
{
  "benchmark_id": "giantsteps-tempo",
  "name": "GiantSteps Tempo",
  "parent_benchmark_id": null,
  "categories": ["audio"],
  "modality": "audio",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A dataset for tempo estimation in electronic dance music containing 664 2-minute audio previews from Beatport, annotated from user corrections for evaluating automatic tempo estimation algorithms.",
  "paper_link": "https://archives.ismir.net/ismir2015/paper/000246.pdf",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.838584+00:00",
  "updated_at": "2025-07-19T19:56:14.838584+00:00"
}

================================================
FILE: data/benchmarks/global-mmlu-lite.json
================================================
{
  "benchmark_id": "global-mmlu-lite",
  "name": "Global-MMLU-Lite",
  "parent_benchmark_id": null,
  "categories": ["general", "language", "reasoning"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "A lightweight version of Global MMLU benchmark that evaluates language models across multiple languages while addressing cultural and linguistic biases in multilingual evaluation.",
  "paper_link": "https://arxiv.org/abs/2412.03304",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.534515+00:00",
  "updated_at": "2025-07-19T19:56:13.534515+00:00"
}

================================================
FILE: data/benchmarks/global-mmlu.json
================================================
{
  "benchmark_id": "global-mmlu",
  "name": "Global-MMLU",
  "parent_benchmark_id": null,
  "categories": ["general", "language", "reasoning"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "A comprehensive multilingual benchmark covering 42 languages that addresses cultural and linguistic biases in evaluation, with improved translation quality and culturally sensitive question subsets.",
  "paper_link": "https://arxiv.org/abs/2412.03304",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.747524+00:00",
  "updated_at": "2025-07-19T19:56:13.747524+00:00"
}

================================================
FILE: data/benchmarks/gorilla-benchmark-api-bench.json
================================================
{
  "benchmark_id": "gorilla-benchmark-api-bench",
  "name": "Gorilla Benchmark API Bench",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "APIBench, a comprehensive dataset of over 11,000 instruction-API pairs from HuggingFace, TorchHub, and TensorHub APIs for evaluating language models' ability to generate accurate API calls.",
  "paper_link": "https://arxiv.org/abs/2305.15334",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.383584+00:00",
  "updated_at": "2025-07-19T19:56:14.383584+00:00"
}


================================================
FILE: data/benchmarks/govreport.json
================================================
{
  "benchmark_id": "govreport",
  "name": "GovReport",
  "parent_benchmark_id": null,
  "categories": ["summarization", "long_context"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A long document summarization dataset consisting of reports from government research agencies including Congressional Research Service and U.S. Government Accountability Office, with significantly longer documents and summaries than other datasets.",
  "paper_link": "https://arxiv.org/abs/2104.02112",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.218809+00:00",
  "updated_at": "2025-07-19T19:56:14.218809+00:00"
}

================================================
FILE: data/benchmarks/gpqa-biology.json
================================================
{
  "benchmark_id": "gpqa-biology",
  "name": "GPQA Biology",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Biology subset of GPQA, containing challenging multiple-choice questions written by domain experts in biology. These Google-proof questions require graduate-level knowledge and reasoning.",
  "paper_link": "https://arxiv.org/abs/2311.12022",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.391187+00:00",
  "updated_at": "2025-07-19T19:56:15.391187+00:00"
}

================================================
FILE: data/benchmarks/gpqa-chemistry.json
================================================
{
  "benchmark_id": "gpqa-chemistry",
  "name": "GPQA Chemistry",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "chemistry"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Chemistry subset of GPQA, containing challenging multiple-choice questions written by domain experts in chemistry. These Google-proof questions require graduate-level knowledge and reasoning.",
  "paper_link": "https://arxiv.org/abs/2311.12022",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.395806+00:00",
  "updated_at": "2025-07-19T19:56:15.395806+00:00"
}

================================================
FILE: data/benchmarks/gpqa-physics.json
================================================
{
  "benchmark_id": "gpqa-physics",
  "name": "GPQA Physics",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "physics"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Physics subset of GPQA, containing challenging multiple-choice questions written by domain experts in physics. These Google-proof questions require graduate-level knowledge and reasoning.",
  "paper_link": "https://arxiv.org/abs/2311.12022",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.400663+00:00",
  "updated_at": "2025-07-19T19:56:15.400663+00:00"
}

================================================
FILE: data/benchmarks/gpqa.json
================================================
{
  "benchmark_id": "gpqa",
  "name": "GPQA",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry. Questions are Google-proof and extremely difficult, with PhD experts reaching 65% accuracy.",
  "paper_link": "https://arxiv.org/abs/2311.12022",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:11.588605+00:00",
  "updated_at": "2025-07-19T19:56:11.588605+00:00"
}

================================================
FILE: data/benchmarks/graphwalks-bfs-%3C128k.json
================================================
{
  "benchmark_id": "graphwalks-bfs-<128k",
  "name": "Graphwalks BFS <128k",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "spatial_reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A graph reasoning benchmark that evaluates language models' ability to perform breadth-first search (BFS) operations on graphs with context length under 128k tokens, returning nodes reachable at specified depths.",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.287324+00:00",
  "updated_at": "2025-07-19T19:56:15.287324+00:00"
}

================================================
FILE: data/benchmarks/graphwalks-bfs-%3E128k.json
================================================
{
  "benchmark_id": "graphwalks-bfs->128k",
  "name": "Graphwalks BFS >128k",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "spatial_reasoning", "long_context"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A graph reasoning benchmark that evaluates language models' ability to perform breadth-first search (BFS) operations on graphs with context length over 128k tokens, testing long-context reasoning capabilities.",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.295876+00:00",
  "updated_at": "2025-07-19T19:56:15.295876+00:00"
}

================================================
FILE: data/benchmarks/graphwalks-parents-%3C128k.json
================================================
{
  "benchmark_id": "graphwalks-parents-<128k",
  "name": "Graphwalks parents <128k",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "spatial_reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A graph reasoning benchmark that evaluates language models' ability to find parent nodes in graphs with context length under 128k tokens, requiring understanding of graph structure and edge relationships.",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.303643+00:00",
  "updated_at": "2025-07-19T19:56:15.303643+00:00"
}

================================================
FILE: data/benchmarks/graphwalks-parents-%3E128k.json
================================================
{
  "benchmark_id": "graphwalks-parents->128k",
  "name": "Graphwalks parents >128k",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "spatial_reasoning", "long_context"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A graph reasoning benchmark that evaluates language models' ability to find parent nodes in graphs with context length over 128k tokens, testing long-context reasoning and graph structure understanding.",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.316836+00:00",
  "updated_at": "2025-07-19T19:56:15.316836+00:00"
}

================================================
FILE: data/benchmarks/groundui-1k.json
================================================
{
  "benchmark_id": "groundui-1k",
  "name": "GroundUI-1K",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "vision"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A subset of GroundUI-18K for UI grounding evaluation, where models must predict action coordinates on screenshots based on single-step instructions across web, desktop, and mobile platforms.",
  "paper_link": "https://arxiv.org/abs/2403.17918",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.758595+00:00",
  "updated_at": "2025-07-19T19:56:12.758595+00:00"
}

================================================
FILE: data/benchmarks/gsm-8k-(cot).json
================================================
{
  "benchmark_id": "gsm-8k-(cot)",
  "name": "GSM-8K (CoT)",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Grade School Math 8K with Chain-of-Thought prompting, featuring 8.5K high-quality linguistically diverse grade school math word problems requiring multi-step reasoning and elementary arithmetic operations.",
  "paper_link": "https://arxiv.org/abs/2110.14168",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.360381+00:00",
  "updated_at": "2025-07-19T19:56:14.360381+00:00"
}

================================================
FILE: data/benchmarks/gsm8k-chat.json
================================================
{
  "benchmark_id": "gsm8k-chat",
  "name": "GSM8K Chat",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Grade School Math 8K adapted for chat format evaluation, featuring 8.5K high-quality linguistically diverse grade school math word problems requiring multi-step reasoning and elementary arithmetic operations.",
  "paper_link": "https://arxiv.org/abs/2110.14168",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.101578+00:00",
  "updated_at": "2025-07-19T19:56:15.101578+00:00"
}

================================================
FILE: data/benchmarks/gsm8k.json
================================================
{
  "benchmark_id": "gsm8k",
  "name": "GSM8k",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Grade School Math 8K, a dataset of 8.5K high-quality linguistically diverse grade school math word problems requiring multi-step reasoning and elementary arithmetic operations.",
  "paper_link": "https://arxiv.org/abs/2110.14168",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:11.397385+00:00",
  "updated_at": "2025-07-19T19:56:11.397385+00:00"
}

================================================
FILE: data/benchmarks/hallusion-bench.json
================================================
{
  "benchmark_id": "hallusion-bench",
  "name": "Hallusion Bench",
  "parent_benchmark_id": null,
  "categories": ["vision", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A comprehensive benchmark designed to evaluate image-context reasoning in large visual-language models (LVLMs) by challenging models with 346 images and 1,129 carefully crafted questions to assess language hallucination and visual illusion",
  "paper_link": "https://arxiv.org/abs/2310.14566",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.689507+00:00",
  "updated_at": "2025-07-19T19:56:14.689507+00:00"
}

================================================
FILE: data/benchmarks/healthbench-hard.json
================================================
{
  "benchmark_id": "healthbench-hard",
  "name": "HealthBench Hard",
  "parent_benchmark_id": null,
  "categories": ["healthcare"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A challenging variation of HealthBench that evaluates large language models' performance and safety in healthcare through 5,000 multi-turn conversations with particularly rigorous evaluation criteria validated by 262 physicians from 60 countries",
  "paper_link": "https://arxiv.org/abs/2505.08775",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-08-05T19:56:13.424873+00:00",
  "updated_at": "2025-08-05T19:56:13.424873+00:00"
}


================================================
FILE: data/benchmarks/healthbench.json
================================================
{
  "benchmark_id": "healthbench",
  "name": "HealthBench",
  "parent_benchmark_id": null,
  "categories": ["healthcare"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "An open-source benchmark for measuring performance and safety of large language models in healthcare, consisting of 5,000 multi-turn conversations evaluated by 262 physicians using 48,562 unique rubric criteria across health contexts and behavioral dimensions",
  "paper_link": "https://arxiv.org/abs/2505.08775",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-08-05T19:56:13.424873+00:00",
  "updated_at": "2025-08-05T19:56:13.424873+00:00"
}


================================================
FILE: data/benchmarks/hellaswag.json
================================================
{
  "benchmark_id": "hellaswag",
  "name": "HellaSwag",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A challenging commonsense natural language inference dataset that uses Adversarial Filtering to create questions trivial for humans (>95% accuracy) but difficult for state-of-the-art models, requiring completion of sentence endings based on physical situations and everyday activities",
  "paper_link": "https://arxiv.org/abs/1905.07830",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:11.145630+00:00",
  "updated_at": "2025-07-19T19:56:11.145630+00:00"
}

================================================
FILE: data/benchmarks/hiddenmath.json
================================================
{
  "benchmark_id": "hiddenmath",
  "name": "HiddenMath",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Google DeepMind's internal mathematical reasoning benchmark that introduces novel problems not encountered during model training to evaluate true mathematical reasoning capabilities rather than memorization",
  "paper_link": "https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.424873+00:00",
  "updated_at": "2025-07-19T19:56:13.424873+00:00"
}

================================================
FILE: data/benchmarks/hle.json
================================================
{
  "benchmark_id": "hle",
  "name": "HLE",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "math"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Humanity's Last Exam (HLE) is a multi-modal academic benchmark with 2,500 questions across mathematics, humanities, and natural sciences, designed to test LLM capabilities at the frontier of human knowledge with unambiguous, verifiable solutions",
  "paper_link": "https://arxiv.org/abs/2501.14249",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-28T00:00:00.000000+00:00",
  "updated_at": "2025-07-28T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/hmmt-2025.json
================================================
{
  "benchmark_id": "hmmt-2025",
  "name": "HMMT 2025",
  "parent_benchmark_id": null,
  "categories": ["math"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Harvard-MIT Mathematics Tournament 2025 - A prestigious student-organized mathematics competition for high school students featuring two tournaments each year (November at Harvard and February at MIT) with individual tests, team rounds, and guts rounds",
  "paper_link": "http://web.mit.edu/HMMT/www/",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-05T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/hmmt25.json
================================================
{
  "benchmark_id": "hmmt25",
  "name": "HMMT25",
  "parent_benchmark_id": null,
  "categories": ["math"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Harvard-MIT Mathematics Tournament 2025 - A prestigious student-organized mathematics competition for high school students featuring two tournaments each year (November at Harvard and February at MIT) with individual tests, team rounds, and guts rounds",
  "paper_link": "http://web.mit.edu/HMMT/www/",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.061281+00:00",
  "updated_at": "2025-07-19T19:56:15.061281+00:00"
}

================================================
FILE: data/benchmarks/humaneval+.json
================================================
{
  "benchmark_id": "humaneval+",
  "name": "HumanEval+",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Enhanced version of HumanEval that extends the original test cases by 80x using the EvalPlus framework for rigorous evaluation of the functional correctness of LLM-synthesized code, catching incorrect code that the original tests failed to detect",
  "paper_link": "https://arxiv.org/abs/2305.01210",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.062352+00:00",
  "updated_at": "2025-07-19T19:56:14.062352+00:00"
}

================================================
FILE: data/benchmarks/humaneval-average.json
================================================
{
  "benchmark_id": "humaneval-average",
  "name": "HumanEval-Average",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A variant of the HumanEval benchmark that measures functional correctness for synthesizing programs from docstrings, consisting of 164 original programming problems assessing language comprehension, algorithms, and simple mathematics",
  "paper_link": "https://arxiv.org/abs/2107.03374",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.171175+00:00",
  "updated_at": "2025-07-19T19:56:15.171175+00:00"
}

================================================
FILE: data/benchmarks/humaneval-er.json
================================================
{
  "benchmark_id": "humaneval-er",
  "name": "HumanEval-ER",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A variant of the HumanEval benchmark that measures functional correctness for synthesizing programs from docstrings, consisting of 164 original programming problems assessing language comprehension, algorithms, and simple mathematics",
  "paper_link": "https://arxiv.org/abs/2107.03374",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.704744+00:00",
  "updated_at": "2025-07-19T19:56:12.704744+00:00"
}

================================================
FILE: data/benchmarks/humaneval-mul.json
================================================
{
  "benchmark_id": "humaneval-mul",
  "name": "HumanEval-Mul",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "A multilingual variant of the HumanEval benchmark that measures functional correctness for synthesizing programs from docstrings, consisting of 164 original programming problems assessing language comprehension, algorithms, and simple mathematics",
  "paper_link": "https://arxiv.org/abs/2107.03374",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.032472+00:00",
  "updated_at": "2025-07-19T19:56:15.032472+00:00"
}

================================================
FILE: data/benchmarks/humaneval-plus.json
================================================
{
  "benchmark_id": "humaneval-plus",
  "name": "HumanEval Plus",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Enhanced version of HumanEval that extends the original test cases by 80x using the EvalPlus framework for rigorous evaluation of the functional correctness of LLM-synthesized code, catching incorrect code that the original tests failed to detect",
  "paper_link": "https://arxiv.org/abs/2305.01210",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-08-03T22:06:10.921756+00:00",
  "updated_at": "2025-08-03T22:06:10.921756+00:00"
}


================================================
FILE: data/benchmarks/humaneval.json
================================================
{
  "benchmark_id": "humaneval",
  "name": "HumanEval",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A benchmark that measures functional correctness for synthesizing programs from docstrings, consisting of 164 original programming problems assessing language comprehension, algorithms, and simple mathematics",
  "paper_link": "https://arxiv.org/abs/2107.03374",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.595263+00:00",
  "updated_at": "2025-07-19T19:56:12.595263+00:00"
}


================================================
FILE: data/benchmarks/humanevalfim-average.json
================================================
{
  "benchmark_id": "humanevalfim-average",
  "name": "HumanEvalFIM-Average",
  "parent_benchmark_id": null,
  "categories": ["general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Average evaluation of HumanEval Fill-in-the-Middle benchmark variants (single-line, multi-line, random-span) for assessing code infilling capabilities of language models",
  "paper_link": "https://arxiv.org/abs/2207.14255",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.160562+00:00",
  "updated_at": "2025-07-19T19:56:15.160562+00:00"
}

================================================
FILE: data/benchmarks/humanity's-last-exam.json
================================================
{
  "benchmark_id": "humanity's-last-exam",
  "name": "Humanity's Last Exam",
  "parent_benchmark_id": null,
  "categories": ["general"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A multi-modal benchmark at the frontier of human knowledge with 2,500 questions across dozens of subjects including mathematics, humanities, and natural sciences, created by nearly 1,000 subject-expert contributors from over 500 institutions",
  "paper_link": "https://arxiv.org/abs/2501.14249",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.507693+00:00",
  "updated_at": "2025-07-19T19:56:12.507693+00:00"
}

================================================
FILE: data/benchmarks/if.json
================================================
{
  "benchmark_id": "if",
  "name": "IF",
  "parent_benchmark_id": null,
  "categories": ["general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Instruction-Following Evaluation (IFEval) benchmark for large language models, focusing on verifiable instructions with 25 types of instructions and around 500 prompts containing one or more verifiable constraints",
  "paper_link": "https://arxiv.org/abs/2311.07911",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-08-03T22:06:11.089394+00:00",
  "updated_at": "2025-08-03T22:06:11.089394+00:00"
}

================================================
FILE: data/benchmarks/ifeval.json
================================================
{
  "benchmark_id": "ifeval",
  "name": "IFEval",
  "parent_benchmark_id": null,
  "categories": ["general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Instruction-Following Evaluation (IFEval) benchmark for large language models, focusing on verifiable instructions with 25 types of instructions and around 500 prompts containing one or more verifiable constraints",
  "paper_link": "https://arxiv.org/abs/2311.07911",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.241350+00:00",
  "updated_at": "2025-07-19T19:56:12.241350+00:00"
}

================================================
FILE: data/benchmarks/include.json
================================================
{
  "benchmark_id": "include",
  "name": "Include",
  "parent_benchmark_id": null,
  "categories": ["general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Include benchmark - specific documentation not found in official sources",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.724387+00:00",
  "updated_at": "2025-07-19T19:56:13.724387+00:00"
}

================================================
FILE: data/benchmarks/infinitebench-en.mc.json
================================================
{
  "benchmark_id": "infinitebench-en.mc",
  "name": "InfiniteBench/En.MC",
  "parent_benchmark_id": null,
  "categories": ["long_context"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "InfiniteBench English Multiple Choice (En.MC) variant - InfiniteBench is the first LLM benchmark with an average data length surpassing 100K tokens, evaluating long-context capabilities with 12 tasks spanning diverse domains",
  "paper_link": "https://arxiv.org/abs/2402.13718",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.461508+00:00",
  "updated_at": "2025-07-19T19:56:14.461508+00:00"
}

================================================
FILE: data/benchmarks/infinitebench-en.qa.json
================================================
{
  "benchmark_id": "infinitebench-en.qa",
  "name": "InfiniteBench/En.QA",
  "parent_benchmark_id": null,
  "categories": ["long_context"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "InfiniteBench English Question Answering (En.QA) variant - InfiniteBench is the first LLM benchmark with an average data length surpassing 100K tokens, evaluating long-context capabilities with 12 tasks spanning diverse domains",
  "paper_link": "https://arxiv.org/abs/2402.13718",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.457927+00:00",
  "updated_at": "2025-07-19T19:56:14.457927+00:00"
}

================================================
FILE: data/benchmarks/infographicsqa.json
================================================
{
  "benchmark_id": "infographicsqa",
  "name": "InfographicsQA",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "InfographicVQA dataset with 5,485 infographic images and over 30,000 questions requiring joint reasoning over document layout, textual content, graphical elements, and data visualizations with elementary reasoning and arithmetic skills",
  "paper_link": "https://arxiv.org/abs/2104.12756",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.417669+00:00",
  "updated_at": "2025-07-19T19:56:14.417669+00:00"
}

================================================
FILE: data/benchmarks/infovqa.json
================================================
{
  "benchmark_id": "infovqa",
  "name": "InfoVQA",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "InfoVQA dataset with 30,000 questions and 5,000 infographic images requiring joint reasoning over document layout, textual content, graphical elements, and data visualizations with elementary reasoning and arithmetic skills",
  "paper_link": "https://arxiv.org/abs/2104.12756",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.601294+00:00",
  "updated_at": "2025-07-19T19:56:13.601294+00:00"
}

================================================
FILE: data/benchmarks/infovqatest.json
================================================
{
  "benchmark_id": "infovqatest",
  "name": "InfoVQAtest",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "InfoVQA test set with infographic images requiring joint reasoning over document layout, textual content, graphical elements, and data visualizations with elementary reasoning and arithmetic skills",
  "paper_link": "https://arxiv.org/abs/2104.12756",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.583939+00:00",
  "updated_at": "2025-07-19T19:56:14.583939+00:00"
}

================================================
FILE: data/benchmarks/instruct-humaneval.json
================================================
{
  "benchmark_id": "instruct-humaneval",
  "name": "Instruct HumanEval",
  "parent_benchmark_id": null,
  "categories": ["general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Instruction-based variant of the HumanEval benchmark that evaluates large language models' code generation capabilities, measuring functional correctness with the pass@k metric on programming problems",
  "paper_link": "https://arxiv.org/abs/2107.03374",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.105488+00:00",
  "updated_at": "2025-07-19T19:56:15.105488+00:00"
}

================================================
FILE: data/benchmarks/intergps.json
================================================
{
  "benchmark_id": "intergps",
  "name": "InterGPS",
  "parent_benchmark_id": null,
  "categories": ["math", "spatial_reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Interpretable Geometry Problem Solver (Inter-GPS) benchmark built on the Geometry3K dataset of 3,002 geometry problems densely annotated in formal language, solved using theorem knowledge and symbolic reasoning",
  "paper_link": "https://arxiv.org/abs/2105.04165",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.259321+00:00",
  "updated_at": "2025-07-19T19:56:14.259321+00:00"
}

================================================
FILE: data/benchmarks/internal-api-instruction-following-(hard).json
================================================
{
  "benchmark_id": "internal-api-instruction-following-(hard)",
  "name": "Internal API instruction following (hard)",
  "parent_benchmark_id": null,
  "categories": ["general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Internal API instruction following (hard) benchmark - specific documentation not found in official sources",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.222560+00:00",
  "updated_at": "2025-07-19T19:56:15.222560+00:00"
}

================================================
FILE: data/benchmarks/lbpp-(v2).json
================================================
{
  "benchmark_id": "lbpp-(v2)",
  "name": "LBPP (v2)",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "LBPP (v2) benchmark - specific documentation not found in official sources, possibly related to language-based planning problems",
  "paper_link": "https://arxiv.org/abs/2206.10498",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.053535+00:00",
  "updated_at": "2025-07-19T19:56:14.053535+00:00"
}

================================================
FILE: data/benchmarks/livebench-20241125.json
================================================
{
  "benchmark_id": "livebench-20241125",
  "name": "LiveBench 20241125",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "LiveBench is a challenging, contamination-limited LLM benchmark that addresses test set contamination by releasing new questions monthly based on recently-released datasets, arXiv papers, news articles, and IMDb movie synopses. It comprises tasks across math, coding, reasoning, language, instruction following, and data analysis with verifiable, objective ground-truth answers.",
  "paper_link": "https://arxiv.org/abs/2406.19314",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-08-03T22:06:11.046321+00:00",
  "updated_at": "2025-08-03T22:06:11.046321+00:00"
}

================================================
FILE: data/benchmarks/livebench.json
================================================
{
  "benchmark_id": "livebench",
  "name": "LiveBench",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "LiveBench is a challenging, contamination-limited LLM benchmark that addresses test set contamination by releasing new questions monthly based on recently-released datasets, arXiv papers, news articles, and IMDb movie synopses. It comprises tasks across math, coding, reasoning, language, instruction following, and data analysis with verifiable, objective ground-truth answers.",
  "paper_link": "https://arxiv.org/abs/2406.19314",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-05T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/livecodebench(01-09).json
================================================
{
  "benchmark_id": "livecodebench(01-09)",
  "name": "LiveCodeBench(01-09)",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.",
  "paper_link": "https://arxiv.org/abs/2403.07974",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.049594+00:00",
  "updated_at": "2025-07-19T19:56:15.049594+00:00"
}

================================================
FILE: data/benchmarks/livecodebench-v5-24.12-25.2.json
================================================
{
  "benchmark_id": "livecodebench-v5-24.12-25.2",
  "name": "LiveCodeBench v5 24.12-25.2",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.",
  "paper_link": "https://arxiv.org/abs/2403.07974",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.066180+00:00",
  "updated_at": "2025-07-19T19:56:12.066180+00:00"
}

================================================
FILE: data/benchmarks/livecodebench-v5.json
================================================
{
  "benchmark_id": "livecodebench-v5",
  "name": "LiveCodeBench v5",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.",
  "paper_link": "https://arxiv.org/abs/2403.07974",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.759330+00:00",
  "updated_at": "2025-07-19T19:56:13.759330+00:00"
}

================================================
FILE: data/benchmarks/livecodebench-v6.json
================================================
{
  "benchmark_id": "livecodebench-v6",
  "name": "LiveCodeBench v6",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.",
  "paper_link": "https://arxiv.org/abs/2403.07974",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:11.785682+00:00",
  "updated_at": "2025-07-19T19:56:11.785682+00:00"
}

================================================
FILE: data/benchmarks/livecodebench.json
================================================
{
  "benchmark_id": "livecodebench",
  "name": "LiveCodeBench",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.",
  "paper_link": "https://arxiv.org/abs/2403.07974",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.292229+00:00",
  "updated_at": "2025-07-19T19:56:13.292229+00:00"
}


================================================
FILE: data/benchmarks/longbench-v2.json
================================================
{
  "benchmark_id": "longbench-v2",
  "name": "LongBench v2",
  "parent_benchmark_id": null,
  "categories": ["long_context", "reasoning", "general"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "LongBench v2 is a benchmark designed to assess the ability of LLMs to handle long-context problems requiring deep understanding and reasoning across real-world multitasks. It consists of 503 challenging multiple-choice questions with contexts ranging from 8k to 2M words across six major task categories: single-document QA, multi-document QA, long in-context learning, long-dialogue history understanding, code repository understanding, and long structured data understanding.",
  "paper_link": "https://arxiv.org/abs/2412.15204",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.029281+00:00",
  "updated_at": "2025-07-19T19:56:15.029281+00:00"
}

================================================
FILE: data/benchmarks/longfact-concepts.json
================================================
{
  "benchmark_id": "longfact-concepts",
  "name": "LongFact Concepts",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "LongFact is a benchmark for evaluating long-form factuality in large language models. It comprises 2,280 fact-seeking prompts spanning 38 topics, designed to test a model's ability to generate accurate, long-form responses. The benchmark uses SAFE (Search-Augmented Factuality Evaluator) to evaluate factual accuracy.",
  "paper_link": "https://arxiv.org/abs/2403.18802",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-24T12:00:00.000000+00:00",
  "updated_at": "2025-07-24T12:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/longfact-objects.json
================================================
{
  "benchmark_id": "longfact-objects",
  "name": "LongFact Objects",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "LongFact is a benchmark for evaluating long-form factuality in large language models. It comprises 2,280 fact-seeking prompts spanning 38 topics, designed to test a model's ability to generate accurate, long-form responses. The benchmark uses SAFE (Search-Augmented Factuality Evaluator) to evaluate factual accuracy.",
  "paper_link": "https://arxiv.org/abs/2403.18802",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-24T12:00:00.000000+00:00",
  "updated_at": "2025-07-24T12:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/longvideobench.json
================================================
{
  "benchmark_id": "longvideobench",
  "name": "LongVideoBench",
  "parent_benchmark_id": null,
  "categories": ["vision", "long_context", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "LongVideoBench is a question-answering benchmark featuring video-language interleaved inputs up to an hour long. It includes 3,763 varying-length web-collected videos with subtitles across diverse themes and 6,678 human-annotated multiple-choice questions in 17 fine-grained categories for comprehensive evaluation of long-term multimodal understanding.",
  "paper_link": "https://arxiv.org/abs/2407.15754",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.730349+00:00",
  "updated_at": "2025-07-19T19:56:14.730349+00:00"
}

================================================
FILE: data/benchmarks/lsat.json
================================================
{
  "benchmark_id": "lsat",
  "name": "LSAT",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "legal", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "LSAT (Law School Admission Test) benchmark evaluating complex reasoning capabilities across three challenging tasks: analytical reasoning, logical reasoning, and reading comprehension. The LSAT measures skills considered essential for success in law school including critical thinking, reading comprehension of complex texts, and analysis of arguments.",
  "paper_link": "https://arxiv.org/abs/2108.00648",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.409871+00:00",
  "updated_at": "2025-07-19T19:56:15.409871+00:00"
}

================================================
FILE: data/benchmarks/lvbench.json
================================================
{
  "benchmark_id": "lvbench",
  "name": "LVBench",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "long_context"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "LVBench is an extreme long video understanding benchmark designed to evaluate multimodal models on videos up to two hours in duration. It contains 6 major categories and 21 subcategories, with videos averaging five times longer than existing datasets. The benchmark addresses applications requiring comprehension of extremely long videos.",
  "paper_link": "https://arxiv.org/abs/2406.08035",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.724041+00:00",
  "updated_at": "2025-07-19T19:56:12.724041+00:00"
}

================================================
FILE: data/benchmarks/math-(cot).json
================================================
{
  "benchmark_id": "math-(cot)",
  "name": "MATH (CoT)",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MATH dataset contains 12,500 challenging competition mathematics problems from AMC 10, AMC 12, AIME, and other mathematics competitions. Each problem includes full step-by-step solutions and spans multiple difficulty levels (1-5) across seven mathematical subjects. This variant uses Chain-of-Thought prompting to encourage step-by-step reasoning.",
  "paper_link": "https://arxiv.org/abs/2103.03874",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.366159+00:00",
  "updated_at": "2025-07-19T19:56:14.366159+00:00"
}

================================================
FILE: data/benchmarks/math-500.json
================================================
{
  "benchmark_id": "math-500",
  "name": "MATH-500",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MATH-500 is a subset of the MATH dataset containing 500 challenging competition mathematics problems from AMC 10, AMC 12, AIME, and other mathematics competitions. Each problem includes full step-by-step solutions and spans multiple difficulty levels across seven mathematical subjects including Prealgebra, Algebra, Number Theory, Counting and Probability, Geometry, Intermediate Algebra, and Precalculus.",
  "paper_link": "https://arxiv.org/abs/2103.03874",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.027850+00:00",
  "updated_at": "2025-07-19T19:56:12.027850+00:00"
}

================================================
FILE: data/benchmarks/math.json
================================================
{
  "benchmark_id": "math",
  "name": "MATH",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MATH dataset contains 12,500 challenging competition mathematics problems from AMC 10, AMC 12, AIME, and other mathematics competitions. Each problem includes full step-by-step solutions and spans multiple difficulty levels (1-5) across seven mathematical subjects including Prealgebra, Algebra, Number Theory, Counting and Probability, Geometry, Intermediate Algebra, and Precalculus.",
  "paper_link": "https://arxiv.org/abs/2103.03874",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:11.804258+00:00",
  "updated_at": "2025-07-19T19:56:11.804258+00:00"
}

================================================
FILE: data/benchmarks/mathvision.json
================================================
{
  "benchmark_id": "mathvision",
  "name": "MathVision",
  "parent_benchmark_id": null,
  "categories": ["math", "vision", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MATH-Vision is a dataset designed to measure multimodal mathematical reasoning capabilities. It focuses on evaluating how well models can solve mathematical problems that require both visual understanding and mathematical reasoning, bridging the gap between visual and mathematical domains.",
  "paper_link": "https://arxiv.org/abs/2402.14804",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.695583+00:00",
  "updated_at": "2025-07-19T19:56:14.695583+00:00"
}

================================================
FILE: data/benchmarks/mathvista-mini.json
================================================
{
  "benchmark_id": "mathvista-mini",
  "name": "MathVista-Mini",
  "parent_benchmark_id": null,
  "categories": ["math", "vision", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MathVista-Mini is a smaller version of the MathVista benchmark that evaluates mathematical reasoning in visual contexts. It consists of examples derived from multimodal datasets involving mathematics, combining challenges from diverse mathematical and visual tasks to assess foundation models' ability to solve problems requiring both visual understanding and mathematical reasoning.",
  "paper_link": "https://arxiv.org/abs/2310.02255",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.654470+00:00",
  "updated_at": "2025-07-19T19:56:13.654470+00:00"
}

================================================
FILE: data/benchmarks/mathvista.json
================================================
{
  "benchmark_id": "mathvista",
  "name": "MathVista",
  "parent_benchmark_id": null,
  "categories": ["math", "vision", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MathVista evaluates mathematical reasoning of foundation models in visual contexts. It consists of 6,141 examples derived from 28 existing multimodal datasets and 3 newly created datasets (IQTest, FunctionQA, and PaperQA), combining challenges from diverse mathematical and visual tasks to assess models' ability to understand complex figures and perform rigorous reasoning.",
  "paper_link": "https://arxiv.org/abs/2310.02255",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.069611+00:00",
  "updated_at": "2025-07-19T19:56:12.069611+00:00"
}

================================================
FILE: data/benchmarks/mbpp+.json
================================================
{
  "benchmark_id": "mbpp+",
  "name": "MBPP+",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MBPP+ is an enhanced version of MBPP (Mostly Basic Python Problems) with significantly more test cases (35x) for more rigorous evaluation. MBPP is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers, covering programming fundamentals and standard library functionality.",
  "paper_link": "https://arxiv.org/abs/2108.07732",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.501855+00:00",
  "updated_at": "2025-07-19T19:56:14.501855+00:00"
}

================================================
FILE: data/benchmarks/mbpp-++-base-version.json
================================================
{
  "benchmark_id": "mbpp-++-base-version",
  "name": "MBPP ++ base version",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. Each problem consists of a task description, code solution, and 3 automated test cases covering programming fundamentals and standard library functionality. This is an enhanced version with additional test cases.",
  "paper_link": "https://arxiv.org/abs/2108.07732",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.341560+00:00",
  "updated_at": "2025-07-19T19:56:14.341560+00:00"
}

================================================
FILE: data/benchmarks/mbpp-evalplus-(base).json
================================================
{
  "benchmark_id": "mbpp-evalplus-(base)",
  "name": "MBPP EvalPlus (base)",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. EvalPlus extends MBPP with significantly more test cases (35x) for more rigorous evaluation of LLM-synthesized code, providing high-quality and precise evaluation.",
  "paper_link": "https://arxiv.org/abs/2108.07732",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.421722+00:00",
  "updated_at": "2025-07-19T19:56:14.421722+00:00"
}

================================================
FILE: data/benchmarks/mbpp-evalplus.json
================================================
{
  "benchmark_id": "mbpp-evalplus",
  "name": "MBPP EvalPlus",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. EvalPlus extends MBPP with significantly more test cases (35x) for more rigorous evaluation of LLM-synthesized code, providing high-quality and precise evaluation.",
  "paper_link": "https://arxiv.org/abs/2108.07732",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.425667+00:00",
  "updated_at": "2025-07-19T19:56:14.425667+00:00"
}

================================================
FILE: data/benchmarks/mbpp-pass@1.json
================================================
{
  "benchmark_id": "mbpp-pass@1",
  "name": "MBPP pass@1",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. Each problem consists of a task description, code solution, and 3 automated test cases. This variant uses pass@1 evaluation metric measuring the percentage of problems solved correctly on the first attempt.",
  "paper_link": "https://arxiv.org/abs/2108.07732",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.138778+00:00",
  "updated_at": "2025-07-19T19:56:15.138778+00:00"
}

================================================
FILE: data/benchmarks/mbpp-plus.json
================================================
{
  "benchmark_id": "mbpp-plus",
  "name": "MBPP Plus",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. Each problem consists of a task description, code solution, and 3 automated test cases covering programming fundamentals and standard library functionality. This is an enhanced version with additional test cases for more rigorous evaluation.",
  "paper_link": "https://arxiv.org/abs/2108.07732",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-08-03T22:06:11.143382+00:00",
  "updated_at": "2025-08-03T22:06:11.143382+00:00"
}


================================================
FILE: data/benchmarks/mbpp.json
================================================
{
  "benchmark_id": "mbpp",
  "name": "MBPP",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 100.0,
  "language": "en",
  "description": "MBPP (Mostly Basic Python Problems) is a benchmark of 974 crowd-sourced Python programming problems designed to be solvable by entry-level programmers. Each problem consists of a task description, code solution, and 3 automated test cases covering programming fundamentals and standard library functionality.",
  "paper_link": "https://arxiv.org/abs/2108.07732",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.453370+00:00",
  "updated_at": "2025-07-19T19:56:13.453370+00:00"
}

================================================
FILE: data/benchmarks/medxpertqa.json
================================================
{
  "benchmark_id": "medxpertqa",
  "name": "MedXpertQA",
  "parent_benchmark_id": null,
  "categories": ["healthcare", "reasoning", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A comprehensive benchmark to evaluate expert-level medical knowledge and advanced reasoning, featuring 4,460 questions spanning 17 specialties and 11 body systems. Includes both text-only and multimodal subsets with expert-level exam questions incorporating diverse medical images and rich clinical information.",
  "paper_link": "https://arxiv.org/abs/2501.18362",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.040381+00:00",
  "updated_at": "2025-07-19T19:56:14.040381+00:00"
}

================================================
FILE: data/benchmarks/mega-mlqa.json
================================================
{
  "benchmark_id": "mega-mlqa",
  "name": "MEGA MLQA",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "MLQA as part of the MEGA (Multilingual Evaluation of Generative AI) benchmark suite. A multi-way aligned extractive QA evaluation benchmark for cross-lingual question answering across 7 languages (English, Arabic, German, Spanish, Hindi, Vietnamese, and Simplified Chinese) with over 12K QA instances in English and 5K in each other language.",
  "paper_link": "https://arxiv.org/abs/2303.12528",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.187404+00:00",
  "updated_at": "2025-07-19T19:56:14.187404+00:00"
}

================================================
FILE: data/benchmarks/mega-tydi-qa.json
================================================
{
  "benchmark_id": "mega-tydi-qa",
  "name": "MEGA TyDi QA",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "TyDi QA as part of the MEGA benchmark suite. A question answering dataset covering 11 typologically diverse languages (Arabic, Bengali, English, Finnish, Indonesian, Japanese, Korean, Russian, Swahili, Telugu, and Thai) with 204K question-answer pairs. Features realistic information-seeking questions written by people who want to know the answer but don't know it yet.",
  "paper_link": "https://arxiv.org/abs/2003.05002",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.192871+00:00",
  "updated_at": "2025-07-19T19:56:14.192871+00:00"
}

================================================
FILE: data/benchmarks/mega-udpos.json
================================================
{
  "benchmark_id": "mega-udpos",
  "name": "MEGA UDPOS",
  "parent_benchmark_id": null,
  "categories": ["language"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "Universal Dependencies POS tagging as part of the MEGA benchmark suite. A multilingual part-of-speech tagging dataset based on Universal Dependencies treebanks, utilizing the universal POS tag set of 17 tags across 38 diverse languages from different language families. Used for evaluating multilingual POS tagging systems.",
  "paper_link": "https://arxiv.org/abs/2004.10643",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.198318+00:00",
  "updated_at": "2025-07-19T19:56:14.198318+00:00"
}

================================================
FILE: data/benchmarks/mega-xcopa.json
================================================
{
  "benchmark_id": "mega-xcopa",
  "name": "MEGA XCOPA",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "language"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "XCOPA (Cross-lingual Choice of Plausible Alternatives) as part of the MEGA benchmark suite. A typologically diverse multilingual dataset for causal commonsense reasoning in 11 languages, including resource-poor languages like Eastern Apurímac Quechua and Haitian Creole. Requires models to select which choice is the effect or cause of a given premise.",
  "paper_link": "https://arxiv.org/abs/2005.00333",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.205296+00:00",
  "updated_at": "2025-07-19T19:56:14.205296+00:00"
}

================================================
FILE: data/benchmarks/mega-xstorycloze.json
================================================
{
  "benchmark_id": "mega-xstorycloze",
  "name": "MEGA XStoryCloze",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "language"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "XStoryCloze as part of the MEGA benchmark suite. A cross-lingual story completion task that consists of professionally translated versions of the English StoryCloze dataset to 10 non-English languages. Requires models to predict the correct ending for a given four-sentence story, evaluating commonsense reasoning and narrative understanding.",
  "paper_link": "https://arxiv.org/abs/2303.12528",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.212479+00:00",
  "updated_at": "2025-07-19T19:56:14.212479+00:00"
}

================================================
FILE: data/benchmarks/meld.json
================================================
{
  "benchmark_id": "meld",
  "name": "Meld",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "psychology"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MELD (Multimodal EmotionLines Dataset) is a multimodal multi-party dataset for emotion recognition in conversations. Contains approximately 13,000 utterances from 1,433 dialogues extracted from the TV series Friends. Each utterance is annotated with emotion (Anger, Disgust, Sadness, Joy, Neutral, Surprise, Fear) and sentiment labels across audio, visual, and textual modalities.",
  "paper_link": "https://arxiv.org/abs/1810.02508",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.842977+00:00",
  "updated_at": "2025-07-19T19:56:14.842977+00:00"
}

================================================
FILE: data/benchmarks/mgsm.json
================================================
{
  "benchmark_id": "mgsm",
  "name": "MGSM",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "MGSM (Multilingual Grade School Math) is a benchmark of grade-school math problems. Contains 250 grade-school math problems manually translated from the GSM8K dataset into ten typologically diverse languages: Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, and Telugu. Evaluates multilingual mathematical reasoning capabilities.",
  "paper_link": "https://arxiv.org/abs/2210.03057",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.669061+00:00",
  "updated_at": "2025-07-19T19:56:13.669061+00:00"
}

================================================
FILE: data/benchmarks/mimic-cxr.json
================================================
{
  "benchmark_id": "mimic-cxr",
  "name": "MIMIC CXR",
  "parent_benchmark_id": null,
  "categories": ["healthcare", "vision", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MIMIC-CXR is a large publicly available dataset of chest radiographs with free-text radiology reports. Contains 377,110 images corresponding to 227,835 radiographic studies from 65,379 patients at Beth Israel Deaconess Medical Center. The dataset is de-identified and widely used for medical imaging research, automated report generation, and medical AI development.",
  "paper_link": "https://arxiv.org/abs/1901.07042",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.017221+00:00",
  "updated_at": "2025-07-19T19:56:14.017221+00:00"
}

================================================
FILE: data/benchmarks/mlvu-m.json
================================================
{
  "benchmark_id": "mlvu-m",
  "name": "MLVU-M",
  "parent_benchmark_id": null,
  "categories": ["general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
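  "description": "MLVU-M is the multiple-choice portion of MLVU (Multi-task Long Video Understanding), a benchmark for evaluating multimodal large language models on long videos. Results are reported as the average accuracy over the multiple-choice tasks (M-Avg).",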
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.931298+00:00",
  "updated_at": "2025-07-19T19:56:14.931298+00:00"
}

================================================
FILE: data/benchmarks/mlvu.json
================================================
{
  "benchmark_id": "mlvu",
  "name": "MLVU",
  "parent_benchmark_id": null,
  "categories": ["video", "multimodal", "long_context"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A comprehensive benchmark for multi-task long video understanding that evaluates multimodal large language models on videos ranging from 3 minutes to 2 hours across 9 distinct tasks including reasoning, captioning, recognition, and summarization.",
  "paper_link": "https://arxiv.org/abs/2406.04264",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.755571+00:00",
  "updated_at": "2025-07-19T19:56:14.755571+00:00"
}

================================================
FILE: data/benchmarks/mm-if-eval.json
================================================
{
  "benchmark_id": "mm-if-eval",
  "name": "MM IF-Eval",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A challenging multimodal instruction-following benchmark that includes both compose-level constraints for output responses and perception-level constraints tied to input images, with a comprehensive evaluation pipeline.",
  "paper_link": "https://arxiv.org/abs/2504.07957",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.142939+00:00",
  "updated_at": "2025-07-19T19:56:15.142939+00:00"
}

================================================
FILE: data/benchmarks/mm-mind2web.json
================================================
{
  "benchmark_id": "mm-mind2web",
  "name": "MM-Mind2Web",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "frontend_development", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A multimodal web navigation benchmark comprising 2,000 open-ended tasks spanning 137 websites across 31 domains. Each task includes HTML documents paired with webpage screenshots, action sequences, and complex web interactions.",
  "paper_link": "https://arxiv.org/abs/2306.06070",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.753488+00:00",
  "updated_at": "2025-07-19T19:56:12.753488+00:00"
}

================================================
FILE: data/benchmarks/mm-mt-bench.json
================================================
{
  "benchmark_id": "mm-mt-bench",
  "name": "MM-MT-Bench",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "communication"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 100.0,
  "language": "en",
  "description": "A multi-turn LLM-as-a-judge evaluation benchmark for testing multimodal instruction-tuned models' ability to follow user instructions in multi-turn dialogues and answer open-ended questions in a zero-shot manner.",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.880812+00:00",
  "updated_at": "2025-07-19T19:56:14.880812+00:00"
}

================================================
FILE: data/benchmarks/mmau-music.json
================================================
{
  "benchmark_id": "mmau-music",
  "name": "MMAU Music",
  "parent_benchmark_id": null,
  "categories": ["audio", "multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A subset of the MMAU benchmark focused specifically on music understanding and reasoning tasks. Part of a comprehensive multimodal audio understanding benchmark that evaluates models on expert-level knowledge and complex reasoning across music audio clips.",
  "paper_link": "https://arxiv.org/abs/2410.19168",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.851711+00:00",
  "updated_at": "2025-07-19T19:56:14.851711+00:00"
}

================================================
FILE: data/benchmarks/mmau-sound.json
================================================
{
  "benchmark_id": "mmau-sound",
  "name": "MMAU Sound",
  "parent_benchmark_id": null,
  "categories": ["audio", "multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A subset of the MMAU benchmark focused specifically on environmental sound understanding and reasoning tasks. Part of a comprehensive multimodal audio understanding benchmark that evaluates models on expert-level knowledge and complex reasoning across environmental sound clips.",
  "paper_link": "https://arxiv.org/abs/2410.19168",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.859503+00:00",
  "updated_at": "2025-07-19T19:56:14.859503+00:00"
}

================================================
FILE: data/benchmarks/mmau-speech.json
================================================
{
  "benchmark_id": "mmau-speech",
  "name": "MMAU Speech",
  "parent_benchmark_id": null,
  "categories": ["audio", "multimodal", "reasoning", "speech-to-text"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A subset of the MMAU benchmark focused specifically on speech understanding and reasoning tasks. Part of a comprehensive multimodal audio understanding benchmark that evaluates models on expert-level knowledge and complex reasoning across speech audio clips.",
  "paper_link": "https://arxiv.org/abs/2410.19168",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.863540+00:00",
  "updated_at": "2025-07-19T19:56:14.863540+00:00"
}

================================================
FILE: data/benchmarks/mmau.json
================================================
{
  "benchmark_id": "mmau",
  "name": "MMAU",
  "parent_benchmark_id": null,
  "categories": ["audio", "multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A massive multi-task audio understanding and reasoning benchmark comprising 10,000 carefully curated audio clips paired with human-annotated natural language questions spanning speech, environmental sounds, and music. Requires expert-level knowledge and complex reasoning across 27 distinct skills.",
  "paper_link": "https://arxiv.org/abs/2410.19168",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.846435+00:00",
  "updated_at": "2025-07-19T19:56:14.846435+00:00"
}

================================================
FILE: data/benchmarks/mmbench-test.json
================================================
{
  "benchmark_id": "mmbench-test",
  "name": "MMBench_test",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Test set of MMBench, a bilingual benchmark for assessing multi-modal capabilities of vision-language models through multiple-choice questions in both English and Chinese, providing systematic evaluation across diverse vision-language tasks.",
  "paper_link": "https://arxiv.org/abs/2307.06281",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.607904+00:00",
  "updated_at": "2025-07-19T19:56:14.607904+00:00"
}

================================================
FILE: data/benchmarks/mmbench-v1.1.json
================================================
{
  "benchmark_id": "mmbench-v1.1",
  "name": "MMBench-V1.1",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "Version 1.1 of MMBench, an improved bilingual benchmark for assessing multi-modal capabilities of vision-language models through multiple-choice questions in both English and Chinese, providing systematic evaluation across diverse vision-language tasks.",
  "paper_link": "https://arxiv.org/abs/2307.06281",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.868950+00:00",
  "updated_at": "2025-07-19T19:56:14.868950+00:00"
}

================================================
FILE: data/benchmarks/mmbench-video.json
================================================
{
  "benchmark_id": "mmbench-video",
  "name": "MMBench-Video",
  "parent_benchmark_id": null,
  "categories": ["video", "multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A long-form multi-shot benchmark for holistic video understanding that incorporates approximately 600 web videos from YouTube spanning 16 major categories, with each video ranging from 30 seconds to 6 minutes. Includes roughly 2,000 original question-answer pairs covering 26 fine-grained capabilities.",
  "paper_link": "https://arxiv.org/abs/2406.14515",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.738914+00:00",
  "updated_at": "2025-07-19T19:56:14.738914+00:00"
}

================================================
FILE: data/benchmarks/mmbench.json
================================================
{
  "benchmark_id": "mmbench",
  "name": "MMBench",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "A bilingual benchmark for assessing multi-modal capabilities of vision-language models through multiple-choice questions in both English and Chinese, providing systematic evaluation across diverse vision-language tasks with robust metrics.",
  "paper_link": "https://arxiv.org/abs/2307.06281",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.235585+00:00",
  "updated_at": "2025-07-19T19:56:14.235585+00:00"
}

================================================
FILE: data/benchmarks/mme-realworld.json
================================================
{
  "benchmark_id": "mme-realworld",
  "name": "MME-RealWorld",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "general"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A comprehensive evaluation benchmark for Multimodal Large Language Models featuring 13,366 high-resolution images and 29,429 question-answer pairs across 43 subtasks and 5 real-world scenarios. The largest manually annotated multimodal benchmark to date, designed to test MLLMs on challenging high-resolution real-world scenarios.",
  "paper_link": "https://arxiv.org/abs/2408.13257",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.877676+00:00",
  "updated_at": "2025-07-19T19:56:14.877676+00:00"
}

================================================
FILE: data/benchmarks/mme.json
================================================
{
  "benchmark_id": "mme",
  "name": "MME",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A comprehensive evaluation benchmark for Multimodal Large Language Models measuring both perception and cognition abilities across 14 subtasks. Features manually designed instruction-answer pairs to avoid data leakage and provides systematic quantitative assessment of MLLM capabilities.",
  "paper_link": "https://arxiv.org/abs/2306.13394",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.022505+00:00",
  "updated_at": "2025-07-19T19:56:15.022505+00:00"
}

================================================
FILE: data/benchmarks/mmlu-(cot).json
================================================
{
  "benchmark_id": "mmlu-(cot)",
  "name": "MMLU (CoT)",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning", "math", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Chain-of-Thought variant of the Massive Multitask Language Understanding benchmark, evaluating language models across 57 tasks including elementary mathematics, US history, computer science, law, and other professional and academic subjects. This version uses chain-of-thought prompting to elicit step-by-step reasoning.",
  "paper_link": "https://arxiv.org/abs/2009.03300",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.330830+00:00",
  "updated_at": "2025-07-19T19:56:14.330830+00:00"
}

================================================
FILE: data/benchmarks/mmlu-base.json
================================================
{
  "benchmark_id": "mmlu-base",
  "name": "MMLU-Base",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning", "math", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Base version of the Massive Multitask Language Understanding benchmark, evaluating language models across 57 tasks including elementary mathematics, US history, computer science, law, and other professional and academic subjects. Designed to comprehensively measure the breadth and depth of a model's academic and professional understanding.",
  "paper_link": "https://arxiv.org/abs/2009.03300",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.562710+00:00",
  "updated_at": "2025-07-19T19:56:14.562710+00:00"
}

================================================
FILE: data/benchmarks/mmlu-chat.json
================================================
{
  "benchmark_id": "mmlu-chat",
  "name": "MMLU Chat",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning", "math", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Chat-format variant of the Massive Multitask Language Understanding benchmark, evaluating language models across 57 tasks including elementary mathematics, US history, computer science, law, and other professional and academic subjects. This version uses conversational prompting format for model evaluation.",
  "paper_link": "https://arxiv.org/abs/2009.03300",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.095600+00:00",
  "updated_at": "2025-07-19T19:56:15.095600+00:00"
}

================================================
FILE: data/benchmarks/mmlu-french.json
================================================
{
  "benchmark_id": "mmlu-french",
  "name": "MMLU French",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning", "math", "general"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "fr",
  "description": "French language variant of the Massive Multitask Language Understanding benchmark, evaluating language models across 57 tasks including elementary mathematics, US history, computer science, law, and other professional and academic subjects. This multilingual version tests model performance in French.",
  "paper_link": "https://arxiv.org/abs/2009.03300",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.175211+00:00",
  "updated_at": "2025-07-19T19:56:15.175211+00:00"
}

================================================
FILE: data/benchmarks/mmlu-pro.json
================================================
{
  "benchmark_id": "mmlu-pro",
  "name": "MMLU-Pro",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning", "math", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A more robust and challenging multi-task language understanding benchmark that extends MMLU by expanding multiple-choice options from 4 to 10, eliminating trivial questions, and focusing on reasoning-intensive tasks. Features over 12,000 curated questions across 14 domains and causes a 16-33% accuracy drop compared to original MMLU.",
  "paper_link": "https://arxiv.org/abs/2406.01574",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:11.408351+00:00",
  "updated_at": "2025-07-19T19:56:11.408351+00:00"
}

================================================
FILE: data/benchmarks/mmlu-prox.json
================================================
{
  "benchmark_id": "mmlu-prox",
  "name": "MMLU-ProX",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning", "math", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Extended version of MMLU-Pro providing additional challenging multiple-choice questions for evaluating language models across diverse academic and professional domains. Built on the foundation of the Massive Multitask Language Understanding benchmark framework.",
  "paper_link": "https://arxiv.org/abs/2406.01574",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.738623+00:00",
  "updated_at": "2025-07-19T19:56:13.738623+00:00"
}

================================================
FILE: data/benchmarks/mmlu-redux-2.0.json
================================================
{
  "benchmark_id": "mmlu-redux-2.0",
  "name": "MMLU-redux-2.0",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning", "math", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A curated version of the MMLU benchmark featuring 5,700 manually re-annotated questions across 57 subjects to identify and correct errors in the original dataset. Addresses the 6.49% error rate found in MMLU and provides more reliable evaluation metrics for language models.",
  "paper_link": "https://arxiv.org/abs/2406.04127",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:11.518552+00:00",
  "updated_at": "2025-07-19T19:56:11.518552+00:00"
}

================================================
FILE: data/benchmarks/mmlu-redux.json
================================================
{
  "benchmark_id": "mmlu-redux",
  "name": "MMLU-Redux",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning", "math", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "An improved version of the MMLU benchmark featuring manually re-annotated questions to identify and correct errors in the original dataset. Provides more reliable evaluation metrics for language models by addressing dataset quality issues found in the original MMLU.",
  "paper_link": "https://arxiv.org/abs/2406.04127",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-05T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/mmlu-stem.json
================================================
{
  "benchmark_id": "mmlu-stem",
  "name": "MMLU-STEM",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning", "physics", "chemistry"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "STEM-focused subset of the Massive Multitask Language Understanding benchmark, evaluating language models on science, technology, engineering, and mathematics topics including physics, chemistry, mathematics, and other technical subjects.",
  "paper_link": "https://arxiv.org/abs/2009.03300",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.495405+00:00",
  "updated_at": "2025-07-19T19:56:14.495405+00:00"
}

================================================
FILE: data/benchmarks/mmlu.json
================================================
{
  "benchmark_id": "mmlu",
  "name": "MMLU",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning", "language", "math"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Massive Multitask Language Understanding benchmark testing knowledge across 57 diverse subjects including STEM, humanities, social sciences, and professional domains.",
  "paper_link": "https://arxiv.org/abs/2009.03300",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:11.200416+00:00",
  "updated_at": "2025-07-19T19:56:11.200416+00:00"
}

================================================
FILE: data/benchmarks/mmmlu.json
================================================
{
  "benchmark_id": "mmmlu",
  "name": "MMMLU",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning", "math", "general"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "Multilingual Massive Multitask Language Understanding dataset released by OpenAI, featuring professionally translated MMLU test questions across 14 languages including Arabic, Bengali, German, Spanish, French, Hindi, Indonesian, Italian, Japanese, Korean, Portuguese, Swahili, Yoruba, and Chinese. Contains approximately 15,908 multiple-choice questions per language covering 57 subjects.",
  "paper_link": "https://arxiv.org/abs/2009.03300",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.144789+00:00",
  "updated_at": "2025-07-19T19:56:14.144789+00:00"
}

================================================
FILE: data/benchmarks/mmmu-(val).json
================================================
{
  "benchmark_id": "mmmu-(val)",
  "name": "MMMU (val)",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "reasoning", "general"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Validation set of the Massive Multi-discipline Multimodal Understanding and Reasoning benchmark. Features college-level multimodal questions across 6 core disciplines (Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, Tech & Engineering) spanning 30 subjects and 183 subfields with diverse image types including charts, diagrams, maps, and tables.",
  "paper_link": "https://arxiv.org/abs/2311.16502",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.593262+00:00",
  "updated_at": "2025-07-19T19:56:13.593262+00:00"
}

================================================
FILE: data/benchmarks/mmmu-(validation).json
================================================
{
  "benchmark_id": "mmmu-(validation)",
  "name": "MMMU (validation)",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "reasoning", "general"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Validation set of the Massive Multi-discipline Multimodal Understanding and Reasoning benchmark. Features college-level multimodal questions across 6 core disciplines (Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, Tech & Engineering) spanning 30 subjects and 183 subfields with diverse image types including charts, diagrams, maps, and tables.",
  "paper_link": "https://arxiv.org/abs/2311.16502",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.118197+00:00",
  "updated_at": "2025-07-19T19:56:15.118197+00:00"
}

================================================
FILE: data/benchmarks/mmmu-pro.json
================================================
{
  "benchmark_id": "mmmu-pro",
  "name": "MMMU-Pro",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "reasoning", "general"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A more robust multi-discipline multimodal understanding benchmark that enhances MMMU through a three-step process: filtering text-only answerable questions, augmenting candidate options, and introducing vision-only input settings. Models score significantly lower on MMMU-Pro than on the original MMMU (16.8-26.9%), providing more rigorous evaluation that closely mimics real-world scenarios.",
  "paper_link": "https://arxiv.org/abs/2409.02813",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.282252+00:00",
  "updated_at": "2025-07-19T19:56:14.282252+00:00"
}

================================================
FILE: data/benchmarks/mmmu.json
================================================
{
  "benchmark_id": "mmmu",
  "name": "MMMU",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "reasoning", "general"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MMMU (Massive Multi-discipline Multimodal Understanding) is a benchmark designed to evaluate multimodal models on college-level subject knowledge and deliberate reasoning. Contains 11.5K meticulously collected multimodal questions from college exams, quizzes, and textbooks, covering six core disciplines: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering across 30 subjects and 183 subfields.",
  "paper_link": "https://arxiv.org/abs/2311.16502",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.130105+00:00",
  "updated_at": "2025-07-19T19:56:12.130105+00:00"
}

================================================
FILE: data/benchmarks/mmmuval.json
================================================
{
  "benchmark_id": "mmmuval",
  "name": "MMMUval",
  "parent_benchmark_id": null,
  "categories": ["vision", "general", "reasoning", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Validation set for MMMU (Massive Multi-discipline Multimodal Understanding and Reasoning) benchmark, designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning across Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering.",
  "paper_link": "https://arxiv.org/abs/2311.16502",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.575948+00:00",
  "updated_at": "2025-07-19T19:56:14.575948+00:00"
}

================================================
FILE: data/benchmarks/mmstar.json
================================================
{
  "benchmark_id": "mmstar",
  "name": "MMStar",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "reasoning", "general"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MMStar is an elite vision-indispensable multimodal benchmark comprising 1,500 challenge samples meticulously selected by humans to evaluate 6 core capabilities and 18 detailed axes. The benchmark addresses issues of visual content unnecessity and unintentional data leakage in existing multimodal evaluations.",
  "paper_link": "https://arxiv.org/abs/2403.20330",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.660584+00:00",
  "updated_at": "2025-07-19T19:56:14.660584+00:00"
}

================================================
FILE: data/benchmarks/mmt-bench.json
================================================
{
  "benchmark_id": "mmt-bench",
  "name": "MMT-Bench",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "reasoning", "general"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MMT-Bench is a comprehensive multimodal benchmark for evaluating Large Vision-Language Models towards multitask AGI. It comprises 31,325 meticulously curated multi-choice visual questions from various multimodal scenarios such as vehicle driving and embodied navigation, covering 32 core meta-tasks and 162 subtasks in multimodal understanding.",
  "paper_link": "https://arxiv.org/abs/2404.16006",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.674184+00:00",
  "updated_at": "2025-07-19T19:56:14.674184+00:00"
}

================================================
FILE: data/benchmarks/mmvet.json
================================================
{
  "benchmark_id": "mmvet",
  "name": "MMVet",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "reasoning", "general", "spatial_reasoning", "math"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MM-Vet is an evaluation benchmark that examines large multimodal models on complicated multimodal tasks requiring integrated capabilities. It assesses six core vision-language capabilities: recognition, knowledge, spatial awareness, language generation, OCR, and math through questions that require one or more of these capabilities.",
  "paper_link": "https://arxiv.org/abs/2308.02490",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.684742+00:00",
  "updated_at": "2025-07-19T19:56:14.684742+00:00"
}

================================================
FILE: data/benchmarks/mmvetgpt4turbo.json
================================================
{
  "benchmark_id": "mmvetgpt4turbo",
  "name": "MMVetGPT4Turbo",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "reasoning", "general", "spatial_reasoning", "math"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MM-Vet evaluation using GPT-4 Turbo for scoring. This variant of MM-Vet examines large multimodal models on complicated multimodal tasks requiring integrated capabilities across six core vision-language abilities: recognition, knowledge, spatial awareness, language generation, OCR, and math.",
  "paper_link": "https://arxiv.org/abs/2308.02490",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.611567+00:00",
  "updated_at": "2025-07-19T19:56:14.611567+00:00"
}

================================================
FILE: data/benchmarks/mobileminiwob++-sr.json
================================================
{
  "benchmark_id": "mobileminiwob++-sr",
  "name": "MobileMiniWob++_SR",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "frontend_development"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MobileMiniWob++ SR (Success Rate) is an adaptation of the MiniWob++ web interaction benchmark for mobile Android environments within AndroidWorld. It comprises 92 web interaction tasks adapted for touch-based mobile interfaces, evaluating agents' ability to navigate and interact with web applications on mobile devices.",
  "paper_link": "https://arxiv.org/abs/2405.14573",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.816755+00:00",
  "updated_at": "2025-07-19T19:56:14.816755+00:00"
}


================================================
FILE: data/benchmarks/mrcr-1m-(pointwise).json
================================================
{
  "benchmark_id": "mrcr-1m-(pointwise)",
  "name": "MRCR 1M (pointwise)",
  "parent_benchmark_id": null,
  "categories": ["long_context", "reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MRCR 1M (pointwise) is a variant of the Multi-Round Coreference Resolution benchmark that uses pointwise evaluation for ultra-long contexts (~1M tokens). This version evaluates each response independently rather than comparatively, testing models' absolute performance on long-context reasoning tasks.",
  "paper_link": "https://arxiv.org/abs/2409.12640",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.912789+00:00",
  "updated_at": "2025-07-19T19:56:13.912789+00:00"
}

================================================
FILE: data/benchmarks/mrcr-1m.json
================================================
{
  "benchmark_id": "mrcr-1m",
  "name": "MRCR 1M",
  "parent_benchmark_id": null,
  "categories": ["long_context", "reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MRCR 1M is a variant of the Multi-Round Coreference Resolution benchmark designed for testing extremely long context capabilities with approximately 1 million tokens. It evaluates models' ability to maintain reasoning and attention across ultra-long conversations.",
  "paper_link": "https://arxiv.org/abs/2409.12640",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.954336+00:00",
  "updated_at": "2025-07-19T19:56:13.954336+00:00"
}

================================================
FILE: data/benchmarks/mrcr-v2-(8-needle).json
================================================
{
  "benchmark_id": "mrcr-v2-(8-needle)",
  "name": "MRCR v2 (8-needle)",
  "parent_benchmark_id": null,
  "categories": ["long_context", "reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MRCR v2 (8-needle) is a variant of the Multi-Round Coreference Resolution benchmark that includes 8 needle items to retrieve from long contexts. This tests models' ability to simultaneously track and reason about multiple pieces of information across extended conversations.",
  "paper_link": "https://arxiv.org/abs/2409.12640",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.010914+00:00",
  "updated_at": "2025-07-19T19:56:14.010914+00:00"
}

================================================
FILE: data/benchmarks/mrcr-v2.json
================================================
{
  "benchmark_id": "mrcr-v2",
  "name": "MRCR v2",
  "parent_benchmark_id": null,
  "categories": ["long_context", "reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MRCR v2 (Multi-Round Coreference Resolution version 2) is an enhanced version of the synthetic long-context reasoning task. It extends the original MRCR framework with improved evaluation criteria and additional complexity for testing models' ability to maintain attention and reasoning across extended contexts.",
  "paper_link": "https://arxiv.org/abs/2409.12640",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.963241+00:00",
  "updated_at": "2025-07-19T19:56:13.963241+00:00"
}

================================================
FILE: data/benchmarks/mrcr.json
================================================
{
  "benchmark_id": "mrcr",
  "name": "MRCR",
  "parent_benchmark_id": null,
  "categories": ["long_context", "reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MRCR (Multi-Round Coreference Resolution) is a synthetic long-context reasoning task where models must navigate long conversations to reproduce specific model outputs. It tests the ability to distinguish between similar requests and reason about ordering while maintaining attention across extended contexts.",
  "paper_link": "https://arxiv.org/abs/2409.12640",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.887445+00:00",
  "updated_at": "2025-07-19T19:56:13.887445+00:00"
}

================================================
FILE: data/benchmarks/mt-bench.json
================================================
{
  "benchmark_id": "mt-bench",
  "name": "MT-Bench",
  "parent_benchmark_id": null,
  "categories": ["communication", "reasoning", "general", "roleplay"],
  "modality": "text",
  "multilingual": false,
  "max_score": 100.0,
  "language": "en",
  "description": "MT-Bench is a challenging multi-turn benchmark that measures the ability of large language models to hold coherent, informative, and engaging conversations. It uses strong LLMs as judges for scalable and explainable evaluation of multi-turn dialogue capabilities.",
  "paper_link": "https://arxiv.org/abs/2306.05685",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.516415+00:00",
  "updated_at": "2025-07-19T19:56:14.516415+00:00"
}

================================================
FILE: data/benchmarks/mtvqa.json
================================================
{
  "benchmark_id": "mtvqa",
  "name": "MTVQA",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "text-to-image"],
  "modality": "multimodal",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "MTVQA (Multilingual Text-Centric Visual Question Answering) is the first benchmark featuring high-quality human expert annotations across 9 diverse languages, consisting of 6,778 question-answer pairs across 2,116 images. It addresses visual-textual misalignment problems in multilingual text-centric VQA.",
  "paper_link": "https://arxiv.org/abs/2405.11985",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.587333+00:00",
  "updated_at": "2025-07-19T19:56:14.587333+00:00"
}

================================================
FILE: data/benchmarks/muirbench.json
================================================
{
  "benchmark_id": "muirbench",
  "name": "MuirBench",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A comprehensive benchmark for robust multi-image understanding capabilities of multimodal LLMs. Consists of 12 diverse multi-image tasks involving 10 categories of multi-image relations (e.g., multiview, temporal relations, narrative, complementary). Comprises 11,264 images and 2,600 multiple-choice questions created in a pairwise manner, where each standard instance is paired with an unanswerable variant for reliable assessment.",
  "paper_link": "https://arxiv.org/abs/2406.09411",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.888428+00:00",
  "updated_at": "2025-07-19T19:56:14.888428+00:00"
}

================================================
FILE: data/benchmarks/multi-if.json
================================================
{
  "benchmark_id": "multi-if",
  "name": "Multi-IF",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "communication", "language"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "Multi-IF benchmarks LLMs on multi-turn and multilingual instruction following. It expands upon IFEval by incorporating multi-turn sequences and translating English prompts into 7 other languages, resulting in 4,501 multilingual conversations with three turns each. The benchmark reveals that current leading LLMs struggle with maintaining accuracy in multi-turn instructions and shows higher error rates for non-Latin script languages.",
  "paper_link": "https://arxiv.org/abs/2410.15553",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.638787+00:00",
  "updated_at": "2025-07-19T19:56:14.638787+00:00"
}

================================================
FILE: data/benchmarks/multi-swe-bench.json
================================================
{
  "benchmark_id": "multi-swe-bench",
  "name": "Multi-SWE-Bench",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "code"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "A multilingual benchmark for issue resolving that evaluates Large Language Models' ability to resolve software issues across diverse programming ecosystems. Covers 7 programming languages (Java, TypeScript, JavaScript, Go, Rust, C, and C++) with 1,632 high-quality instances carefully annotated by 68 expert annotators. Addresses limitations of existing benchmarks that focus almost exclusively on Python.",
  "paper_link": "https://arxiv.org/abs/2504.02605",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-15T00:00:00.000000+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/multichallenge-(o3-mini-grader).json
================================================
{
  "benchmark_id": "multichallenge-(o3-mini-grader)",
  "name": "MultiChallenge (o3-mini grader)",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "language"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A realistic multi-turn conversation evaluation benchmark that challenges frontier LLMs across four key areas: instruction retention, inference memory, reliable versioned editing, and self-coherence. Despite near-perfect scores on existing benchmarks, frontier models achieve less than 50% accuracy on MultiChallenge.",
  "paper_link": "https://arxiv.org/abs/2501.17399",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.235758+00:00",
  "updated_at": "2025-07-19T19:56:15.235758+00:00"
}

================================================
FILE: data/benchmarks/multichallenge.json
================================================
{
  "benchmark_id": "multichallenge",
  "name": "Multi-Challenge",
  "parent_benchmark_id": null,
  "categories": ["communication", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MultiChallenge is a realistic multi-turn conversation evaluation benchmark that challenges frontier LLMs across four key categories: instruction retention (maintaining instructions throughout conversations), inference memory (recalling and connecting details from previous turns), reliable versioned editing (adapting to evolving instructions during collaborative editing), and self-coherence (avoiding contradictions in responses). The benchmark evaluates models on sustained, contextually complex dialogues across diverse topics including travel planning, technical documentation, and professional communication.",
  "paper_link": "https://arxiv.org/abs/2501.17399",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-05T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/multilf.json
================================================
{
  "benchmark_id": "multilf",
  "name": "MultiLF",
  "parent_benchmark_id": null,
  "categories": ["general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MultiLF benchmark",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.628191+00:00",
  "updated_at": "2025-07-19T19:56:14.628191+00:00"
}

================================================
FILE: data/benchmarks/multilingual-mgsm-(cot).json
================================================
{
  "benchmark_id": "multilingual-mgsm-(cot)",
  "name": "Multilingual MGSM (CoT)",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "Multilingual Grade School Math (MGSM) benchmark evaluates language models' chain-of-thought reasoning abilities across ten typologically diverse languages. Contains 250 grade-school math problems manually translated from the GSM8K dataset into languages including Bengali and Swahili.",
  "paper_link": "https://arxiv.org/abs/2210.03057",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.402248+00:00",
  "updated_at": "2025-07-19T19:56:14.402248+00:00"
}

================================================
FILE: data/benchmarks/multilingual-mmlu.json
================================================
{
  "benchmark_id": "multilingual-mmlu",
  "name": "Multilingual MMLU",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning", "language"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "MMLU-ProX is a comprehensive multilingual benchmark covering 29 typologically diverse languages, building upon MMLU-Pro. Each language version consists of 11,829 identical questions enabling direct cross-linguistic comparisons. The benchmark evaluates large language models' reasoning capabilities across linguistic and cultural boundaries through challenging, reasoning-focused questions with 10 answer choices.",
  "paper_link": "https://arxiv.org/abs/2503.10497",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.139086+00:00",
  "updated_at": "2025-07-19T19:56:14.139086+00:00"
}

================================================
FILE: data/benchmarks/multipl-e-humaneval.json
================================================
{
  "benchmark_id": "multipl-e-humaneval",
  "name": "Multipl-E HumanEval",
  "parent_benchmark_id": null,
  "categories": ["language", "general"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "MultiPL-E is a scalable and extensible approach to benchmarking neural code generation that translates unit test-driven code generation benchmarks across multiple programming languages. It extends the HumanEval benchmark to 18 additional programming languages, enabling evaluation of code generation models across diverse programming paradigms and providing insights into how models generalize programming knowledge across language boundaries.",
  "paper_link": "https://arxiv.org/abs/2208.08227",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.345081+00:00",
  "updated_at": "2025-07-19T19:56:14.345081+00:00"
}

================================================
FILE: data/benchmarks/multipl-e-mbpp.json
================================================
{
  "benchmark_id": "multipl-e-mbpp",
  "name": "Multipl-E MBPP",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "MultiPL-E extends the Mostly Basic Python Problems (MBPP) benchmark to 18+ programming languages for evaluating multilingual code generation capabilities. MBPP contains 974 crowd-sourced programming problems designed to be solvable by entry-level programmers, covering programming fundamentals and standard library functionality. Each problem includes a task description, code solution, and automated test cases.",
  "paper_link": "https://arxiv.org/abs/2208.08227",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.353635+00:00",
  "updated_at": "2025-07-19T19:56:14.353635+00:00"
}

================================================
FILE: data/benchmarks/multipl-e.json
================================================
{
  "benchmark_id": "multipl-e",
  "name": "MultiPL-E",
  "parent_benchmark_id": null,
  "categories": ["general", "language"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "MultiPL-E is a scalable and extensible system for translating unit test-driven code generation benchmarks to multiple programming languages. It extends HumanEval and MBPP Python benchmarks to 18 additional programming languages, enabling evaluation of neural code generation models across diverse programming paradigms and language features.",
  "paper_link": "https://arxiv.org/abs/2208.08227",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.311919+00:00",
  "updated_at": "2025-07-19T19:56:12.311919+00:00"
}


================================================
FILE: data/benchmarks/musiccaps.json
================================================
{
  "benchmark_id": "musiccaps",
  "name": "MusicCaps",
  "parent_benchmark_id": null,
  "categories": ["audio", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MusicCaps is a dataset composed of 5,521 music examples, each labeled with an English aspect list and a free text caption written by musicians. The dataset contains 10-second music clips from AudioSet paired with rich textual descriptions that capture sonic qualities and musical elements like genre, mood, tempo, instrumentation, and rhythm. Created to support research in music-text understanding and generation tasks.",
  "paper_link": "https://arxiv.org/abs/2301.11325",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.892085+00:00",
  "updated_at": "2025-07-19T19:56:14.892085+00:00"
}

================================================
FILE: data/benchmarks/musr.json
================================================
{
  "benchmark_id": "musr",
  "name": "MuSR",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MuSR (Multistep Soft Reasoning) is a benchmark for evaluating language models on multistep soft reasoning tasks specified in natural language narratives. Created through a neurosymbolic synthetic-to-natural generation algorithm, it generates complex reasoning scenarios like murder mysteries roughly 1000 words in length that challenge current LLMs including GPT-4. The benchmark tests chain-of-thought reasoning capabilities across domains involving commonsense reasoning about physical and social situations.",
  "paper_link": "https://arxiv.org/abs/2310.16049",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.708705+00:00",
  "updated_at": "2025-07-19T19:56:12.708705+00:00"
}

================================================
FILE: data/benchmarks/mvbench.json
================================================
{
  "benchmark_id": "mvbench",
  "name": "MVBench",
  "parent_benchmark_id": null,
  "categories": ["vision", "video", "multimodal", "spatial_reasoning", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A comprehensive multi-modal video understanding benchmark covering 20 challenging video tasks that require temporal understanding beyond single-frame analysis. Tasks span from perception to cognition, including action recognition, temporal reasoning, spatial reasoning, object interaction, scene transition, and counterfactual inference. Uses a novel static-to-dynamic method to systematically generate video tasks from existing annotations.",
  "paper_link": "https://arxiv.org/abs/2311.17005",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.615534+00:00",
  "updated_at": "2025-07-19T19:56:14.615534+00:00"
}

================================================
FILE: data/benchmarks/natural-questions.json
================================================
{
  "benchmark_id": "natural-questions",
  "name": "Natural Questions",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general", "search"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Natural Questions is a question answering dataset featuring real anonymized queries issued to Google search engine. It contains 307,373 training examples where annotators provide long answers (passages) and short answers (entities) from Wikipedia pages, or mark them as unanswerable.",
  "paper_link": "https://arxiv.org/abs/1901.08634",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.178778+00:00",
  "updated_at": "2025-07-19T19:56:13.178778+00:00"
}

================================================
FILE: data/benchmarks/natural2code.json
================================================
{
  "benchmark_id": "natural2code",
  "name": "Natural2Code",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "NaturalCodeBench (NCB) is a challenging code benchmark designed to mirror the complexity and variety of real-world coding tasks. It comprises 402 high-quality problems in Python and Java, selected from natural user queries from online coding services, covering 6 different domains.",
  "paper_link": "https://arxiv.org/abs/2405.04520",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.518784+00:00",
  "updated_at": "2025-07-19T19:56:13.518784+00:00"
}

================================================
FILE: data/benchmarks/nexus.json
================================================
{
  "benchmark_id": "nexus",
  "name": "Nexus",
  "parent_benchmark_id": null,
  "categories": ["general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "NexusRaven benchmark for evaluating function calling capabilities of large language models in zero-shot scenarios across cybersecurity tools and API interactions",
  "paper_link": "https://openreview.net/pdf?id=5lcPe6DqfI",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.391550+00:00",
  "updated_at": "2025-07-19T19:56:14.391550+00:00"
}

================================================
FILE: data/benchmarks/nih-multi-needle.json
================================================
{
  "benchmark_id": "nih-multi-needle",
  "name": "NIH/Multi-needle",
  "parent_benchmark_id": null,
  "categories": ["long_context"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Multi-needle in a haystack benchmark for evaluating long-context comprehension capabilities of language models by testing retrieval of multiple target pieces of information from extended documents",
  "paper_link": "https://arxiv.org/abs/2406.11230",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.465778+00:00",
  "updated_at": "2025-07-19T19:56:14.465778+00:00"
}

================================================
FILE: data/benchmarks/nmos.json
================================================
{
  "benchmark_id": "nmos",
  "name": "NMOS",
  "parent_benchmark_id": null,
  "categories": ["general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 100.0,
  "language": "en",
  "description": "NMOS evaluation benchmark for assessing model performance on specialized tasks",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.895373+00:00",
  "updated_at": "2025-07-19T19:56:14.895373+00:00"
}

================================================
FILE: data/benchmarks/nq.json
================================================
{
  "benchmark_id": "nq",
  "name": "NQ",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Natural Questions (NQ) benchmark containing real user questions issued to Google search with answers found from Wikipedia, designed for training and evaluation of automatic question answering systems",
  "paper_link": "https://aclanthology.org/Q19-1026/",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.088246+00:00",
  "updated_at": "2025-07-19T19:56:15.088246+00:00"
}

================================================
FILE: data/benchmarks/ocrbench-v2-(en).json
================================================
{
  "benchmark_id": "ocrbench-v2-(en)",
  "name": "OCRBench-V2 (en)",
  "parent_benchmark_id": null,
  "categories": ["vision", "image-to-text"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "OCRBench v2 English subset: Enhanced benchmark for evaluating Large Multimodal Models on visual text localization and reasoning with English text content",
  "paper_link": "https://arxiv.org/abs/2501.00321",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.926330+00:00",
  "updated_at": "2025-07-19T19:56:14.926330+00:00"
}

================================================
FILE: data/benchmarks/ocrbench-v2-(zh).json
================================================
{
  "benchmark_id": "ocrbench-v2-(zh)",
  "name": "OCRBench-V2 (zh)",
  "parent_benchmark_id": null,
  "categories": ["vision", "image-to-text"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "zh",
  "description": "OCRBench v2 Chinese subset: Enhanced benchmark for evaluating Large Multimodal Models on visual text localization and reasoning with Chinese text content",
  "paper_link": "https://arxiv.org/abs/2501.00321",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.944963+00:00",
  "updated_at": "2025-07-19T19:56:14.944963+00:00"
}

================================================
FILE: data/benchmarks/ocrbench-v2.json
================================================
{
  "benchmark_id": "ocrbench-v2",
  "name": "OCRBench_V2",
  "parent_benchmark_id": null,
  "categories": ["vision", "image-to-text"],
  "modality": "multimodal",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "OCRBench v2: Enhanced large-scale bilingual benchmark for evaluating Large Multimodal Models on visual text localization and reasoning with 10,000 human-verified question-answering pairs across 8 core OCR capabilities",
  "paper_link": "https://arxiv.org/abs/2501.00321",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.898625+00:00",
  "updated_at": "2025-07-19T19:56:14.898625+00:00"
}

================================================
FILE: data/benchmarks/ocrbench.json
================================================
{
  "benchmark_id": "ocrbench",
  "name": "OCRBench",
  "parent_benchmark_id": null,
  "categories": ["vision", "image-to-text"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "OCRBench: Comprehensive evaluation benchmark for assessing Optical Character Recognition (OCR) capabilities in Large Multimodal Models across text recognition, scene text VQA, and document understanding tasks",
  "paper_link": "https://arxiv.org/abs/2305.07895",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.304601+00:00",
  "updated_at": "2025-07-19T19:56:14.304601+00:00"
}

================================================
FILE: data/benchmarks/odinw.json
================================================
{
  "benchmark_id": "odinw",
  "name": "ODinW",
  "parent_benchmark_id": null,
  "categories": ["vision"],
  "modality": "image",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Object Detection in the Wild (ODinW) benchmark for evaluating object detection models' task-level transfer ability across diverse real-world datasets in terms of prediction accuracy and adaptation efficiency",
  "paper_link": "https://arxiv.org/abs/2112.03857",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.902703+00:00",
  "updated_at": "2025-07-19T19:56:14.902703+00:00"
}

================================================
FILE: data/benchmarks/ojbench.json
================================================
{
  "benchmark_id": "ojbench",
  "name": "OJBench",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "OJBench is a competition-level code benchmark designed to assess the competitive-level code reasoning abilities of large language models. It comprises 232 programming competition problems from NOI and ICPC, categorized into Easy, Medium, and Hard difficulty levels. The benchmark evaluates models' ability to solve complex competitive programming challenges using Python and C++.",
  "paper_link": "https://arxiv.org/abs/2506.16395",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-05T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/olympiadbench.json
================================================
{
  "benchmark_id": "olympiadbench",
  "name": "OlympiadBench",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning", "physics", "multimodal"],
  "modality": "multimodal",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "A challenging benchmark for promoting AGI with Olympiad-level bilingual multimodal scientific problems. Comprises 8,476 math and physics problems from international and Chinese Olympiads and the Chinese college entrance exam, featuring expert-level annotations for step-by-step reasoning. Includes both text-only and multimodal problems in English and Chinese.",
  "paper_link": "https://arxiv.org/abs/2402.14008",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.821916+00:00",
  "updated_at": "2025-07-19T19:56:14.821916+00:00"
}

================================================
FILE: data/benchmarks/omnibench-music.json
================================================
{
  "benchmark_id": "omnibench-music",
  "name": "OmniBench Music",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "audio"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Music component of OmniBench, a comprehensive benchmark for evaluating omni-language models' ability to recognize, interpret, and reason across visual, acoustic, and textual inputs simultaneously. The music category includes various compositions and performances that require integrated understanding across text, image, and audio modalities.",
  "paper_link": "https://arxiv.org/abs/2409.15272",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.911093+00:00",
  "updated_at": "2025-07-19T19:56:14.911093+00:00"
}

================================================
FILE: data/benchmarks/omnibench.json
================================================
{
  "benchmark_id": "omnibench",
  "name": "OmniBench",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A novel multimodal benchmark designed to evaluate large language models' ability to recognize, interpret, and reason across visual, acoustic, and textual inputs simultaneously. Comprises 1,142 question-answer pairs covering 8 task categories from basic perception to complex inference, with a unique constraint that accurate responses require integrated understanding of all three modalities.",
  "paper_link": "https://arxiv.org/abs/2409.15272",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.906402+00:00",
  "updated_at": "2025-07-19T19:56:14.906402+00:00"
}

================================================
FILE: data/benchmarks/omnimath.json
================================================
{
  "benchmark_id": "omnimath",
  "name": "OmniMath",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A Universal Olympiad Level Mathematic Benchmark for Large Language Models containing 4,428 competition-level problems with rigorous human annotation, categorized into over 33 sub-domains and spanning more than 10 distinct difficulty levels",
  "paper_link": "https://arxiv.org/abs/2410.07985",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.271468+00:00",
  "updated_at": "2025-07-19T19:56:14.271468+00:00"
}

================================================
FILE: data/benchmarks/open-rewrite.json
================================================
{
  "benchmark_id": "open-rewrite",
  "name": "Open-rewrite",
  "parent_benchmark_id": null,
  "categories": ["language", "writing"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "OpenRewriteEval is a benchmark for evaluating open-ended rewriting of long-form texts, covering a wide variety of rewriting types expressed through natural language instructions including formality, expansion, conciseness, paraphrasing, and tone and style transfer.",
  "paper_link": "https://arxiv.org/abs/2305.15685",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.435616+00:00",
  "updated_at": "2025-07-19T19:56:14.435616+00:00"
}

================================================
FILE: data/benchmarks/openai-mmlu.json
================================================
{
  "benchmark_id": "openai-mmlu",
  "name": "OpenAI MMLU",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning", "math", "legal", "healthcare", "finance", "physics", "chemistry", "economics", "psychology"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MMLU (Massive Multitask Language Understanding) is a comprehensive benchmark that measures a text model's multitask accuracy across 57 diverse academic and professional subjects. The test covers elementary mathematics, US history, computer science, law, morality, business ethics, clinical knowledge, and many other domains spanning STEM, humanities, social sciences, and professional fields. To attain high accuracy, models must possess extensive world knowledge and problem-solving ability.",
  "paper_link": "https://arxiv.org/abs/2009.03300",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.043675+00:00",
  "updated_at": "2025-07-19T19:56:14.043675+00:00"
}

================================================
FILE: data/benchmarks/openai-mrcr%3A-2-needle-128k.json
================================================
{
  "benchmark_id": "openai-mrcr:-2-needle-128k",
  "name": "OpenAI-MRCR: 2 needle 128k",
  "parent_benchmark_id": null,
  "categories": ["long_context", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Multi-round Co-reference Resolution (MRCR) benchmark for evaluating an LLM's ability to distinguish between multiple needles hidden in long context. Models are given a long, multi-turn synthetic conversation and must retrieve a specific instance of a repeated request, requiring reasoning and disambiguation skills beyond simple retrieval.",
  "paper_link": "https://arxiv.org/abs/2403.05530",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.266878+00:00",
  "updated_at": "2025-07-19T19:56:15.266878+00:00"
}

================================================
FILE: data/benchmarks/openai-mrcr%3A-2-needle-1m.json
================================================
{
  "benchmark_id": "openai-mrcr:-2-needle-1m",
  "name": "OpenAI-MRCR: 2 needle 1M",
  "parent_benchmark_id": null,
  "categories": ["long_context", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Multi-Round Co-reference Resolution benchmark that tests an LLM's ability to distinguish between multiple similar needles hidden in long conversations. Models must reproduce specific instances of content (e.g., 'Return the 2nd poem about tapirs') from multi-turn synthetic conversations, requiring reasoning about context, ordering, and subtle differences between similar outputs.",
  "paper_link": "https://arxiv.org/abs/2409.12640",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.280285+00:00",
  "updated_at": "2025-07-19T19:56:15.280285+00:00"
}

================================================
FILE: data/benchmarks/openai-mrcr%3A-2-needle-256k.json
================================================
{
  "benchmark_id": "openai-mrcr:-2-needle-256k",
  "name": "OpenAI-MRCR: 2 needle 256k",
  "parent_benchmark_id": null,
  "categories": ["long_context", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Multi-Round Co-reference Resolution (MRCR) benchmark that tests long-context reasoning by evaluating a model's ability to distinguish between similar outputs, reason about ordering, and reproduce specific content from multi-turn conversations containing multiple writing requests on overlapping topics at 256k tokens.",
  "paper_link": "https://arxiv.org/abs/2409.12640",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-24T12:00:00.000000+00:00",
  "updated_at": "2025-07-24T12:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/openbookqa.json
================================================
{
  "benchmark_id": "openbookqa",
  "name": "OpenBookQA",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "OpenBookQA is a question-answering dataset modeled after open book exams for assessing human understanding. It contains 5,957 multiple-choice elementary-level science questions that probe understanding of 1,326 core science facts and their application to novel situations, requiring combination of open book facts with broad common knowledge through multi-hop reasoning.",
  "paper_link": "https://arxiv.org/abs/1809.02789",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.129348+00:00",
  "updated_at": "2025-07-19T19:56:14.129348+00:00"
}

================================================
FILE: data/benchmarks/osworld-extended.json
================================================
{
  "benchmark_id": "osworld-extended",
  "name": "OSWorld Extended",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "OSWorld is a scalable, real computer environment benchmark for evaluating multimodal agents on open-ended tasks across Ubuntu, Windows, and macOS. It comprises 369 computer tasks involving real web and desktop applications, OS file I/O, and multi-application workflows. The benchmark evaluates agents' ability to interact with computer interfaces using screenshots and actions in realistic computing environments.",
  "paper_link": "https://arxiv.org/abs/2404.07972",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.113488+00:00",
  "updated_at": "2025-07-19T19:56:15.113488+00:00"
}

================================================
FILE: data/benchmarks/osworld-screenshot-only.json
================================================
{
  "benchmark_id": "osworld-screenshot-only",
  "name": "OSWorld Screenshot-only",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "vision", "general"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "OSWorld Screenshot-only: A variant of the OSWorld benchmark that evaluates multimodal AI agents using only screenshot observations to complete open-ended computer tasks across real operating systems (Ubuntu, Windows, macOS). Tests agents' ability to perform complex workflows involving web apps, desktop applications, file I/O, and multi-application tasks through visual interface understanding and GUI grounding.",
  "paper_link": "https://arxiv.org/abs/2404.07972",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.109647+00:00",
  "updated_at": "2025-07-19T19:56:15.109647+00:00"
}

================================================
FILE: data/benchmarks/osworld.json
================================================
{
  "benchmark_id": "osworld",
  "name": "OSWorld",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "general", "vision"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "OSWorld: The first-of-its-kind scalable, real computer environment for multimodal agents, supporting task setup, execution-based evaluation, and interactive learning across Ubuntu, Windows, and macOS with 369 computer tasks involving real web and desktop applications, OS file I/O, and multi-application workflows",
  "paper_link": "https://arxiv.org/abs/2404.07972",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.935426+00:00",
  "updated_at": "2025-07-19T19:56:14.935426+00:00"
}

================================================
FILE: data/benchmarks/pathmcqa.json
================================================
{
  "benchmark_id": "pathmcqa",
  "name": "PathMCQA",
  "parent_benchmark_id": null,
  "categories": ["healthcare", "vision", "multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "PathMMU is a massive multimodal expert-level benchmark for understanding and reasoning in pathology, containing 33,428 multimodal multi-choice questions and 24,067 images validated by seven pathologists. It evaluates Large Multimodal Models (LMMs) performance on pathology tasks, with the top-performing model GPT-4V achieving only 49.8% zero-shot performance compared to 71.8% for human pathologists.",
  "paper_link": "https://arxiv.org/abs/2401.16355",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.036453+00:00",
  "updated_at": "2025-07-19T19:56:14.036453+00:00"
}

================================================
FILE: data/benchmarks/perceptiontest.json
================================================
{
  "benchmark_id": "perceptiontest",
  "name": "PerceptionTest",
  "parent_benchmark_id": null,
  "categories": ["video", "multimodal", "reasoning", "physics", "spatial_reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A novel multimodal video benchmark designed to evaluate perception and reasoning skills of pre-trained models across video, audio, and text modalities. Contains 11.6k real-world videos (average 23 seconds) filmed by participants worldwide, densely annotated with six types of labels. Focuses on skills (Memory, Abstraction, Physics, Semantics) and reasoning types (descriptive, explanatory, predictive, counterfactual). Shows significant performance gap between human baseline (91.4%) and state-of-the-art video QA models (46.2%).",
  "paper_link": "https://arxiv.org/abs/2305.13786",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.708910+00:00",
  "updated_at": "2025-07-19T19:56:14.708910+00:00"
}

================================================
FILE: data/benchmarks/phibench.json
================================================
{
  "benchmark_id": "phibench",
  "name": "PhiBench",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "math", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "PhiBench is an internal benchmark designed to evaluate diverse skills and reasoning abilities of language models, covering a wide range of tasks including coding (debugging, extending incomplete code, explaining code snippets) and mathematics (identifying proof errors, generating related problems). Created by Microsoft's research team to address limitations of standard academic benchmarks and guide the development of the Phi-4 model.",
  "paper_link": "https://arxiv.org/abs/2412.08905",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.121593+00:00",
  "updated_at": "2025-07-19T19:56:14.121593+00:00"
}

================================================
FILE: data/benchmarks/physicsfinals.json
================================================
{
  "benchmark_id": "physicsfinals",
  "name": "PhysicsFinals",
  "parent_benchmark_id": null,
  "categories": ["physics", "math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "PHYSICS is a comprehensive benchmark for university-level physics problem solving, containing 1,297 expert-annotated problems covering six core areas: classical mechanics, quantum mechanics, thermodynamics and statistical mechanics, electromagnetism, atomic physics, and optics. Each problem requires advanced physics knowledge and mathematical reasoning. Even advanced models like o3-mini achieve only 59.9% accuracy.",
  "paper_link": "https://arxiv.org/abs/2503.21821",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.981919+00:00",
  "updated_at": "2025-07-19T19:56:13.981919+00:00"
}

================================================
FILE: data/benchmarks/piqa.json
================================================
{
  "benchmark_id": "piqa",
  "name": "PIQA",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "physics", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "PIQA (Physical Interaction: Question Answering) is a benchmark dataset for physical commonsense reasoning in natural language. It tests AI systems' ability to answer questions requiring physical world knowledge through multiple choice questions with everyday situations, focusing on atypical solutions inspired by instructables.com. The dataset contains 21,000 multiple choice questions where models must choose the most appropriate solution for physical interactions.",
  "paper_link": "https://arxiv.org/abs/1911.11641",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.133817+00:00",
  "updated_at": "2025-07-19T19:56:13.133817+00:00"
}

================================================
FILE: data/benchmarks/pointgrounding.json
================================================
{
  "benchmark_id": "pointgrounding",
  "name": "PointGrounding",
  "parent_benchmark_id": null,
  "categories": ["vision", "spatial_reasoning", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "PointArena is a comprehensive platform for evaluating multimodal pointing across diverse reasoning scenarios. It includes Point-Bench, a curated dataset of ~1,000 pointing tasks across five categories: Spatial (positional references), Affordance (functional part identification), Counting (attribute-based grouping), Steerable (relative pointing), and Reasoning (open-ended visual inference). The benchmark evaluates language-guided pointing capabilities in vision-language models.",
  "paper_link": "https://arxiv.org/abs/2505.09990",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.914897+00:00",
  "updated_at": "2025-07-19T19:56:14.914897+00:00"
}

================================================
FILE: data/benchmarks/polymath-en.json
================================================
{
  "benchmark_id": "polymath-en",
  "name": "PolyMath-en",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "PolyMath is a multilingual mathematical reasoning benchmark covering 18 languages and 4 difficulty levels from easy to hard, ensuring difficulty comprehensiveness, language diversity, and high-quality translation. The benchmark evaluates mathematical reasoning capabilities of large language models across diverse linguistic contexts, making it a highly discriminative multilingual mathematical benchmark.",
  "paper_link": "https://arxiv.org/abs/2504.18428",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-05T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/polymath.json
================================================
{
  "benchmark_id": "polymath",
  "name": "PolyMATH",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning", "spatial_reasoning", "multimodal", "vision"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Polymath is a challenging multi-modal mathematical reasoning benchmark designed to evaluate the general cognitive reasoning abilities of Multi-modal Large Language Models (MLLMs). The benchmark comprises 5,000 manually collected high-quality images of cognitive textual and visual challenges across 10 distinct categories, including pattern recognition, spatial reasoning, and relative reasoning.",
  "paper_link": "https://arxiv.org/abs/2410.14702",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-08-03T22:06:11.108063+00:00",
  "updated_at": "2025-08-03T22:06:11.108063+00:00"
}

================================================
FILE: data/benchmarks/pope.json
================================================
{
  "benchmark_id": "pope",
  "name": "POPE",
  "parent_benchmark_id": null,
  "categories": ["vision", "safety", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Polling-based Object Probing Evaluation (POPE) is a benchmark for evaluating object hallucination in Large Vision-Language Models (LVLMs). POPE addresses the problem where LVLMs generate objects inconsistent with target images by using a polling-based query method that asks yes/no questions about object presence in images, providing more stable and flexible evaluation of object hallucination.",
  "paper_link": "https://arxiv.org/abs/2305.10355",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.264312+00:00",
  "updated_at": "2025-07-19T19:56:14.264312+00:00"
}

================================================
FILE: data/benchmarks/popqa.json
================================================
{
  "benchmark_id": "popqa",
  "name": "PopQA",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "PopQA is an entity-centric open-domain question-answering dataset consisting of 14,000 QA pairs designed to evaluate language models' ability to memorize and recall factual knowledge across entities with varying popularity levels. The dataset probes the effectiveness of both parametric memory (stored in model parameters) and non-parametric memory, with questions covering 16 diverse relationship types from Wikidata converted to natural language using templates. It was created by sampling knowledge triples from Wikidata and converting them to natural language questions, with a focus on long-tail entities to understand LMs' strengths and limitations in memorizing factual knowledge.",
  "paper_link": "https://arxiv.org/abs/2212.10511",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.072897+00:00",
  "updated_at": "2025-07-19T19:56:15.072897+00:00"
}

================================================
FILE: data/benchmarks/qasper.json
================================================
{
  "benchmark_id": "qasper",
  "name": "Qasper",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "long_context"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "QASPER is a dataset of 5,049 information-seeking questions and answers anchored in 1,585 NLP research papers. Questions are written by NLP practitioners who read only titles and abstracts, while answers require understanding the full paper text and provide supporting evidence. The dataset challenges models with complex reasoning across document sections for academic document question answering. Each question seeks information present in the full text and is answered by a separate set of NLP practitioners who also provide supporting evidence to answers.",
  "paper_link": "https://arxiv.org/abs/2105.03011",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.166932+00:00",
  "updated_at": "2025-07-19T19:56:14.166932+00:00"
}

================================================
FILE: data/benchmarks/qmsum.json
================================================
{
  "benchmark_id": "qmsum",
  "name": "QMSum",
  "parent_benchmark_id": null,
  "categories": ["summarization", "long_context"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "QMSum is a benchmark for query-based multi-domain meeting summarization consisting of 1,808 query-summary pairs over 232 meetings across academic, product, and committee domains. The dataset enables models to select and summarize relevant spans of meetings in response to specific queries. Published at NAACL 2021, QMSum presents significant challenges in long meeting summarization where models must identify and summarize relevant content based on user queries.",
  "paper_link": "https://arxiv.org/abs/2104.05938",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.223595+00:00",
  "updated_at": "2025-07-19T19:56:14.223595+00:00"
}

================================================
FILE: data/benchmarks/realworldqa.json
================================================
{
  "benchmark_id": "realworldqa",
  "name": "RealWorldQA",
  "parent_benchmark_id": null,
  "categories": ["vision", "spatial_reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "RealWorldQA is a benchmark designed to evaluate basic real-world spatial understanding capabilities of multimodal models. The initial release consists of over 700 anonymized images taken from vehicles and other real-world scenarios, each accompanied by a question and easily verifiable answer. It was released by xAI as part of their Grok-1.5 Vision preview to test models' ability to understand natural scenes and spatial relationships in everyday visual contexts.",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.595271+00:00",
  "updated_at": "2025-07-19T19:56:14.595271+00:00"
}

================================================
FILE: data/benchmarks/repobench.json
================================================
{
  "benchmark_id": "repobench",
  "name": "RepoBench",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "RepoBench is a benchmark for evaluating repository-level code auto-completion systems through three interconnected tasks: RepoBench-R (retrieval of relevant code snippets across files), RepoBench-C (code completion with cross-file and in-file context), and RepoBench-P (pipeline combining retrieval and prediction). It supports Python and Java programming languages and addresses the gap in evaluating real-world, multi-file programming scenarios by providing a more complete comparison of performance in auto-completion systems.",
  "paper_link": "https://arxiv.org/abs/2306.03091",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.152588+00:00",
  "updated_at": "2025-07-19T19:56:15.152588+00:00"
}


================================================
FILE: data/benchmarks/repoqa.json
================================================
{
  "benchmark_id": "repoqa",
  "name": "RepoQA",
  "parent_benchmark_id": null,
  "categories": ["long_context", "reasoning", "code"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "RepoQA is a benchmark for evaluating long-context code understanding capabilities of Large Language Models through the Searching Needle Function (SNF) task, where LLMs must locate specific functions in code repositories using natural language descriptions. The benchmark contains 500 code search tasks spanning 50 repositories across 5 modern programming languages (Python, Java, TypeScript, C++, and Rust), tested on 26 general and code-specific LLMs to assess their ability to comprehend and navigate code repositories.",
  "paper_link": "https://arxiv.org/abs/2406.06025",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.180278+00:00",
  "updated_at": "2025-07-19T19:56:14.180278+00:00"
}


================================================
FILE: data/benchmarks/ruler.json
================================================
{
  "benchmark_id": "ruler",
  "name": "RULER",
  "parent_benchmark_id": null,
  "categories": ["long_context", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "RULER (What's the Real Context Size of Your Long-Context Language Models?) is a synthetic benchmark designed to comprehensively evaluate the long-context capabilities of language models. It expands on needle-in-a-haystack (NIAH) testing by introducing new task categories including multi-hop tracing and aggregation tasks. The benchmark provides flexible configurations for customized sequence length and task complexity, evaluating 17 long-context language models across 13 representative tasks to reveal that despite models claiming 32K+ token context sizes, only half maintain satisfactory performance at 32K length.",
  "paper_link": "https://arxiv.org/abs/2404.06654",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.175181+00:00",
  "updated_at": "2025-07-19T19:56:14.175181+00:00"
}

================================================
FILE: data/benchmarks/sat-math.json
================================================
{
  "benchmark_id": "sat-math",
  "name": "SAT Math",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "SAT Math benchmark from AGIEval containing standardized mathematics questions from the College Board SAT examination, designed to evaluate mathematical reasoning capabilities of foundation models using human-centric assessment methods.",
  "paper_link": "https://arxiv.org/abs/2304.06364",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.414463+00:00",
  "updated_at": "2025-07-19T19:56:15.414463+00:00"
}

================================================
FILE: data/benchmarks/scale-multichallenge.json
================================================
{
  "benchmark_id": "scale-multichallenge",
  "name": "Scale MultiChallenge",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "communication", "general"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "MultiChallenge is a realistic multi-turn conversation evaluation benchmark developed by Scale AI that evaluates large language models on four challenging conversation categories: instruction retention, inference memory of user information, reliable versioned editing, and self-coherence. Each challenge requires accurate instruction-following, context allocation, and in-context reasoning. Despite achieving near-perfect scores on existing multi-turn evaluation benchmarks, all frontier models have less than 50% accuracy on MultiChallenge.",
  "paper_link": "https://arxiv.org/abs/2501.17399",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.205789+00:00",
  "updated_at": "2025-07-19T19:56:15.205789+00:00"
}

================================================
FILE: data/benchmarks/scicode.json
================================================
{
  "benchmark_id": "scicode",
  "name": "SciCode",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "math", "physics", "chemistry", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "SciCode is a research coding benchmark curated by scientists that challenges language models to code solutions for scientific problems. It contains 338 subproblems decomposed from 80 challenging main problems across 16 natural science sub-fields including mathematics, physics, chemistry, biology, and materials science. Problems require knowledge recall, reasoning, and code synthesis skills.",
  "paper_link": "https://arxiv.org/abs/2407.13168",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-28T00:00:00.000000+00:00",
  "updated_at": "2025-07-28T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/scienceqa-visual.json
================================================
{
  "benchmark_id": "scienceqa-visual",
  "name": "ScienceQA Visual",
  "parent_benchmark_id": null,
  "categories": ["vision", "reasoning", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "ScienceQA Visual is a multimodal science question answering benchmark consisting of 21,208 multiple-choice questions from elementary and high school science curricula. The dataset covers 3 subjects (natural science, language science, social science), 26 topics, 127 categories, and 379 skills. 48.7% of questions include image context requiring multimodal reasoning. Questions are annotated with lectures (83.9%) and explanations (90.5%) to support chain-of-thought reasoning for science question answering.",
  "paper_link": "https://arxiv.org/abs/2209.09513",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.300722+00:00",
  "updated_at": "2025-07-19T19:56:14.300722+00:00"
}

================================================
FILE: data/benchmarks/scienceqa.json
================================================
{
  "benchmark_id": "scienceqa",
  "name": "ScienceQA",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "math", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "ScienceQA is the first large-scale multimodal science question answering benchmark with 21,208 multiple-choice questions covering 3 subjects (natural science, language science, social science), 26 topics, 127 categories, and 379 skills. The benchmark includes both text and image modalities, featuring detailed explanations and Chain-of-Thought reasoning to diagnose multi-hop reasoning ability.",
  "paper_link": "https://arxiv.org/abs/2209.09513",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.255251+00:00",
  "updated_at": "2025-07-19T19:56:14.255251+00:00"
}

================================================
FILE: data/benchmarks/screenspot-pro.json
================================================
{
  "benchmark_id": "screenspot-pro",
  "name": "ScreenSpot Pro",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "spatial_reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "ScreenSpot-Pro is a novel GUI grounding benchmark designed to rigorously evaluate the grounding capabilities of multimodal large language models (MLLMs) in professional high-resolution computing environments. The benchmark comprises 1,581 instructions across 23 applications spanning 5 industries and 3 operating systems, featuring authentic high-resolution images from professional domains with expert annotations. Unlike previous benchmarks that focus on cropped screenshots in consumer applications, ScreenSpot-Pro addresses the complexity and diversity of real-world professional software scenarios, revealing significant performance gaps in current MLLM GUI perception capabilities.",
  "paper_link": "https://arxiv.org/abs/2504.07981",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.776671+00:00",
  "updated_at": "2025-07-19T19:56:14.776671+00:00"
}

================================================
FILE: data/benchmarks/screenspot.json
================================================
{
  "benchmark_id": "screenspot",
  "name": "ScreenSpot",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "spatial_reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "ScreenSpot is the first realistic GUI grounding benchmark that encompasses mobile, desktop, and web environments. The dataset comprises over 1,200 instructions from iOS, Android, macOS, Windows and Web environments, along with annotated element types (text and icon/widget), designed to evaluate visual GUI agents' ability to accurately locate screen elements based on natural language instructions.",
  "paper_link": "https://arxiv.org/abs/2401.10935",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.766976+00:00",
  "updated_at": "2025-07-19T19:56:14.766976+00:00"
}

================================================
FILE: data/benchmarks/simpleqa.json
================================================
{
  "benchmark_id": "simpleqa",
  "name": "SimpleQA",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "SimpleQA is a factuality benchmark developed by OpenAI that measures the short-form factual accuracy of large language models. The benchmark contains 4,326 short, fact-seeking questions that are adversarially collected and designed to have single, indisputable answers. Questions cover diverse topics from science and technology to entertainment, and the benchmark also measures model calibration by evaluating whether models know what they know.",
  "paper_link": "https://arxiv.org/abs/2411.04368",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-05T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/slakevqa.json
================================================
{
  "benchmark_id": "slakevqa",
  "name": "SlakeVQA",
  "parent_benchmark_id": null,
  "categories": ["vision", "healthcare", "multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "A semantically-labeled knowledge-enhanced dataset for medical visual question answering. Contains 642 radiology images (CT scans, MRI scans, X-rays) covering five body parts and 14,028 bilingual English-Chinese question-answer pairs annotated by experienced physicians. Features comprehensive semantic labels and a structural medical knowledge base with both vision-only and knowledge-based questions requiring external medical knowledge reasoning.",
  "paper_link": "https://arxiv.org/abs/2102.09542",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.027646+00:00",
  "updated_at": "2025-07-19T19:56:14.027646+00:00"
}

================================================
FILE: data/benchmarks/social-iqa.json
================================================
{
  "benchmark_id": "social-iqa",
  "name": "Social IQa",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "psychology"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "The first large-scale benchmark for commonsense reasoning about social situations. Contains 38,000 multiple choice questions probing emotional and social intelligence in everyday situations, testing commonsense understanding of social interactions and theory of mind reasoning about the implied emotions and behavior of others.",
  "paper_link": "https://arxiv.org/abs/1904.09728",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.155825+00:00",
  "updated_at": "2025-07-19T19:56:13.155825+00:00"
}

================================================
FILE: data/benchmarks/spider.json
================================================
{
  "benchmark_id": "spider",
  "name": "Spider",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A large-scale, complex and cross-domain semantic parsing and text-to-SQL dataset annotated by 11 college students. Contains 10,181 questions and 5,693 unique complex SQL queries on 200 databases with multiple tables, covering 138 different domains. Requires models to generalize to both new SQL queries and new database schemas, making it distinct from previous semantic parsing tasks that use single databases.",
  "paper_link": "https://arxiv.org/abs/1809.08887",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.156791+00:00",
  "updated_at": "2025-07-19T19:56:15.156791+00:00"
}

================================================
FILE: data/benchmarks/squality.json
================================================
{
  "benchmark_id": "squality",
  "name": "SQuALITY",
  "parent_benchmark_id": null,
  "categories": ["summarization", "long_context", "language"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "SQuALITY (Summarization-format QUestion Answering with Long Input Texts, Yes!) is a long-document summarization dataset built by hiring highly-qualified contractors to read public-domain short stories (3000-6000 words) and write original summaries from scratch. Each document has five summaries: one overview and four question-focused summaries. Designed to address limitations in existing summarization datasets by providing high-quality, faithful summaries.",
  "paper_link": "https://arxiv.org/abs/2205.11465",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.712415+00:00",
  "updated_at": "2025-07-19T19:56:12.712415+00:00"
}

================================================
FILE: data/benchmarks/stem.json
================================================
{
  "benchmark_id": "stem",
  "name": "STEM",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A comprehensive multimodal benchmark dataset with 448 skills and 1,073,146 questions spanning all STEM subjects (Science, Technology, Engineering, Mathematics), designed to test neural models' vision-language STEM skills based on K-12 curriculum. Unlike existing datasets that focus on expert-level ability, this dataset includes fundamental skills designed around educational standards.",
  "paper_link": "https://arxiv.org/abs/2402.17205",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.559354+00:00",
  "updated_at": "2025-07-19T19:56:14.559354+00:00"
}

================================================
FILE: data/benchmarks/summscreenfd.json
================================================
{
  "benchmark_id": "summscreenfd",
  "name": "SummScreenFD",
  "parent_benchmark_id": null,
  "categories": ["summarization", "long_context"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "SummScreenFD is the ForeverDreaming subset of the SummScreen dataset for abstractive screenplay summarization, comprising pairs of TV series transcripts and human-written recaps from 88 different shows. The dataset provides a challenging testbed for abstractive summarization where plot details are often expressed indirectly in character dialogues and scattered across the entirety of the transcript, requiring models to find and integrate these details to form succinct plot descriptions.",
  "paper_link": "https://arxiv.org/abs/2104.07091",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.229354+00:00",
  "updated_at": "2025-07-19T19:56:14.229354+00:00"
}

================================================
FILE: data/benchmarks/superglue.json
================================================
{
  "benchmark_id": "superglue",
  "name": "SuperGLUE",
  "parent_benchmark_id": null,
  "categories": ["general", "language", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "SuperGLUE is a new benchmark styled after GLUE with a new set of more difficult language understanding tasks, improved resources, and a new public leaderboard. It includes 8 primary tasks: BoolQ (Boolean Questions), CB (CommitmentBank), COPA (Choice of Plausible Alternatives), MultiRC (Multi-Sentence Reading Comprehension), ReCoRD (Reading Comprehension with Commonsense Reasoning), RTE (Recognizing Textual Entailment), WiC (Word-in-Context), and WSC (Winograd Schema Challenge). The benchmark evaluates diverse language understanding capabilities including reading comprehension, commonsense reasoning, causal reasoning, coreference resolution, textual entailment, and word sense disambiguation across multiple domains.",
  "paper_link": "https://arxiv.org/abs/1905.00537",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.382590+00:00",
  "updated_at": "2025-07-19T19:56:15.382590+00:00"
}

================================================
FILE: data/benchmarks/supergpqa.json
================================================
{
  "benchmark_id": "supergpqa",
  "name": "SuperGPQA",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "general", "math", "legal", "healthcare", "finance", "chemistry", "economics", "physics"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "SuperGPQA is a comprehensive benchmark that evaluates large language models across 285 graduate-level academic disciplines. The benchmark contains 25,957 questions covering 13 broad disciplinary areas including Engineering, Medicine, Science, and Law, with specialized fields in light industry, agriculture, and service-oriented domains. It employs a Human-LLM collaborative filtering mechanism with over 80 expert annotators to create challenging questions that assess graduate-level knowledge and reasoning capabilities.",
  "paper_link": "https://arxiv.org/abs/2502.14739",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-05T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/swe-bench-multilingual.json
================================================
{
  "benchmark_id": "swe-bench-multilingual",
  "name": "SWE-bench Multilingual",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "code"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "A multilingual benchmark for issue resolving in software engineering that covers Java, TypeScript, JavaScript, Go, Rust, C, and C++. Contains 1,632 high-quality instances carefully annotated from 2,456 candidates by 68 expert annotators, designed to evaluate Large Language Models across diverse software ecosystems beyond Python.",
  "paper_link": "https://arxiv.org/abs/2504.02605",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.340903+00:00",
  "updated_at": "2025-07-19T19:56:12.340903+00:00"
}


================================================
FILE: data/benchmarks/swe-bench-verified-(agentic-coding).json
================================================
{
  "benchmark_id": "swe-bench-verified-(agentic-coding)",
  "name": "SWE-bench Verified (Agentic Coding)",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "SWE-bench Verified is a human-filtered subset of 500 software engineering problems drawn from real GitHub issues across 12 popular Python repositories. Given a codebase and an issue description, language models are tasked with generating patches that resolve the described problems. This benchmark evaluates AI's real-world agentic coding skills by requiring models to navigate complex codebases, understand software engineering problems, and coordinate changes across multiple functions, classes, and files to fix well-defined issues with clear descriptions.",
  "paper_link": "https://arxiv.org/abs/2310.06770",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.331440+00:00",
  "updated_at": "2025-07-19T19:56:12.331440+00:00"
}


================================================
FILE: data/benchmarks/swe-bench-verified-(agentless).json
================================================
{
  "benchmark_id": "swe-bench-verified-(agentless)",
  "name": "SWE-bench Verified (Agentless)",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A human-validated subset of SWE-bench that evaluates language models' ability to resolve real-world GitHub issues using an agentless approach. The benchmark tests models on software engineering problems requiring understanding and coordinating changes across multiple functions, classes, and files simultaneously.",
  "paper_link": "https://arxiv.org/abs/2407.01489",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.328122+00:00",
  "updated_at": "2025-07-19T19:56:12.328122+00:00"
}

================================================
FILE: data/benchmarks/swe-bench-verified-(multiple-attempts).json
================================================
{
  "benchmark_id": "swe-bench-verified-(multiple-attempts)",
  "name": "SWE-bench Verified (Multiple Attempts)",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "SWE-bench Verified is a human-validated subset of 500 test samples from the original SWE-bench dataset that evaluates AI systems' ability to automatically resolve real GitHub issues in Python repositories. Given a codebase and issue description, models must edit the code to successfully resolve the problem, requiring understanding and coordination of changes across multiple functions, classes, and files. The Verified version provides more reliable evaluation through manual validation of test samples.",
  "paper_link": "https://arxiv.org/abs/2310.06770",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.336780+00:00",
  "updated_at": "2025-07-19T19:56:12.336780+00:00"
}

================================================
FILE: data/benchmarks/swe-bench-verified.json
================================================
{
  "benchmark_id": "swe-bench-verified",
  "name": "SWE-Bench Verified",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "frontend_development", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A verified subset of 500 software engineering problems from real GitHub issues, validated by human annotators for evaluating language models' ability to resolve real-world coding issues by generating patches for Python codebases.",
  "paper_link": "https://arxiv.org/abs/2310.06770",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.812805+00:00",
  "updated_at": "2025-07-19T19:56:13.812805+00:00"
}


================================================
FILE: data/benchmarks/swe-dev.json
================================================
{
  "benchmark_id": "swe-dev",
  "name": "SWE-Dev",
  "parent_benchmark_id": null,
  "categories": ["frontend_development"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "SWE-bench development split consisting of 225 software engineering problems drawn from real GitHub issues across 12 popular Python repositories. Language models are given a codebase along with a description of an issue to be resolved and must edit the codebase to address the issue, often requiring understanding and coordinating changes across multiple functions, classes, and files.",
  "paper_link": "https://arxiv.org/abs/2310.06770",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-15T00:00:00.000000+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/swe-lancer-(ic-diamond-subset).json
================================================
{
  "benchmark_id": "swe-lancer-(ic-diamond-subset)",
  "name": "SWE-Lancer (IC-Diamond subset)",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "SWE-Lancer (IC-Diamond subset) is a benchmark of real-world freelance software engineering tasks from Upwork, ranging from $50 bug fixes to $32,000 feature implementations. It evaluates AI models on independent engineering tasks using end-to-end tests triple-verified by experienced software engineers, and includes managerial tasks where models choose between technical implementation proposals.",
  "paper_link": "https://arxiv.org/abs/2502.12115",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.359574+00:00",
  "updated_at": "2025-07-19T19:56:15.359574+00:00"
}


================================================
FILE: data/benchmarks/swe-lancer.json
================================================
{
  "benchmark_id": "swe-lancer",
  "name": "SWE-Lancer",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A benchmark for evaluating large language models on real-world freelance software engineering tasks from Upwork. Contains over 1,400 tasks valued at $1 million USD total, ranging from $50 bug fixes to $32,000 feature implementations. Includes both independent engineering tasks graded via end-to-end tests and managerial tasks assessed against original engineering managers' choices.",
  "paper_link": "https://arxiv.org/abs/2502.12115",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.352660+00:00",
  "updated_at": "2025-07-19T19:56:15.352660+00:00"
}


================================================
FILE: data/benchmarks/tau-bench-airline.json
================================================
{
  "benchmark_id": "tau-bench-airline",
  "name": "TAU-bench Airline",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "communication"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Part of τ-bench (TAU-bench), a benchmark for Tool-Agent-User interaction in real-world domains. The airline domain evaluates language agents' ability to interact with users through dynamic conversations while following domain-specific rules and using API tools. Agents must handle airline-related tasks and policies reliably.",
  "paper_link": "https://arxiv.org/abs/2406.12045",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.993213+00:00",
  "updated_at": "2025-07-19T19:56:14.993213+00:00"
}

================================================
FILE: data/benchmarks/tau-bench-retail.json
================================================
{
  "benchmark_id": "tau-bench-retail",
  "name": "TAU-bench Retail",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "communication"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A benchmark for evaluating tool-agent-user interaction in retail environments. Tests language agents' ability to handle dynamic conversations with users while using domain-specific API tools and following policy guidelines. Evaluates agents on tasks like order cancellations, address changes, and order status checks through multi-turn conversations.",
  "paper_link": "https://arxiv.org/abs/2406.12045",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.965635+00:00",
  "updated_at": "2025-07-19T19:56:14.965635+00:00"
}

================================================
FILE: data/benchmarks/tau-bench.json
================================================
{
  "benchmark_id": "tau-bench",
  "name": "Tau-bench",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "τ-bench: A benchmark for tool-agent-user interaction in real-world domains. Tests language agents' ability to interact with users and follow domain-specific rules through dynamic conversations using API tools and policy guidelines across retail and airline domains. Evaluates consistency and reliability of agent behavior over multiple trials.",
  "paper_link": "https://arxiv.org/abs/2406.12045",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.219001+00:00",
  "updated_at": "2025-07-19T19:56:15.219001+00:00"
}

================================================
FILE: data/benchmarks/tau2-airline.json
================================================
{
  "benchmark_id": "tau2-airline",
  "name": "Tau2 Airline",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "communication"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "TAU2 airline domain benchmark for evaluating conversational agents in dual-control environments where both AI agents and users interact with tools in airline customer service scenarios. Tests agent coordination, communication, and ability to guide user actions in tasks like flight booking, modifications, cancellations, and refunds.",
  "paper_link": "https://arxiv.org/abs/2506.07982",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-05T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/tau2-retail.json
================================================
{
  "benchmark_id": "tau2-retail",
  "name": "Tau2 Retail",
  "parent_benchmark_id": null,
  "categories": ["communication", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "τ²-bench retail domain evaluates conversational AI agents in customer service scenarios within a dual-control environment where both agent and user can interact with tools. Tests tool-agent-user interaction, rule adherence, and task consistency in retail customer support contexts.",
  "paper_link": "https://arxiv.org/abs/2506.07982",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-05T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/tau2-telecom.json
================================================
{
  "benchmark_id": "tau2-telecom",
  "name": "Tau2 Telecom",
  "parent_benchmark_id": null,
  "categories": ["communication", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "τ²-Bench telecom domain evaluates conversational agents in a dual-control environment modeled as a Dec-POMDP, where both agent and user use tools in shared telecommunications troubleshooting scenarios that test coordination and communication capabilities.",
  "paper_link": "https://arxiv.org/abs/2506.07982",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-05T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/tempcompass.json
================================================
{
  "benchmark_id": "tempcompass",
  "name": "TempCompass",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "TempCompass is a comprehensive benchmark for evaluating temporal perception capabilities of Video Large Language Models (Video LLMs). It constructs conflicting videos that share identical static content but differ in specific temporal aspects to prevent models from exploiting single-frame bias. The benchmark evaluates multiple temporal aspects including action, motion, speed, temporal order, and attribute changes across diverse task formats including multi-choice QA, yes/no QA, caption matching, and caption generation.",
  "paper_link": "https://arxiv.org/abs/2403.00476",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.748364+00:00",
  "updated_at": "2025-07-19T19:56:14.748364+00:00"
}

================================================
FILE: data/benchmarks/terminal-bench.json
================================================
{
  "benchmark_id": "terminal-bench",
  "name": "Terminal-Bench",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Terminal-Bench is a benchmark for testing AI agents in real terminal environments. It evaluates how well agents can handle real-world, end-to-end tasks autonomously, including compiling code, training models, setting up servers, system administration, security tasks, data science workflows, and cybersecurity vulnerabilities. The benchmark consists of a dataset of ~100 hand-crafted, human-verified tasks and an execution harness that connects language models to a terminal sandbox.",
  "paper_link": null,
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-28T00:00:00.000000+00:00",
  "updated_at": "2025-07-28T00:00:00.000000+00:00"
}


================================================
FILE: data/benchmarks/terminus.json
================================================
{
  "benchmark_id": "terminus",
  "name": "Terminus",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "code"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Terminal-Bench is a benchmark for testing AI agents in real terminal environments, evaluating how well agents can handle real-world, end-to-end tasks autonomously. The benchmark includes tasks spanning coding, system administration, security, data science, model training, file operations, version control, and web development. Terminus is the neutral test-bed agent designed to work with Terminal-Bench, operating purely through tmux sessions without dedicated tools.",
  "paper_link": "https://github.com/laude-institute/terminal-bench",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.355994+00:00",
  "updated_at": "2025-07-19T19:56:12.355994+00:00"
}


================================================
FILE: data/benchmarks/textvqa.json
================================================
{
  "benchmark_id": "textvqa",
  "name": "TextVQA",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "image-to-text"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "TextVQA contains 45,336 questions on 28,408 images that require reasoning about text to answer. Introduced to benchmark VQA models' ability to read and reason about text within images, particularly for assistive technologies for visually impaired users. The dataset addresses the gap where existing VQA datasets had few text-based questions or were too small.",
  "paper_link": "https://arxiv.org/abs/1904.08920",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.875287+00:00",
  "updated_at": "2025-07-19T19:56:12.875287+00:00"
}

================================================
FILE: data/benchmarks/theoremqa.json
================================================
{
  "benchmark_id": "theoremqa",
  "name": "TheoremQA",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning", "physics", "finance"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A theorem-driven question answering dataset containing 800 high-quality questions covering 350+ theorems from Math, Physics, EE&CS, and Finance. Designed to evaluate AI models' capabilities to apply theorems to solve challenging university-level science problems.",
  "paper_link": "https://arxiv.org/abs/2305.12524",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.479157+00:00",
  "updated_at": "2025-07-19T19:56:14.479157+00:00"
}

================================================
FILE: data/benchmarks/tldr9+-(test).json
================================================
{
  "benchmark_id": "tldr9+-(test)",
  "name": "TLDR9+ (test)",
  "parent_benchmark_id": null,
  "categories": ["summarization", "language"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A large-scale summarization dataset containing over 9 million training instances extracted from Reddit, designed for extreme summarization (generating one-sentence summaries with high compression and abstraction). More than twice larger than previously proposed datasets.",
  "paper_link": "https://arxiv.org/abs/2110.01159",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.439927+00:00",
  "updated_at": "2025-07-19T19:56:14.439927+00:00"
}

================================================
FILE: data/benchmarks/translation-en-to-set1-comet22.json
================================================
{
  "benchmark_id": "translation-en\u2192set1-comet22",
  "name": "Translation en\u2192Set1 COMET22",
  "parent_benchmark_id": null,
  "categories": ["language"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "COMET-22 is an ensemble machine translation evaluation metric combining a COMET estimator model trained with Direct Assessments and a multitask model that predicts sentence-level scores and word-level OK/BAD tags. It demonstrates improved correlations compared to state-of-the-art metrics and increased robustness to critical errors.",
  "paper_link": "https://aclanthology.org/2022.wmt-1.52/",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.959436+00:00",
  "updated_at": "2025-07-19T19:56:12.959436+00:00"
}

================================================
FILE: data/benchmarks/translation-en-to-set1-spbleu.json
================================================
{
  "benchmark_id": "translation-en\u2192set1-spbleu",
  "name": "Translation en\u2192Set1 spBleu",
  "parent_benchmark_id": null,
  "categories": ["language"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "Translation evaluation using spBLEU (SentencePiece BLEU), a BLEU metric computed over text tokenized with a language-agnostic SentencePiece subword model. Introduced in the FLORES-101 evaluation benchmark for low-resource and multilingual machine translation.",
  "paper_link": "https://arxiv.org/abs/2106.03193",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.936891+00:00",
  "updated_at": "2025-07-19T19:56:12.936891+00:00"
}

================================================
FILE: data/benchmarks/translation-set1-to-en-comet22.json
================================================
{
  "benchmark_id": "translation-set1\u2192en-comet22",
  "name": "Translation Set1\u2192en COMET22",
  "parent_benchmark_id": null,
  "categories": ["language"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "COMET-22 is a neural machine translation evaluation metric that uses an ensemble of two models: a COMET estimator trained with Direct Assessments and a multitask model that predicts sentence-level scores and word-level OK/BAD tags. It provides improved correlations with human judgments and increased robustness to critical errors compared to previous metrics.",
  "paper_link": "https://aclanthology.org/2022.wmt-1.52/",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.974744+00:00",
  "updated_at": "2025-07-19T19:56:12.974744+00:00"
}

================================================
FILE: data/benchmarks/translation-set1-to-en-spbleu.json
================================================
{
  "benchmark_id": "translation-set1\u2192en-spbleu",
  "name": "Translation Set1\u2192en spBleu",
  "parent_benchmark_id": null,
  "categories": ["language"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "spBLEU (SentencePiece BLEU) evaluation metric for machine translation quality assessment, using language-agnostic SentencePiece tokenization with BLEU scoring. Part of the FLORES-101 evaluation benchmark for low-resource and multilingual machine translation.",
  "paper_link": "https://arxiv.org/abs/2106.03193",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.967240+00:00",
  "updated_at": "2025-07-19T19:56:12.967240+00:00"
}

================================================
FILE: data/benchmarks/triviaqa.json
================================================
{
  "benchmark_id": "triviaqa",
  "name": "TriviaQA",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A large-scale reading comprehension dataset containing over 650K question-answer-evidence triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts and independently gathered evidence documents (six per question on average) that provide high quality distant supervision for answering the questions. The dataset features relatively complex, compositional questions with considerable syntactic and lexical variability, requiring cross-sentence reasoning to find answers.",
  "paper_link": "https://arxiv.org/abs/1705.03551",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:11.563587+00:00",
  "updated_at": "2025-07-19T19:56:11.563587+00:00"
}

================================================
FILE: data/benchmarks/truthfulqa.json
================================================
{
  "benchmark_id": "truthfulqa",
  "name": "TruthfulQA",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning", "legal", "healthcare", "finance"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "TruthfulQA is a benchmark to measure whether language models are truthful in generating answers to questions. It comprises 817 questions that span 38 categories, including health, law, finance and politics. The questions are crafted such that some humans would answer falsely due to a false belief or misconception, testing models' ability to avoid generating false answers learned from human texts.",
  "paper_link": "https://arxiv.org/abs/2109.07958",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:11.339268+00:00",
  "updated_at": "2025-07-19T19:56:11.339268+00:00"
}

================================================
FILE: data/benchmarks/tydiqa.json
================================================
{
  "benchmark_id": "tydiqa",
  "name": "TydiQA",
  "parent_benchmark_id": null,
  "categories": ["language", "reasoning"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "A multilingual question answering benchmark covering 11 typologically diverse languages with 204K question-answer pairs. Questions are written by people seeking genuine information and data is collected directly in each language without translation to test model generalization across diverse linguistic structures.",
  "paper_link": "https://arxiv.org/abs/2003.05002",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.470500+00:00",
  "updated_at": "2025-07-19T19:56:14.470500+00:00"
}

================================================
FILE: data/benchmarks/uniform-bar-exam.json
================================================
{
  "benchmark_id": "uniform-bar-exam",
  "name": "Uniform Bar Exam",
  "parent_benchmark_id": null,
  "categories": ["legal", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "The Uniform Bar Examination (UBE) benchmark evaluates language models on the complete bar exam including multiple-choice Multistate Bar Examination (MBE), open-ended Multistate Essay Exam (MEE), and Multistate Performance Test (MPT) components. Used to assess legal reasoning capabilities across seven subject areas including Evidence, Torts, Constitutional Law, Contracts, Criminal Law and Procedure, Real Property, and Civil Procedure.",
  "paper_link": "https://royalsocietypublishing.org/doi/10.1098/rsta.2023.0254",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.404860+00:00",
  "updated_at": "2025-07-19T19:56:15.404860+00:00"
}

================================================
FILE: data/benchmarks/usamo25.json
================================================
{
  "benchmark_id": "usamo25",
  "name": "USAMO25",
  "parent_benchmark_id": null,
  "categories": ["math", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "The 2025 United States of America Mathematical Olympiad (USAMO) benchmark consists of six challenging mathematical problems requiring rigorous proof-based reasoning. USAMO is the most prestigious high school mathematics competition in the United States, serving as the final round of the American Mathematics Competitions series. This benchmark evaluates models on mathematical problem-solving capabilities beyond simple numerical computation, focusing on formal mathematical reasoning and proof generation.",
  "paper_link": "https://arxiv.org/abs/2503.21934",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.067604+00:00",
  "updated_at": "2025-07-19T19:56:15.067604+00:00"
}

================================================
FILE: data/benchmarks/vatex.json
================================================
{
  "benchmark_id": "vatex",
  "name": "VATEX",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "video", "language"],
  "modality": "multimodal",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "VaTeX: A Large-Scale, High-Quality Multilingual Dataset for Video-and-Language Research. Contains over 41,250 videos and 825,000 captions in both English and Chinese, with over 206,000 English-Chinese parallel translation pairs. Supports multilingual video captioning and video-guided machine translation tasks.",
  "paper_link": "https://arxiv.org/abs/1904.03493",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.909879+00:00",
  "updated_at": "2025-07-19T19:56:12.909879+00:00"
}

================================================
FILE: data/benchmarks/vcr-en-easy.json
================================================
{
  "benchmark_id": "vcr-en-easy",
  "name": "VCR_en_easy",
  "parent_benchmark_id": null,
  "categories": ["vision", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Visual Commonsense Reasoning (VCR) benchmark that tests higher-order cognition and commonsense reasoning beyond simple object recognition. Models must answer challenging questions about images and provide rationales justifying their answers. The benchmark measures the ability to infer people's actions, goals, and mental states from visual context.",
  "paper_link": "https://arxiv.org/abs/1811.10830",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.592175+00:00",
  "updated_at": "2025-07-19T19:56:14.592175+00:00"
}

================================================
FILE: data/benchmarks/vibe-eval.json
================================================
{
  "benchmark_id": "vibe-eval",
  "name": "Vibe-Eval",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "vision", "general"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "VIBE-Eval is a hard evaluation suite for measuring progress of multimodal language models, consisting of 269 visual understanding prompts with gold-standard responses authored by experts. The benchmark has dual objectives: vibe checking multimodal chat models for day-to-day tasks and rigorously testing frontier models, with the hard set containing >50% questions that all frontier models answer incorrectly.",
  "paper_link": "https://arxiv.org/abs/2405.02287",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.871369+00:00",
  "updated_at": "2025-07-19T19:56:13.871369+00:00"
}

================================================
FILE: data/benchmarks/video-mme-(long,-no-subtitles).json
================================================
{
  "benchmark_id": "video-mme-(long,-no-subtitles)",
  "name": "Video-MME (long, no subtitles)",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "video"],
  "modality": "multimodal",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "Video-MME is the first-ever comprehensive evaluation benchmark for Multi-modal Large Language Models (MLLMs) in video analysis. This variant focuses on long-term videos (30min-60min) without subtitle inputs, testing robust contextual dynamics across 6 primary visual domains with 30 subfields including knowledge, film & television, sports competition, life record, and multilingual content.",
  "paper_link": "https://arxiv.org/abs/2405.21075",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.374053+00:00",
  "updated_at": "2025-07-19T19:56:15.374053+00:00"
}

================================================
FILE: data/benchmarks/video-mme.json
================================================
{
  "benchmark_id": "video-mme",
  "name": "Video-MME",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "vision", "reasoning"],
  "modality": "multimodal",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "Video-MME is the first-ever comprehensive evaluation benchmark of Multi-modal Large Language Models (MLLMs) in video analysis. It features 900 videos totaling 254 hours with 2,700 human-annotated question-answer pairs across 6 primary visual domains (Knowledge, Film & Television, Sports Competition, Life Record, Multilingual, and others) and 30 subfields. The benchmark evaluates models across diverse temporal dimensions (11 seconds to 1 hour), integrates multi-modal inputs including video frames, subtitles, and audio, and uses rigorous manual labeling by expert annotators for precise assessment.",
  "paper_link": "https://arxiv.org/abs/2405.21075",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.901883+00:00",
  "updated_at": "2025-07-19T19:56:13.901883+00:00"
}

================================================
FILE: data/benchmarks/video-mmew-sub.json
================================================
{
  "benchmark_id": "video-mmew-sub",
  "name": "Video-MMEw sub",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "reasoning", "vision"],
  "modality": "multimodal",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "Video-MME is the first comprehensive evaluation benchmark for multi-modal large language models in video analysis. It consists of 900 videos (254 hours total) across 6 domains and 30 sub-categories, with 2,700 high-quality multiple-choice questions. The benchmark evaluates MLLMs on diverse video types of varying durations (11 seconds to 1 hour) with multi-modal inputs including video frames, subtitles, and audio to assess perception, reasoning, and temporal understanding capabilities.",
  "paper_link": "https://arxiv.org/abs/2405.21075",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-08-03T22:06:11.276310+00:00",
  "updated_at": "2025-08-03T22:06:11.276310+00:00"
}

================================================
FILE: data/benchmarks/videomme-w-o-sub..json
================================================
{
  "benchmark_id": "videomme-w-o-sub.",
  "name": "VideoMME w/o sub.",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "video", "vision"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Video-MME is a comprehensive evaluation benchmark for multi-modal large language models in video analysis. It features 900 videos across 6 primary visual domains with 30 subfields, ranging from 11 seconds to 1 hour in duration, with 2,700 question-answer pairs. The benchmark evaluates MLLMs' capabilities in processing sequential visual data and multi-modal content including video frames, subtitles, and audio.",
  "paper_link": "https://arxiv.org/abs/2405.21075",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.715184+00:00",
  "updated_at": "2025-07-19T19:56:14.715184+00:00"
}

================================================
FILE: data/benchmarks/videomme-w-sub..json
================================================
{
  "benchmark_id": "videomme-w-sub.",
  "name": "VideoMME w sub.",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "video"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "The first-ever comprehensive evaluation benchmark of Multi-modal LLMs in Video analysis. Features 900 videos (254 hours) with 2,700 question-answer pairs covering 6 primary visual domains and 30 subfields. Evaluates temporal understanding across short (11 seconds) to long (1 hour) videos with multi-modal inputs including video frames, subtitles, and audio.",
  "paper_link": "https://arxiv.org/abs/2405.21075",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.723259+00:00",
  "updated_at": "2025-07-19T19:56:14.723259+00:00"
}

================================================
FILE: data/benchmarks/videommmu.json
================================================
{
  "benchmark_id": "videommmu",
  "name": "VideoMMMU",
  "parent_benchmark_id": null,
  "categories": ["multimodal", "vision", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "Video-MMMU evaluates Large Multimodal Models' ability to acquire knowledge from expert-level professional videos across six disciplines through three cognitive stages: perception, comprehension, and adaptation. Contains 300 videos and 900 human-annotated questions spanning Art, Business, Science, Medicine, Humanities, and Engineering.",
  "paper_link": "https://arxiv.org/abs/2501.13826",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.007381+00:00",
  "updated_at": "2025-07-19T19:56:14.007381+00:00"
}

================================================
FILE: data/benchmarks/visualwebbench.json
================================================
{
  "benchmark_id": "visualwebbench",
  "name": "VisualWebBench",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "frontend_development"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A multimodal benchmark designed to assess the capabilities of multimodal large language models (MLLMs) across web page understanding and grounding tasks. Comprises 7 tasks (captioning, webpage QA, heading OCR, element OCR, element grounding, action prediction, and action grounding) with 1.5K human-curated instances from 139 real websites across 87 sub-domains.",
  "paper_link": "https://arxiv.org/abs/2404.05955",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:12.747583+00:00",
  "updated_at": "2025-07-19T19:56:12.747583+00:00"
}

================================================
FILE: data/benchmarks/vocalsound.json
================================================
{
  "benchmark_id": "vocalsound",
  "name": "VocalSound",
  "parent_benchmark_id": null,
  "categories": ["audio"],
  "modality": "audio",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "A dataset for improving human vocal sounds recognition, containing over 21,000 crowdsourced recordings of laughter, sighs, coughs, throat clearing, sneezes, and sniffs from 3,365 unique subjects. Used for audio event classification and recognition of human non-speech vocalizations.",
  "paper_link": "https://arxiv.org/abs/2205.03433",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.919198+00:00",
  "updated_at": "2025-07-19T19:56:14.919198+00:00"
}

================================================
FILE: data/benchmarks/voicebench-avg.json
================================================
{
  "benchmark_id": "voicebench-avg",
  "name": "VoiceBench Avg",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning", "safety", "communication"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "VoiceBench is the first benchmark designed to provide a multi-faceted evaluation of LLM-based voice assistants, evaluating capabilities including general knowledge, instruction-following, reasoning, and safety using both synthetic and real spoken instruction data with diverse speaker characteristics and environmental conditions.",
  "paper_link": "https://arxiv.org/abs/2410.17196",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.922519+00:00",
  "updated_at": "2025-07-19T19:56:14.922519+00:00"
}

================================================
FILE: data/benchmarks/vqa-rad.json
================================================
{
  "benchmark_id": "vqa-rad",
  "name": "VQA-Rad",
  "parent_benchmark_id": null,
  "categories": ["vision", "healthcare", "multimodal"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "VQA-RAD (Visual Question Answering in Radiology) is the first manually constructed dataset of medical visual question answering containing 3,515 clinically generated visual questions and answers about radiology images. The dataset includes questions created by clinical trainees on 315 radiology images from MedPix covering head, chest, and abdominal scans, designed to support AI development for medical image analysis and improve patient care.",
  "paper_link": "https://doi.org/10.1038/sdata.2018.251",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.031802+00:00",
  "updated_at": "2025-07-19T19:56:14.031802+00:00"
}

================================================
FILE: data/benchmarks/vqav2-(test).json
================================================
{
  "benchmark_id": "vqav2-(test)",
  "name": "VQAv2 (test)",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "VQA v2.0 (Visual Question Answering v2.0) is a balanced dataset designed to counter language priors in visual question answering. It consists of complementary image pairs where the same question yields different answers, forcing models to rely on visual understanding rather than language bias. The dataset contains 1,105,904 questions across 204,721 COCO images, requiring understanding of vision, language, and commonsense knowledge.",
  "paper_link": "https://arxiv.org/abs/1612.00837",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.430940+00:00",
  "updated_at": "2025-07-19T19:56:14.430940+00:00"
}

================================================
FILE: data/benchmarks/vqav2-(val).json
================================================
{
  "benchmark_id": "vqav2-(val)",
  "name": "VQAv2 (val)",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "language", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "VQAv2 is a balanced Visual Question Answering dataset containing open-ended questions about images that require understanding of vision, language, and commonsense knowledge to answer. VQAv2 addresses bias issues from the original VQA dataset by collecting complementary images such that every question is associated with similar images that result in different answers, forcing models to actually understand visual content rather than relying on language priors.",
  "paper_link": "https://arxiv.org/abs/1612.00837",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.647852+00:00",
  "updated_at": "2025-07-19T19:56:13.647852+00:00"
}

================================================
FILE: data/benchmarks/vqav2.json
================================================
{
  "benchmark_id": "vqav2",
  "name": "VQAv2",
  "parent_benchmark_id": null,
  "categories": ["vision", "multimodal", "reasoning"],
  "modality": "multimodal",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "VQAv2 is a balanced Visual Question Answering dataset that addresses language bias by providing complementary images for each question, forcing models to rely on visual understanding rather than language priors. It contains approximately twice the number of image-question pairs compared to the original VQA dataset.",
  "paper_link": "https://arxiv.org/abs/1612.00837",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:14.410411+00:00",
  "updated_at": "2025-07-19T19:56:14.410411+00:00"
}

================================================
FILE: data/benchmarks/wild-bench.json
================================================
{
  "benchmark_id": "wild-bench",
  "name": "Wild Bench",
  "parent_benchmark_id": null,
  "categories": ["general", "reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "WildBench is an automated evaluation framework that benchmarks large language models using 1,024 challenging, real-world tasks selected from over one million human-chatbot conversation logs. It introduces two evaluation metrics (WB-Reward and WB-Score) that achieve high correlation with human preferences and uses task-specific checklists for systematic evaluation.",
  "paper_link": "https://arxiv.org/abs/2406.04770",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.122112+00:00",
  "updated_at": "2025-07-19T19:56:15.122112+00:00"
}

================================================
FILE: data/benchmarks/winogrande.json
================================================
{
  "benchmark_id": "winogrande",
  "name": "Winogrande",
  "parent_benchmark_id": null,
  "categories": ["reasoning", "language"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "WinoGrande: An Adversarial Winograd Schema Challenge at Scale. A large-scale dataset of 44,000 pronoun resolution problems designed to test machine commonsense reasoning. Uses adversarial filtering to reduce spurious biases and provides a more robust evaluation of whether AI systems truly understand commonsense or exploit statistical shortcuts. Current best AI methods achieve 59.4-79.1% accuracy, significantly below human performance of 94.0%.",
  "paper_link": "https://arxiv.org/abs/1907.10641",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:11.370408+00:00",
  "updated_at": "2025-07-19T19:56:11.370408+00:00"
}

================================================
FILE: data/benchmarks/wmt23.json
================================================
{
  "benchmark_id": "wmt23",
  "name": "WMT23",
  "parent_benchmark_id": null,
  "categories": ["language"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "The Eighth Conference on Machine Translation (WMT23) benchmark evaluating machine translation systems across 8 language pairs (14 translation directions) including general, biomedical, literary, and low-resource language translation tasks. Features specialized shared tasks for quality estimation, metrics evaluation, sign language translation, and discourse-level literary translation with professional human assessment.",
  "paper_link": "https://aclanthology.org/2023.wmt-1.1/",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.934606+00:00",
  "updated_at": "2025-07-19T19:56:13.934606+00:00"
}

================================================
FILE: data/benchmarks/wmt24++.json
================================================
{
  "benchmark_id": "wmt24++",
  "name": "WMT24++",
  "parent_benchmark_id": null,
  "categories": ["language"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "WMT24++ is a comprehensive multilingual machine translation benchmark that expands the WMT24 dataset to cover 55 languages and dialects. It includes human-written references and post-edits across four domains (literary, news, social, and speech) to evaluate machine translation systems and large language models across diverse linguistic contexts.",
  "paper_link": "https://arxiv.org/abs/2502.12404",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.576712+00:00",
  "updated_at": "2025-07-19T19:56:13.576712+00:00"
}

================================================
FILE: data/benchmarks/writingbench.json
================================================
{
  "benchmark_id": "writingbench",
  "name": "WritingBench",
  "parent_benchmark_id": null,
  "categories": ["writing", "creativity", "communication"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "A comprehensive benchmark for evaluating large language models' generative writing capabilities across 6 core writing domains (Academic & Engineering, Finance & Business, Politics & Law, Literature & Art, Education, Advertising & Marketing) and 100 subdomains. Contains 1,239 queries with a query-dependent evaluation framework that dynamically generates 5 instance-specific assessment criteria for each writing task, using a fine-tuned critic model to score responses on style, format, and length dimensions.",
  "paper_link": "https://arxiv.org/abs/2503.05244",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-08-03T22:06:11.074130+00:00",
  "updated_at": "2025-08-03T22:06:11.074130+00:00"
}

================================================
FILE: data/benchmarks/xlsum-english.json
================================================
{
  "benchmark_id": "xlsum-english",
  "name": "XLSum English",
  "parent_benchmark_id": null,
  "categories": ["summarization", "language"],
  "modality": "text",
  "multilingual": true,
  "max_score": 1.0,
  "language": "en",
  "description": "Large-scale multilingual abstractive summarization dataset comprising 1 million professionally annotated article-summary pairs from BBC, covering 44 languages. XL-Sum is highly abstractive, concise, and of high quality, designed to encourage research on multilingual abstractive summarization tasks.",
  "paper_link": "https://arxiv.org/abs/2106.13822",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:15.092213+00:00",
  "updated_at": "2025-07-19T19:56:15.092213+00:00"
}

================================================
FILE: data/benchmarks/xstest.json
================================================
{
  "benchmark_id": "xstest",
  "name": "XSTest",
  "parent_benchmark_id": null,
  "categories": ["safety"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "XSTest is a test suite designed to identify exaggerated safety behaviours in large language models. It comprises 450 prompts: 250 safe prompts across ten prompt types that well-calibrated models should not refuse to comply with, and 200 unsafe prompts as contrasts that models should refuse. The benchmark systematically evaluates whether models refuse to respond to clearly safe prompts due to overly cautious safety mechanisms.",
  "paper_link": "https://arxiv.org/abs/2308.01263",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-07-19T19:56:13.998594+00:00",
  "updated_at": "2025-07-19T19:56:13.998594+00:00"
}

================================================
FILE: data/benchmarks/zebralogic.json
================================================
{
  "benchmark_id": "zebralogic",
  "name": "ZebraLogic",
  "parent_benchmark_id": null,
  "categories": ["reasoning"],
  "modality": "text",
  "multilingual": false,
  "max_score": 1.0,
  "language": "en",
  "description": "ZebraLogic is an evaluation framework for assessing large language models' logical reasoning capabilities through logic grid puzzles derived from constraint satisfaction problems (CSPs). The benchmark consists of 1,000 programmatically generated puzzles with controllable and quantifiable complexity, revealing a 'curse of complexity' where model accuracy declines significantly as problem complexity grows.",
  "paper_link": "https://arxiv.org/abs/2502.01100",
  "implementation_link": null,
  "verified": false,
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-05T00:00:00.000000+00:00"
}


================================================
FILE: data/licenses/apache_2_0.json
================================================
{
  "license_id": "apache_2_0",
  "name": "Apache 2.0",
  "allow_commercial": true,
  "description": "Apache License 2.0 - allows commercial use",
  "created_at": "2025-07-19T19:49:05.605369+00:00",
  "updated_at": "2025-07-19T19:49:05.605369+00:00"
}

================================================
FILE: data/licenses/cc_by_nc.json
================================================
{
  "license_id": "cc_by_nc",
  "name": "CC BY-NC",
  "allow_commercial": false,
  "description": "Creative Commons Non-Commercial",
  "created_at": "2025-07-19T19:49:05.408956+00:00",
  "updated_at": "2025-07-19T19:49:05.408956+00:00"
}

================================================
FILE: data/licenses/creative_commons_attribution_4_0_license.json
================================================
{
  "license_id": "creative_commons_attribution_4_0_license",
  "name": "Creative Commons Attribution 4.0 License",
  "allow_commercial": false,
  "description": "Creative Commons Attribution 4.0 License",
  "created_at": "2025-07-19T19:49:05.471773+00:00",
  "updated_at": "2025-07-19T19:49:05.471773+00:00"
}

================================================
FILE: data/licenses/deepseek.json
================================================
{
  "license_id": "deepseek",
  "name": "deepseek",
  "allow_commercial": false,
  "description": "deepseek license",
  "created_at": "2025-07-19T19:49:05.656652+00:00",
  "updated_at": "2025-07-19T19:49:05.656652+00:00"
}

================================================
FILE: data/licenses/gemma.json
================================================
{
  "license_id": "gemma",
  "name": "Gemma",
  "allow_commercial": true,
  "description": "Google Gemma Terms of Use",
  "created_at": "2025-07-19T19:49:05.442645+00:00",
  "updated_at": "2025-07-19T19:49:05.442645+00:00"
}

================================================
FILE: data/licenses/health_ai_developer_foundations_terms_of_use.json
================================================
{
  "license_id": "health_ai_developer_foundations_terms_of_use",
  "name": "Health AI Developer Foundations terms of use",
  "allow_commercial": false,
  "description": "Health AI Developer Foundations terms of use license",
  "created_at": "2025-07-19T19:49:05.510423+00:00",
  "updated_at": "2025-07-19T19:49:05.510423+00:00"
}

================================================
FILE: data/licenses/jamba_open_model_license.json
================================================
{
  "license_id": "jamba_open_model_license",
  "name": "Jamba Open Model License",
  "allow_commercial": false,
  "description": "Jamba Open Model License",
  "created_at": "2025-07-19T19:49:05.763778+00:00",
  "updated_at": "2025-07-19T19:49:05.763778+00:00"
}

================================================
FILE: data/licenses/llama3_2.json
================================================
{
  "license_id": "llama3_2",
  "name": "Llama 3.2",
  "allow_commercial": true,
  "description": "Meta Llama 3.2 Community License",
  "created_at": "2025-07-19T19:49:05.578287+00:00",
  "updated_at": "2025-07-19T19:49:05.578287+00:00"
}

================================================
FILE: data/licenses/llama_3_1_community_license.json
================================================
{
  "license_id": "llama_3_1_community_license",
  "name": "Llama 3.1 Community License",
  "allow_commercial": false,
  "description": "Llama 3.1 Community License",
  "created_at": "2025-07-19T19:49:05.574080+00:00",
  "updated_at": "2025-07-19T19:49:05.574080+00:00"
}

================================================
FILE: data/licenses/llama_3_2_community_license.json
================================================
{
  "license_id": "llama_3_2_community_license",
  "name": "Llama 3.2 Community License",
  "allow_commercial": false,
  "description": "Llama 3.2 Community License",
  "created_at": "2025-07-19T19:49:05.587308+00:00",
  "updated_at": "2025-07-19T19:49:05.587308+00:00"
}

================================================
FILE: data/licenses/llama_3_3_community_license_agreement.json
================================================
{
  "license_id": "llama_3_3_community_license_agreement",
  "name": "Llama 3.3 Community License Agreement",
  "allow_commercial": false,
  "description": "Llama 3.3 Community License Agreement",
  "created_at": "2025-07-19T19:49:05.602167+00:00",
  "updated_at": "2025-07-19T19:49:05.602167+00:00"
}

================================================
FILE: data/licenses/llama_4_community_license_agreement.json
================================================
{
  "license_id": "llama_4_community_license_agreement",
  "name": "Llama 4 Community License Agreement",
  "allow_commercial": false,
  "description": "Llama 4 Community License Agreement",
  "created_at": "2025-07-19T19:49:05.593881+00:00",
  "updated_at": "2025-07-19T19:49:05.593881+00:00"
}

================================================
FILE: data/licenses/mistral_research_license.json
================================================
{
  "license_id": "mistral_research_license",
  "name": "Mistral Research License",
  "allow_commercial": false,
  "description": "Mistral Research License",
  "created_at": "2025-07-19T19:49:05.785093+00:00",
  "updated_at": "2025-07-19T19:49:05.785093+00:00"
}

================================================
FILE: data/licenses/mistral_research_license_(mrl)_for_research;_mistral_commercial_license_for_commercial_use.json
================================================
{
  "license_id": "mistral_research_license_(mrl)_for_research;_mistral_commercial_license_for_commercial_use",
  "name": "Mistral Research License (MRL) for research; Mistral Commercial License for commercial use",
  "allow_commercial": false,
  "description": "Mistral Research License (MRL) for research; Mistral Commercial License for commercial use",
  "created_at": "2025-07-19T19:49:05.911442+00:00",
  "updated_at": "2025-07-19T19:49:05.911442+00:00"
}

================================================
FILE: data/licenses/mit.json
================================================
{
  "license_id": "mit",
  "name": "MIT",
  "allow_commercial": true,
  "description": "MIT License - allows commercial use",
  "created_at": "2025-07-19T19:49:05.544627+00:00",
  "updated_at": "2025-07-19T19:49:05.544627+00:00"
}

================================================
FILE: data/licenses/mit_+_model_license_(commercial_use_allowed).json
================================================
{
  "license_id": "mit_+_model_license_(commercial_use_allowed)",
  "name": "MIT + Model License (Commercial use allowed)",
  "allow_commercial": false,
  "description": "MIT + Model License (Commercial use allowed)",
  "created_at": "2025-07-19T19:49:05.676049+00:00",
  "updated_at": "2025-07-19T19:49:05.676049+00:00"
}

================================================
FILE: data/licenses/mit_license.json
================================================
{
  "license_id": "mit_license",
  "name": "MIT License",
  "allow_commercial": false,
  "description": "MIT License",
  "created_at": "2025-07-19T19:49:05.897679+00:00",
  "updated_at": "2025-07-19T19:49:05.897679+00:00"
}

================================================
FILE: data/licenses/mnpl_0_1.json
================================================
{
  "license_id": "mnpl_0_1",
  "name": "MNPL-0.1",
  "allow_commercial": false,
  "description": "MNPL-0.1 license",
  "created_at": "2025-07-19T19:49:05.804469+00:00",
  "updated_at": "2025-07-19T19:49:05.804469+00:00"
}

================================================
FILE: data/licenses/modified_mit_license.json
================================================
{
  "license_id": "modified_mit_license",
  "name": "Modified MIT License",
  "allow_commercial": false,
  "description": "Modified MIT License",
  "created_at": "2025-07-19T19:49:05.420757+00:00",
  "updated_at": "2025-07-19T19:49:05.420757+00:00"
}

================================================
FILE: data/licenses/nvidia_open_model_license_agreement.json
================================================
{
  "license_id": "nvidia_open_model_license_agreement",
  "name": "NVIDIA Open Model License Agreement",
  "allow_commercial": true,
  "description": "NVIDIA Open Model License Agreement",
  "created_at": "2025-10-02T21:51:16.835+00:00",
  "updated_at": "2025-10-02T21:51:16.835+00:00"
}

================================================
FILE: data/licenses/proprietary.json
================================================
{
  "license_id": "proprietary",
  "name": "Proprietary",
  "allow_commercial": false,
  "description": "Proprietary license - usage restrictions apply",
  "created_at": "2025-07-19T19:49:05.425183+00:00",
  "updated_at": "2025-07-19T19:49:05.425183+00:00"
}

================================================
FILE: data/licenses/qwen.json
================================================
{
  "license_id": "qwen",
  "name": "Qwen",
  "allow_commercial": true,
  "description": "Alibaba Qwen License",
  "created_at": "2025-07-19T19:49:05.626726+00:00",
  "updated_at": "2025-07-19T19:49:05.626726+00:00"
}

================================================
FILE: data/licenses/tongyi_qianwen.json
================================================
{
  "license_id": "tongyi_qianwen",
  "name": "tongyi-qianwen",
  "allow_commercial": false,
  "description": "tongyi-qianwen license",
  "created_at": "2025-07-19T19:49:05.618579+00:00",
  "updated_at": "2025-07-19T19:49:05.618579+00:00"
}

================================================
FILE: data/licenses/unknown.json
================================================
{
  "license_id": "unknown",
  "name": "Unknown",
  "allow_commercial": false,
  "description": "Unknown license",
  "created_at": "2025-08-03T22:06:10.793734+00:00",
  "updated_at": "2025-08-03T22:06:10.793734+00:00"
}

================================================
FILE: data/organizations/ai21/models/jamba-1.5-large/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 28,
    "benchmark_id": "arc-c",
    "model_id": "jamba-1.5-large",
    "score": 0.93,
    "normalized_score": 0.93,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.139664+00:00",
    "updated_at": "2025-07-19T19:56:11.139664+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1462,
    "benchmark_id": "arena-hard",
    "model_id": "jamba-1.5-large",
    "score": 0.654,
    "normalized_score": 0.654,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.114965+00:00",
    "updated_at": "2025-07-19T19:56:14.114965+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 338,
    "benchmark_id": "gpqa",
    "model_id": "jamba-1.5-large",
    "score": 0.369,
    "normalized_score": 0.369,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.736664+00:00",
    "updated_at": "2025-07-19T19:56:11.736664+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1011,
    "benchmark_id": "gsm8k",
    "model_id": "jamba-1.5-large",
    "score": 0.87,
    "normalized_score": 0.87,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.109009+00:00",
    "updated_at": "2025-07-19T19:56:13.109009+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 108,
    "benchmark_id": "mmlu",
    "model_id": "jamba-1.5-large",
    "score": 0.812,
    "normalized_score": 0.812,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large",
    "verified_by_llmstats": false,
    "analysis_method": "Chain-of-Thought accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.302578+00:00",
    "updated_at": "2025-07-19T19:56:11.302578+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 213,
    "benchmark_id": "mmlu-pro",
    "model_id": "jamba-1.5-large",
    "score": 0.535,
    "normalized_score": 0.535,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large",
    "verified_by_llmstats": false,
    "analysis_method": "Chain-of-Thought accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.505024+00:00",
    "updated_at": "2025-07-19T19:56:11.505024+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 144,
    "benchmark_id": "truthfulqa",
    "model_id": "jamba-1.5-large",
    "score": 0.583,
    "normalized_score": 0.583,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.365684+00:00",
    "updated_at": "2025-07-19T19:56:11.365684+00:00",
    "benchmark_name": "TruthfulQA"
  },
  {
    "model_benchmark_id": 1816,
    "benchmark_id": "wild-bench",
    "model_id": "jamba-1.5-large",
    "score": 0.485,
    "normalized_score": 0.485,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.125090+00:00",
    "updated_at": "2025-07-19T19:56:15.125090+00:00",
    "benchmark_name": "Wild Bench"
  }
]

================================================
FILE: data/organizations/ai21/models/jamba-1.5-large/model.json
================================================
{
  "model_id": "jamba-1.5-large",
  "name": "Jamba 1.5 Large",
  "organization_id": "ai21",
  "fine_tuned_from_model_id": null,
  "description": "State-of-the-art hybrid SSM-Transformer instruction following foundation model, offering superior long context handling, speed, and quality.",
  "release_date": "2024-08-22",
  "announcement_date": "2024-08-22",
  "license_id": "jamba_open_model_license",
  "multimodal": false,
  "knowledge_cutoff": "2024-03-05",
  "param_count": 398000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.ai21.com/reference/jamba-15-api-ref",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://www.ai21.com/blog/announcing-jamba-model-family",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large",
  "created_at": "2025-07-19T19:49:05.764734+00:00",
  "updated_at": "2025-07-19T19:49:05.764734+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/ai21/models/jamba-1.5-mini/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 29,
    "benchmark_id": "arc-c",
    "model_id": "jamba-1.5-mini",
    "score": 0.857,
    "normalized_score": 0.857,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.141043+00:00",
    "updated_at": "2025-07-19T19:56:11.141043+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1463,
    "benchmark_id": "arena-hard",
    "model_id": "jamba-1.5-mini",
    "score": 0.461,
    "normalized_score": 0.461,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.117178+00:00",
    "updated_at": "2025-07-19T19:56:14.117178+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 339,
    "benchmark_id": "gpqa",
    "model_id": "jamba-1.5-mini",
    "score": 0.323,
    "normalized_score": 0.323,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.739037+00:00",
    "updated_at": "2025-07-19T19:56:11.739037+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1012,
    "benchmark_id": "gsm8k",
    "model_id": "jamba-1.5-mini",
    "score": 0.758,
    "normalized_score": 0.758,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.110443+00:00",
    "updated_at": "2025-07-19T19:56:13.110443+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 109,
    "benchmark_id": "mmlu",
    "model_id": "jamba-1.5-mini",
    "score": 0.697,
    "normalized_score": 0.697,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini",
    "verified_by_llmstats": false,
    "analysis_method": "Chain-of-Thought accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.304017+00:00",
    "updated_at": "2025-07-19T19:56:11.304017+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 214,
    "benchmark_id": "mmlu-pro",
    "model_id": "jamba-1.5-mini",
    "score": 0.425,
    "normalized_score": 0.425,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini",
    "verified_by_llmstats": false,
    "analysis_method": "Chain-of-Thought accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.506893+00:00",
    "updated_at": "2025-07-19T19:56:11.506893+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 145,
    "benchmark_id": "truthfulqa",
    "model_id": "jamba-1.5-mini",
    "score": 0.541,
    "normalized_score": 0.541,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.367476+00:00",
    "updated_at": "2025-07-19T19:56:11.367476+00:00",
    "benchmark_name": "TruthfulQA"
  },
  {
    "model_benchmark_id": 1817,
    "benchmark_id": "wild-bench",
    "model_id": "jamba-1.5-mini",
    "score": 0.424,
    "normalized_score": 0.424,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.127075+00:00",
    "updated_at": "2025-07-19T19:56:15.127075+00:00",
    "benchmark_name": "Wild Bench"
  }
]

================================================
FILE: data/organizations/ai21/models/jamba-1.5-mini/model.json
================================================
{
  "model_id": "jamba-1.5-mini",
  "name": "Jamba 1.5 Mini",
  "organization_id": "ai21",
  "fine_tuned_from_model_id": null,
  "description": "Part of the Jamba 1.5 family, a state-of-the-art hybrid SSM-Transformer instruction following foundation model offering superior long context handling, speed, and quality.",
  "release_date": "2024-08-22",
  "announcement_date": "2024-08-22",
  "license_id": "jamba_open_model_license",
  "multimodal": false,
  "knowledge_cutoff": "2024-03-05",
  "param_count": 52000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.ai21.com/reference/jamba-15-api-ref",
  "source_playground": null,
  "source_paper": "https://arxiv.org/abs/2408.12570",
  "source_scorecard_blog_link": "https://www.ai21.com/blog/announcing-jamba-model-family",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/ai21labs/AI21-Jamba-1.5-Mini",
  "created_at": "2025-07-19T19:49:05.767535+00:00",
  "updated_at": "2025-07-19T19:49:05.767535+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/ai21/organization.json
================================================
{
  "organization_id": "ai21",
  "name": "AI21 Labs",
  "website": "https://ai21.com",
  "description": "NLP AI company",
  "country": null,
  "created_at": "2025-07-19T19:49:05.762555+00:00",
  "updated_at": "2025-07-19T19:49:05.762555+00:00"
}

================================================
FILE: data/organizations/amazon/models/nova-lite/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 2,
    "benchmark_id": "arc-c",
    "model_id": "nova-lite",
    "score": 0.924,
    "normalized_score": 0.924,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot chain-of-thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.080108+00:00",
    "updated_at": "2025-07-19T19:56:11.080108+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 967,
    "benchmark_id": "bbh",
    "model_id": "nova-lite",
    "score": 0.824,
    "normalized_score": 0.824,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.034481+00:00",
    "updated_at": "2025-07-19T19:56:13.034481+00:00",
    "benchmark_name": "BBH"
  },
  {
    "model_benchmark_id": 843,
    "benchmark_id": "bfcl",
    "model_id": "nova-lite",
    "score": 0.666,
    "normalized_score": 0.666,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.766776+00:00",
    "updated_at": "2025-07-19T19:56:12.766776+00:00",
    "benchmark_name": "BFCL"
  },
  {
    "model_benchmark_id": 853,
    "benchmark_id": "chartqa",
    "model_id": "nova-lite",
    "score": 0.868,
    "normalized_score": 0.868,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "relaxed accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.786772+00:00",
    "updated_at": "2025-07-19T19:56:12.786772+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 834,
    "benchmark_id": "crag",
    "model_id": "nova-lite",
    "score": 0.438,
    "normalized_score": 0.438,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.743484+00:00",
    "updated_at": "2025-07-19T19:56:12.743484+00:00",
    "benchmark_name": "CRAG"
  },
  {
    "model_benchmark_id": 876,
    "benchmark_id": "docvqa",
    "model_id": "nova-lite",
    "score": 0.924,
    "normalized_score": 0.924,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "ANLS",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.827478+00:00",
    "updated_at": "2025-07-19T19:56:12.827478+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 939,
    "benchmark_id": "drop",
    "model_id": "nova-lite",
    "score": 0.802,
    "normalized_score": 0.802,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.984716+00:00",
    "updated_at": "2025-07-19T19:56:12.984716+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 918,
    "benchmark_id": "egoschema",
    "model_id": "nova-lite",
    "score": 0.714,
    "normalized_score": 0.714,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.918221+00:00",
    "updated_at": "2025-07-19T19:56:12.918221+00:00",
    "benchmark_name": "EgoSchema"
  },
  {
    "model_benchmark_id": 831,
    "benchmark_id": "finqa",
    "model_id": "nova-lite",
    "score": 0.736,
    "normalized_score": 0.736,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.736609+00:00",
    "updated_at": "2025-07-19T19:56:12.736609+00:00",
    "benchmark_name": "FinQA"
  },
  {
    "model_benchmark_id": 258,
    "benchmark_id": "gpqa",
    "model_id": "nova-lite",
    "score": 0.42,
    "normalized_score": 0.42,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "6-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.594691+00:00",
    "updated_at": "2025-07-19T19:56:11.594691+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 841,
    "benchmark_id": "groundui-1k",
    "model_id": "nova-lite",
    "score": 0.802,
    "normalized_score": 0.802,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.761300+00:00",
    "updated_at": "2025-07-19T19:56:12.761300+00:00",
    "benchmark_name": "GroundUI-1K"
  },
  {
    "model_benchmark_id": 160,
    "benchmark_id": "gsm8k",
    "model_id": "nova-lite",
    "score": 0.945,
    "normalized_score": 0.945,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.407299+00:00",
    "updated_at": "2025-07-19T19:56:11.407299+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 759,
    "benchmark_id": "humaneval",
    "model_id": "nova-lite",
    "score": 0.854,
    "normalized_score": 0.854,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.601822+00:00",
    "updated_at": "2025-07-19T19:56:12.601822+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 604,
    "benchmark_id": "ifeval",
    "model_id": "nova-lite",
    "score": 0.897,
    "normalized_score": 0.897,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.248959+00:00",
    "updated_at": "2025-07-19T19:56:12.248959+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 826,
    "benchmark_id": "lvbench",
    "model_id": "nova-lite",
    "score": 0.404,
    "normalized_score": 0.404,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.726573+00:00",
    "updated_at": "2025-07-19T19:56:12.726573+00:00",
    "benchmark_name": "LVBench"
  },
  {
    "model_benchmark_id": 374,
    "benchmark_id": "math",
    "model_id": "nova-lite",
    "score": 0.733,
    "normalized_score": 0.733,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.810622+00:00",
    "updated_at": "2025-07-19T19:56:11.810622+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 60,
    "benchmark_id": "mmlu",
    "model_id": "nova-lite",
    "score": 0.805,
    "normalized_score": 0.805,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot chain-of-thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.212315+00:00",
    "updated_at": "2025-07-19T19:56:11.212315+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 839,
    "benchmark_id": "mm-mind2web",
    "model_id": "nova-lite",
    "score": 0.607,
    "normalized_score": 0.607,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.755878+00:00",
    "updated_at": "2025-07-19T19:56:12.755878+00:00",
    "benchmark_name": "MM-Mind2Web"
  },
  {
    "model_benchmark_id": 550,
    "benchmark_id": "mmmu",
    "model_id": "nova-lite",
    "score": 0.562,
    "normalized_score": 0.562,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "CoT accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.134288+00:00",
    "updated_at": "2025-07-19T19:56:12.134288+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 821,
    "benchmark_id": "squality",
    "model_id": "nova-lite",
    "score": 0.192,
    "normalized_score": 0.192,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "rouge-l",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.715662+00:00",
    "updated_at": "2025-07-19T19:56:12.715662+00:00",
    "benchmark_name": "SQuALITY"
  },
  {
    "model_benchmark_id": 901,
    "benchmark_id": "textvqa",
    "model_id": "nova-lite",
    "score": 0.802,
    "normalized_score": 0.802,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "weighted accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.878076+00:00",
    "updated_at": "2025-07-19T19:56:12.878076+00:00",
    "benchmark_name": "TextVQA"
  },
  {
    "model_benchmark_id": 930,
    "benchmark_id": "translation-en\u2192set1-comet22",
    "model_id": "nova-lite",
    "score": 0.888,
    "normalized_score": 0.888,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "COMET22",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.962491+00:00",
    "updated_at": "2025-07-19T19:56:12.962491+00:00",
    "benchmark_name": "Translation en\u2192Set1 COMET22"
  },
  {
    "model_benchmark_id": 927,
    "benchmark_id": "translation-en\u2192set1-spbleu",
    "model_id": "nova-lite",
    "score": 0.415,
    "normalized_score": 0.415,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "spBleu",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.942744+00:00",
    "updated_at": "2025-07-19T19:56:12.942744+00:00",
    "benchmark_name": "Translation en\u2192Set1 spBleu"
  },
  {
    "model_benchmark_id": 936,
    "benchmark_id": "translation-set1\u2192en-comet22",
    "model_id": "nova-lite",
    "score": 0.888,
    "normalized_score": 0.888,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "COMET22",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.977060+00:00",
    "updated_at": "2025-07-19T19:56:12.977060+00:00",
    "benchmark_name": "Translation Set1\u2192en COMET22"
  },
  {
    "model_benchmark_id": 933,
    "benchmark_id": "translation-set1\u2192en-spbleu",
    "model_id": "nova-lite",
    "score": 0.431,
    "normalized_score": 0.431,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "spBleu",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.969524+00:00",
    "updated_at": "2025-07-19T19:56:12.969524+00:00",
    "benchmark_name": "Translation Set1\u2192en spBleu"
  },
  {
    "model_benchmark_id": 916,
    "benchmark_id": "vatex",
    "model_id": "nova-lite",
    "score": 0.778,
    "normalized_score": 0.778,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "CIDEr",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.912261+00:00",
    "updated_at": "2025-07-19T19:56:12.912261+00:00",
    "benchmark_name": "VATEX"
  },
  {
    "model_benchmark_id": 837,
    "benchmark_id": "visualwebbench",
    "model_id": "nova-lite",
    "score": 0.777,
    "normalized_score": 0.777,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "composite step accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.750738+00:00",
    "updated_at": "2025-07-19T19:56:12.750738+00:00",
    "benchmark_name": "VisualWebBench"
  }
]

================================================
FILE: data/organizations/amazon/models/nova-lite/model.json
================================================
{
  "model_id": "nova-lite",
  "name": "Nova Lite",
  "organization_id": "amazon",
  "fine_tuned_from_model_id": null,
  "description": "A low-cost multimodal model that is lightning fast for processing images, video, documents, and text.",
  "release_date": "2024-11-20",
  "announcement_date": "2024-11-20",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://aws.amazon.com/bedrock/amazon-nova-lite",
  "source_playground": null,
  "source_paper": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
  "source_scorecard_blog_link": null,
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.429271+00:00",
  "updated_at": "2025-07-19T19:49:05.429271+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/amazon/models/nova-micro/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 4,
    "benchmark_id": "arc-c",
    "model_id": "nova-micro",
    "score": 0.902,
    "normalized_score": 0.902,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.088301+00:00",
    "updated_at": "2025-07-19T19:56:11.088301+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 969,
    "benchmark_id": "bbh",
    "model_id": "nova-micro",
    "score": 0.795,
    "normalized_score": 0.795,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.038288+00:00",
    "updated_at": "2025-07-19T19:56:13.038288+00:00",
    "benchmark_name": "BBH"
  },
  {
    "model_benchmark_id": 845,
    "benchmark_id": "bfcl",
    "model_id": "nova-micro",
    "score": 0.562,
    "normalized_score": 0.562,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.770319+00:00",
    "updated_at": "2025-07-19T19:56:12.770319+00:00",
    "benchmark_name": "BFCL"
  },
  {
    "model_benchmark_id": 836,
    "benchmark_id": "crag",
    "model_id": "nova-micro",
    "score": 0.431,
    "normalized_score": 0.431,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.746657+00:00",
    "updated_at": "2025-07-19T19:56:12.746657+00:00",
    "benchmark_name": "CRAG"
  },
  {
    "model_benchmark_id": 941,
    "benchmark_id": "drop",
    "model_id": "nova-micro",
    "score": 0.793,
    "normalized_score": 0.793,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "6-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.987950+00:00",
    "updated_at": "2025-07-19T19:56:12.987950+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 833,
    "benchmark_id": "finqa",
    "model_id": "nova-micro",
    "score": 0.652,
    "normalized_score": 0.652,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.740201+00:00",
    "updated_at": "2025-07-19T19:56:12.740201+00:00",
    "benchmark_name": "FinQA"
  },
  {
    "model_benchmark_id": 260,
    "benchmark_id": "gpqa",
    "model_id": "nova-micro",
    "score": 0.4,
    "normalized_score": 0.4,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.598530+00:00",
    "updated_at": "2025-07-19T19:56:11.598530+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 976,
    "benchmark_id": "gsm8k",
    "model_id": "nova-micro",
    "score": 0.923,
    "normalized_score": 0.923,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.051041+00:00",
    "updated_at": "2025-07-19T19:56:13.051041+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 761,
    "benchmark_id": "humaneval",
    "model_id": "nova-micro",
    "score": 0.811,
    "normalized_score": 0.811,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.605066+00:00",
    "updated_at": "2025-07-19T19:56:12.605066+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 606,
    "benchmark_id": "ifeval",
    "model_id": "nova-micro",
    "score": 0.872,
    "normalized_score": 0.872,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.252589+00:00",
    "updated_at": "2025-07-19T19:56:12.252589+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 376,
    "benchmark_id": "math",
    "model_id": "nova-micro",
    "score": 0.693,
    "normalized_score": 0.693,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.814150+00:00",
    "updated_at": "2025-07-19T19:56:11.814150+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 62,
    "benchmark_id": "mmlu",
    "model_id": "nova-micro",
    "score": 0.776,
    "normalized_score": 0.776,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.217284+00:00",
    "updated_at": "2025-07-19T19:56:11.217284+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 823,
    "benchmark_id": "squality",
    "model_id": "nova-micro",
    "score": 0.188,
    "normalized_score": 0.188,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "rouge-l",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.719314+00:00",
    "updated_at": "2025-07-19T19:56:12.719314+00:00",
    "benchmark_name": "SQuALITY"
  },
  {
    "model_benchmark_id": 932,
    "benchmark_id": "translation-en\u2192set1-comet22",
    "model_id": "nova-micro",
    "score": 0.885,
    "normalized_score": 0.885,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "COMET22",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.966157+00:00",
    "updated_at": "2025-07-19T19:56:12.966157+00:00",
    "benchmark_name": "Translation en\u2192Set1 COMET22"
  },
  {
    "model_benchmark_id": 929,
    "benchmark_id": "translation-en\u2192set1-spbleu",
    "model_id": "nova-micro",
    "score": 0.402,
    "normalized_score": 0.402,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "spBleu",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.958167+00:00",
    "updated_at": "2025-07-19T19:56:12.958167+00:00",
    "benchmark_name": "Translation en\u2192Set1 spBleu"
  },
  {
    "model_benchmark_id": 938,
    "benchmark_id": "translation-set1\u2192en-comet22",
    "model_id": "nova-micro",
    "score": 0.887,
    "normalized_score": 0.887,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "COMET22",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.980365+00:00",
    "updated_at": "2025-07-19T19:56:12.980365+00:00",
    "benchmark_name": "Translation Set1\u2192en COMET22"
  },
  {
    "model_benchmark_id": 935,
    "benchmark_id": "translation-set1\u2192en-spbleu",
    "model_id": "nova-micro",
    "score": 0.426,
    "normalized_score": 0.426,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "spBleu",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.973209+00:00",
    "updated_at": "2025-07-19T19:56:12.973209+00:00",
    "benchmark_name": "Translation Set1\u2192en spBleu"
  }
]

================================================
FILE: data/organizations/amazon/models/nova-micro/model.json
================================================
{
  "model_id": "nova-micro",
  "name": "Nova Micro",
  "organization_id": "amazon",
  "fine_tuned_from_model_id": null,
  "description": "A text-only model that delivers lowest-latency responses at very low cost while maintaining strong performance on core language tasks. Optimized for speed and efficiency while preserving high accuracy on key benchmarks.",
  "release_date": "2024-11-20",
  "announcement_date": "2024-11-20",
  "license_id": "proprietary",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-nova.html",
  "source_playground": null,
  "source_paper": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://huggingface.co/amazon-agi",
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.435386+00:00",
  "updated_at": "2025-07-19T19:49:05.435386+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/amazon/models/nova-pro/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 3,
    "benchmark_id": "arc-c",
    "model_id": "nova-pro",
    "score": 0.948,
    "normalized_score": 0.948,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.085849+00:00",
    "updated_at": "2025-07-19T19:56:11.085849+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 968,
    "benchmark_id": "bbh",
    "model_id": "nova-pro",
    "score": 0.869,
    "normalized_score": 0.869,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.036192+00:00",
    "updated_at": "2025-07-19T19:56:13.036192+00:00",
    "benchmark_name": "BBH"
  },
  {
    "model_benchmark_id": 844,
    "benchmark_id": "bfcl",
    "model_id": "nova-pro",
    "score": 0.684,
    "normalized_score": 0.684,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.768714+00:00",
    "updated_at": "2025-07-19T19:56:12.768714+00:00",
    "benchmark_name": "BFCL"
  },
  {
    "model_benchmark_id": 854,
    "benchmark_id": "chartqa",
    "model_id": "nova-pro",
    "score": 0.892,
    "normalized_score": 0.892,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "relaxed accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.788270+00:00",
    "updated_at": "2025-07-19T19:56:12.788270+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 835,
    "benchmark_id": "crag",
    "model_id": "nova-pro",
    "score": 0.503,
    "normalized_score": 0.503,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.744994+00:00",
    "updated_at": "2025-07-19T19:56:12.744994+00:00",
    "benchmark_name": "CRAG"
  },
  {
    "model_benchmark_id": 877,
    "benchmark_id": "docvqa",
    "model_id": "nova-pro",
    "score": 0.935,
    "normalized_score": 0.935,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "ANLS",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.829064+00:00",
    "updated_at": "2025-07-19T19:56:12.829064+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 940,
    "benchmark_id": "drop",
    "model_id": "nova-pro",
    "score": 0.854,
    "normalized_score": 0.854,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.986311+00:00",
    "updated_at": "2025-07-19T19:56:12.986311+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 919,
    "benchmark_id": "egoschema",
    "model_id": "nova-pro",
    "score": 0.721,
    "normalized_score": 0.721,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.920400+00:00",
    "updated_at": "2025-07-19T19:56:12.920400+00:00",
    "benchmark_name": "EgoSchema"
  },
  {
    "model_benchmark_id": 832,
    "benchmark_id": "finqa",
    "model_id": "nova-pro",
    "score": 0.772,
    "normalized_score": 0.772,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.738456+00:00",
    "updated_at": "2025-07-19T19:56:12.738456+00:00",
    "benchmark_name": "FinQA"
  },
  {
    "model_benchmark_id": 259,
    "benchmark_id": "gpqa",
    "model_id": "nova-pro",
    "score": 0.469,
    "normalized_score": 0.469,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "6-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.596541+00:00",
    "updated_at": "2025-07-19T19:56:11.596541+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 842,
    "benchmark_id": "groundui-1k",
    "model_id": "nova-pro",
    "score": 0.814,
    "normalized_score": 0.814,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.762846+00:00",
    "updated_at": "2025-07-19T19:56:12.762846+00:00",
    "benchmark_name": "GroundUI-1K"
  },
  {
    "model_benchmark_id": 975,
    "benchmark_id": "gsm8k",
    "model_id": "nova-pro",
    "score": 0.948,
    "normalized_score": 0.948,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.049455+00:00",
    "updated_at": "2025-07-19T19:56:13.049455+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 760,
    "benchmark_id": "humaneval",
    "model_id": "nova-pro",
    "score": 0.89,
    "normalized_score": 0.89,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.603428+00:00",
    "updated_at": "2025-07-19T19:56:12.603428+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 605,
    "benchmark_id": "ifeval",
    "model_id": "nova-pro",
    "score": 0.921,
    "normalized_score": 0.921,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.250818+00:00",
    "updated_at": "2025-07-19T19:56:12.250818+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 827,
    "benchmark_id": "lvbench",
    "model_id": "nova-pro",
    "score": 0.416,
    "normalized_score": 0.416,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.728104+00:00",
    "updated_at": "2025-07-19T19:56:12.728104+00:00",
    "benchmark_name": "LVBench"
  },
  {
    "model_benchmark_id": 375,
    "benchmark_id": "math",
    "model_id": "nova-pro",
    "score": 0.766,
    "normalized_score": 0.766,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.812663+00:00",
    "updated_at": "2025-07-19T19:56:11.812663+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 61,
    "benchmark_id": "mmlu",
    "model_id": "nova-pro",
    "score": 0.859,
    "normalized_score": 0.859,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.214544+00:00",
    "updated_at": "2025-07-19T19:56:11.214544+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 840,
    "benchmark_id": "mm-mind2web",
    "model_id": "nova-pro",
    "score": 0.637,
    "normalized_score": 0.637,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "step accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.757670+00:00",
    "updated_at": "2025-07-19T19:56:12.757670+00:00",
    "benchmark_name": "MM-Mind2Web"
  },
  {
    "model_benchmark_id": 551,
    "benchmark_id": "mmmu",
    "model_id": "nova-pro",
    "score": 0.617,
    "normalized_score": 0.617,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.135953+00:00",
    "updated_at": "2025-07-19T19:56:12.135953+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 822,
    "benchmark_id": "squality",
    "model_id": "nova-pro",
    "score": 0.198,
    "normalized_score": 0.198,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "ROUGE-L",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.717624+00:00",
    "updated_at": "2025-07-19T19:56:12.717624+00:00",
    "benchmark_name": "SQuALITY"
  },
  {
    "model_benchmark_id": 902,
    "benchmark_id": "textvqa",
    "model_id": "nova-pro",
    "score": 0.815,
    "normalized_score": 0.815,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "weighted accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.880228+00:00",
    "updated_at": "2025-07-19T19:56:12.880228+00:00",
    "benchmark_name": "TextVQA"
  },
  {
    "model_benchmark_id": 931,
    "benchmark_id": "translation-en\u2192set1-comet22",
    "model_id": "nova-pro",
    "score": 0.891,
    "normalized_score": 0.891,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "COMET22",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.964047+00:00",
    "updated_at": "2025-07-19T19:56:12.964047+00:00",
    "benchmark_name": "Translation en\u2192Set1 COMET22"
  },
  {
    "model_benchmark_id": 928,
    "benchmark_id": "translation-en\u2192set1-spbleu",
    "model_id": "nova-pro",
    "score": 0.434,
    "normalized_score": 0.434,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "spBleu",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.950458+00:00",
    "updated_at": "2025-07-19T19:56:12.950458+00:00",
    "benchmark_name": "Translation en\u2192Set1 spBleu"
  },
  {
    "model_benchmark_id": 937,
    "benchmark_id": "translation-set1\u2192en-comet22",
    "model_id": "nova-pro",
    "score": 0.89,
    "normalized_score": 0.89,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "COMET22",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.978787+00:00",
    "updated_at": "2025-07-19T19:56:12.978787+00:00",
    "benchmark_name": "Translation Set1\u2192en COMET22"
  },
  {
    "model_benchmark_id": 934,
    "benchmark_id": "translation-set1\u2192en-spbleu",
    "model_id": "nova-pro",
    "score": 0.444,
    "normalized_score": 0.444,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "spBleu",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.971295+00:00",
    "updated_at": "2025-07-19T19:56:12.971295+00:00",
    "benchmark_name": "Translation Set1\u2192en spBleu"
  },
  {
    "model_benchmark_id": 917,
    "benchmark_id": "vatex",
    "model_id": "nova-pro",
    "score": 0.778,
    "normalized_score": 0.778,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "CIDEr",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.913837+00:00",
    "updated_at": "2025-07-19T19:56:12.913837+00:00",
    "benchmark_name": "VATEX"
  },
  {
    "model_benchmark_id": 838,
    "benchmark_id": "visualwebbench",
    "model_id": "nova-pro",
    "score": 0.797,
    "normalized_score": 0.797,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
    "verified_by_llmstats": false,
    "analysis_method": "composite",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.752533+00:00",
    "updated_at": "2025-07-19T19:56:12.752533+00:00",
    "benchmark_name": "VisualWebBench"
  }
]

================================================
FILE: data/organizations/amazon/models/nova-pro/model.json
================================================
{
  "model_id": "nova-pro",
  "name": "Nova Pro",
  "organization_id": "amazon",
  "fine_tuned_from_model_id": null,
  "description": "Amazon Nova Pro is a highly-capable multimodal model with state-of-the-art performance across text, image, and video understanding. It excels at core capabilities like language understanding, mathematical reasoning, and multimodal tasks while offering industry-leading speed and cost efficiency.",
  "release_date": "2024-11-20",
  "announcement_date": "2024-11-20",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-nova.html",
  "source_playground": null,
  "source_paper": "https://www.amazon.science/publications/the-amazon-nova-family-of-models-technical-report-and-model-card",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://huggingface.co/amazon-agi",
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.431675+00:00",
  "updated_at": "2025-07-19T19:49:05.431675+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/amazon/organization.json
================================================
{
  "organization_id": "amazon",
  "name": "Amazon",
  "website": "https://aws.amazon.com",
  "description": "Cloud and AI services",
  "country": null,
  "created_at": "2025-07-19T19:49:05.427427+00:00",
  "updated_at": "2025-07-19T19:49:05.427427+00:00"
}

================================================
FILE: data/organizations/anthropic/models/claude-3-5-haiku-20241022/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 958,
    "benchmark_id": "drop",
    "model_id": "claude-3-5-haiku-20241022",
    "score": 0.831,
    "normalized_score": 0.831,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot F1 Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.017079+00:00",
    "updated_at": "2025-07-19T19:56:13.017079+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 331,
    "benchmark_id": "gpqa",
    "model_id": "claude-3-5-haiku-20241022",
    "score": 0.416,
    "normalized_score": 0.416,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.725835+00:00",
    "updated_at": "2025-07-19T19:56:11.725835+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 801,
    "benchmark_id": "humaneval",
    "model_id": "claude-3-5-haiku-20241022",
    "score": 0.881,
    "normalized_score": 0.881,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.671817+00:00",
    "updated_at": "2025-07-19T19:56:12.671817+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 417,
    "benchmark_id": "math",
    "model_id": "claude-3-5-haiku-20241022",
    "score": 0.694,
    "normalized_score": 0.694,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.885732+00:00",
    "updated_at": "2025-07-19T19:56:11.885732+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1292,
    "benchmark_id": "mgsm",
    "model_id": "claude-3-5-haiku-20241022",
    "score": 0.856,
    "normalized_score": 0.856,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.705114+00:00",
    "updated_at": "2025-07-19T19:56:13.705114+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 210,
    "benchmark_id": "mmlu-pro",
    "model_id": "claude-3-5-haiku-20241022",
    "score": 0.65,
    "normalized_score": 0.65,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.499754+00:00",
    "updated_at": "2025-07-19T19:56:11.499754+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1347,
    "benchmark_id": "swe-bench-verified",
    "model_id": "claude-3-5-haiku-20241022",
    "score": 0.406,
    "normalized_score": 0.406,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.836974+00:00",
    "updated_at": "2025-07-19T19:56:13.836974+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1771,
    "benchmark_id": "tau-bench-airline",
    "model_id": "claude-3-5-haiku-20241022",
    "score": 0.228,
    "normalized_score": 0.228,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.997081+00:00",
    "updated_at": "2025-07-19T19:56:14.997081+00:00",
    "benchmark_name": "TAU-bench Airline"
  },
  {
    "model_benchmark_id": 1757,
    "benchmark_id": "tau-bench-retail",
    "model_id": "claude-3-5-haiku-20241022",
    "score": 0.51,
    "normalized_score": 0.51,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.970473+00:00",
    "updated_at": "2025-07-19T19:56:14.970473+00:00",
    "benchmark_name": "TAU-bench Retail"
  }
]

================================================
FILE: data/organizations/anthropic/models/claude-3-5-haiku-20241022/model.json
================================================
{
  "model_id": "claude-3-5-haiku-20241022",
  "name": "Claude 3.5 Haiku",
  "organization_id": "anthropic",
  "fine_tuned_from_model_id": null,
  "description": "Claude 3.5 Haiku is Anthropic's fastest model, delivering advanced coding, tool use, and reasoning capabilities at an accessible price. It excels at user-facing products, specialized sub-agent tasks, and generating personalized experiences from large data volumes. The model is particularly well-suited for code completions, interactive chatbots, data extraction, and real-time content moderation.",
  "release_date": "2024-10-22",
  "announcement_date": "2024-10-22",
  "license_id": "proprietary",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.anthropic.com/en/docs/intro-to-claude#claude-3-5-family",
  "source_playground": "https://claude.ai",
  "source_paper": null,
  "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-3-5-haiku",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.744002+00:00",
  "updated_at": "2025-07-19T19:49:05.744002+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/anthropic/models/claude-3-5-sonnet-20240620/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1086,
    "benchmark_id": "big-bench-hard",
    "model_id": "claude-3-5-sonnet-20240620",
    "score": 0.931,
    "normalized_score": 0.931,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.259482+00:00",
    "updated_at": "2025-07-19T19:56:13.259482+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 961,
    "benchmark_id": "drop",
    "model_id": "claude-3-5-sonnet-20240620",
    "score": 0.871,
    "normalized_score": 0.871,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot F1 Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.021997+00:00",
    "updated_at": "2025-07-19T19:56:13.021997+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 336,
    "benchmark_id": "gpqa",
    "model_id": "claude-3-5-sonnet-20240620",
    "score": 0.594,
    "normalized_score": 0.594,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.733246+00:00",
    "updated_at": "2025-07-19T19:56:11.733246+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1010,
    "benchmark_id": "gsm8k",
    "model_id": "claude-3-5-sonnet-20240620",
    "score": 0.964,
    "normalized_score": 0.964,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.107479+00:00",
    "updated_at": "2025-07-19T19:56:13.107479+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 804,
    "benchmark_id": "humaneval",
    "model_id": "claude-3-5-sonnet-20240620",
    "score": 0.92,
    "normalized_score": 0.92,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.676235+00:00",
    "updated_at": "2025-07-19T19:56:12.676235+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 420,
    "benchmark_id": "math",
    "model_id": "claude-3-5-sonnet-20240620",
    "score": 0.711,
    "normalized_score": 0.711,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.891344+00:00",
    "updated_at": "2025-07-19T19:56:11.891344+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1295,
    "benchmark_id": "mgsm",
    "model_id": "claude-3-5-sonnet-20240620",
    "score": 0.916,
    "normalized_score": 0.916,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.710814+00:00",
    "updated_at": "2025-07-19T19:56:13.710814+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 107,
    "benchmark_id": "mmlu",
    "model_id": "claude-3-5-sonnet-20240620",
    "score": 0.904,
    "normalized_score": 0.904,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-5-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.300996+00:00",
    "updated_at": "2025-07-19T19:56:11.300996+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 212,
    "benchmark_id": "mmlu-pro",
    "model_id": "claude-3-5-sonnet-20240620",
    "score": 0.761,
    "normalized_score": 0.761,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.503274+00:00",
    "updated_at": "2025-07-19T19:56:11.503274+00:00",
    "benchmark_name": "MMLU-Pro"
  }
]

================================================
FILE: data/organizations/anthropic/models/claude-3-5-sonnet-20240620/model.json
================================================
{
  "model_id": "claude-3-5-sonnet-20240620",
  "name": "Claude 3.5 Sonnet",
  "organization_id": "anthropic",
  "fine_tuned_from_model_id": null,
  "description": "Claude 3.5 Sonnet is a powerful AI model. It excels in graduate-level reasoning, undergraduate-level knowledge, and coding proficiency, with improved understanding of nuance, humor, and complex instructions.",
  "release_date": "2024-06-21",
  "announcement_date": "2024-06-21",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.anthropic.com/en/docs/intro-to-claude#claude-3-5-family",
  "source_playground": "https://claude.ai",
  "source_paper": null,
  "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-3-5-sonnet",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.757926+00:00",
  "updated_at": "2025-07-19T19:49:05.757926+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/anthropic/models/claude-3-5-sonnet-20241022/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1260,
    "benchmark_id": "ai2d",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.947,
    "normalized_score": 0.947,
    "is_self_reported": true,
    "self_reported_source_link": "https://www-cdn.anthropic.com/fed9cc193a14b84131812372d8d5857f8f304c52/Model_Card_Claude_3_Addendum.pdf",
    "verified_by_llmstats": false,
    "analysis_method": "test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.643744+00:00",
    "updated_at": "2025-07-19T19:56:13.643744+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 1084,
    "benchmark_id": "big-bench-hard",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.931,
    "normalized_score": 0.931,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.256021+00:00",
    "updated_at": "2025-07-19T19:56:13.256021+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 872,
    "benchmark_id": "chartqa",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.908,
    "normalized_score": 0.908,
    "is_self_reported": true,
    "self_reported_source_link": "https://www-cdn.anthropic.com/fed9cc193a14b84131812372d8d5857f8f304c52/Model_Card_Claude_3_Addendum.pdf",
    "verified_by_llmstats": false,
    "analysis_method": "test, relaxed accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.819413+00:00",
    "updated_at": "2025-07-19T19:56:12.819413+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 897,
    "benchmark_id": "docvqa",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.952,
    "normalized_score": 0.952,
    "is_self_reported": true,
    "self_reported_source_link": "https://www-cdn.anthropic.com/fed9cc193a14b84131812372d8d5857f8f304c52/Model_Card_Claude_3_Addendum.pdf",
    "verified_by_llmstats": false,
    "analysis_method": "test, ANLS score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.867423+00:00",
    "updated_at": "2025-07-19T19:56:12.867423+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 959,
    "benchmark_id": "drop",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.871,
    "normalized_score": 0.871,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot F1 Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.018623+00:00",
    "updated_at": "2025-07-19T19:56:13.018623+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 334,
    "benchmark_id": "gpqa",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.672,
    "normalized_score": 0.672,
    "is_self_reported": true,
    "self_reported_source_link": "https://www-cdn.anthropic.com/fed9cc193a14b84131812372d8d5857f8f304c52/Model_Card_Claude_3_Addendum.pdf",
    "verified_by_llmstats": false,
    "analysis_method": "Maj@32 5-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.730271+00:00",
    "updated_at": "2025-07-19T19:56:11.730271+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1008,
    "benchmark_id": "gsm8k",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.964,
    "normalized_score": 0.964,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.104248+00:00",
    "updated_at": "2025-07-19T19:56:13.104248+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 802,
    "benchmark_id": "humaneval",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.937,
    "normalized_score": 0.937,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.673295+00:00",
    "updated_at": "2025-07-19T19:56:12.673295+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 418,
    "benchmark_id": "math",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.783,
    "normalized_score": 0.783,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.887521+00:00",
    "updated_at": "2025-07-19T19:56:11.887521+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 535,
    "benchmark_id": "mathvista",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.677,
    "normalized_score": 0.677,
    "is_self_reported": true,
    "self_reported_source_link": "https://www-cdn.anthropic.com/fed9cc193a14b84131812372d8d5857f8f304c52/Model_Card_Claude_3_Addendum.pdf",
    "verified_by_llmstats": false,
    "analysis_method": "testmini",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.108158+00:00",
    "updated_at": "2025-07-19T19:56:12.108158+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1293,
    "benchmark_id": "mgsm",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.916,
    "normalized_score": 0.916,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.707042+00:00",
    "updated_at": "2025-07-19T19:56:13.707042+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 105,
    "benchmark_id": "mmlu",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.904,
    "normalized_score": 0.904,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.298011+00:00",
    "updated_at": "2025-07-19T19:56:11.298011+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 211,
    "benchmark_id": "mmlu-pro",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.776,
    "normalized_score": 0.776,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.501331+00:00",
    "updated_at": "2025-07-19T19:56:11.501331+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 584,
    "benchmark_id": "mmmu",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.683,
    "normalized_score": 0.683,
    "is_self_reported": true,
    "self_reported_source_link": "https://www-cdn.anthropic.com/fed9cc193a14b84131812372d8d5857f8f304c52/Model_Card_Claude_3_Addendum.pdf",
    "verified_by_llmstats": false,
    "analysis_method": "validation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.201491+00:00",
    "updated_at": "2025-07-19T19:56:12.201491+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1814,
    "benchmark_id": "osworld-extended",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.22,
    "normalized_score": 0.22,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.117020+00:00",
    "updated_at": "2025-07-19T19:56:15.117020+00:00",
    "benchmark_name": "OSWorld Extended"
  },
  {
    "model_benchmark_id": 1813,
    "benchmark_id": "osworld-screenshot-only",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.149,
    "normalized_score": 0.149,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.112291+00:00",
    "updated_at": "2025-07-19T19:56:15.112291+00:00",
    "benchmark_name": "OSWorld Screenshot-only"
  },
  {
    "model_benchmark_id": 1350,
    "benchmark_id": "swe-bench-verified",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.49,
    "normalized_score": 0.49,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.842061+00:00",
    "updated_at": "2025-07-19T19:56:13.842061+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1774,
    "benchmark_id": "tau-bench-airline",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.46,
    "normalized_score": 0.46,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.003886+00:00",
    "updated_at": "2025-07-19T19:56:15.003886+00:00",
    "benchmark_name": "TAU-bench Airline"
  },
  {
    "model_benchmark_id": 1760,
    "benchmark_id": "tau-bench-retail",
    "model_id": "claude-3-5-sonnet-20241022",
    "score": 0.692,
    "normalized_score": 0.692,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/3-5-models-and-computer-use",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.975456+00:00",
    "updated_at": "2025-07-19T19:56:14.975456+00:00",
    "benchmark_name": "TAU-bench Retail"
  }
]

================================================
FILE: data/organizations/anthropic/models/claude-3-5-sonnet-20241022/model.json
================================================
{
  "model_id": "claude-3-5-sonnet-20241022",
  "name": "Claude 3.5 Sonnet",
  "organization_id": "anthropic",
  "fine_tuned_from_model_id": null,
  "description": "Claude 3.5 Sonnet is a powerful AI model with industry-leading software engineering skills. It excels in coding, planning, and problem-solving, with significant improvements in agentic coding and tool use tasks. The model includes computer use capabilities in public beta, allowing it to interact with computer interfaces like a human user.",
  "release_date": "2024-10-22",
  "announcement_date": "2024-10-22",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.anthropic.com/en/docs/intro-to-claude#claude-3-5-family",
  "source_playground": "https://claude.ai",
  "source_paper": "https://www-cdn.anthropic.com/fed9cc193a14b84131812372d8d5857f8f304c52/Model_Card_Claude_3_Addendum.pdf",
  "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-3-5-sonnet",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.752534+00:00",
  "updated_at": "2025-07-19T19:49:05.752534+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/anthropic/models/claude-3-7-sonnet-20250219/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 478,
    "benchmark_id": "aime-2024",
    "model_id": "claude-3-7-sonnet-20250219",
    "score": 0.8,
    "normalized_score": 0.8,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.007831+00:00",
    "updated_at": "2025-07-19T19:56:12.007831+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 700,
    "benchmark_id": "aime-2025",
    "model_id": "claude-3-7-sonnet-20250219",
    "score": 0.548,
    "normalized_score": 0.548,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Parallel test-time compute (footnotes 4, 5)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.464908+00:00",
    "updated_at": "2025-07-19T19:56:12.464908+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 332,
    "benchmark_id": "gpqa",
    "model_id": "claude-3-7-sonnet-20250219",
    "score": 0.848,
    "normalized_score": 0.848,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.727330+00:00",
    "updated_at": "2025-07-19T19:56:11.727330+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 629,
    "benchmark_id": "ifeval",
    "model_id": "claude-3-7-sonnet-20250219",
    "score": 0.932,
    "normalized_score": 0.932,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.294010+00:00",
    "updated_at": "2025-07-19T19:56:12.294010+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 512,
    "benchmark_id": "math-500",
    "model_id": "claude-3-7-sonnet-20250219",
    "score": 0.962,
    "normalized_score": 0.962,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.063685+00:00",
    "updated_at": "2025-07-19T19:56:12.063685+00:00",
    "benchmark_name": "MATH-500"
  },
  {
    "model_benchmark_id": 1478,
    "benchmark_id": "mmmlu",
    "model_id": "claude-3-7-sonnet-20250219",
    "score": 0.861,
    "normalized_score": 0.861,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "Average over 14 non-English languages (footnote 3)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.152773+00:00",
    "updated_at": "2025-07-19T19:56:14.152773+00:00",
    "benchmark_name": "MMMLU"
  },
  {
    "model_benchmark_id": 582,
    "benchmark_id": "mmmu",
    "model_id": "claude-3-7-sonnet-20250219",
    "score": 0.75,
    "normalized_score": 0.75,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "validation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.197283+00:00",
    "updated_at": "2025-07-19T19:56:12.197283+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1348,
    "benchmark_id": "swe-bench-verified",
    "model_id": "claude-3-7-sonnet-20250219",
    "score": 0.703,
    "normalized_score": 0.703,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "With multiple parallel attempts and advanced scaffolding",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.838599+00:00",
    "updated_at": "2025-07-19T19:56:13.838599+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1772,
    "benchmark_id": "tau-bench-airline",
    "model_id": "claude-3-7-sonnet-20250219",
    "score": 0.584,
    "normalized_score": 0.584,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "With prompt addendum to better utilize planning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.999875+00:00",
    "updated_at": "2025-07-19T19:56:14.999875+00:00",
    "benchmark_name": "TAU-bench Airline"
  },
  {
    "model_benchmark_id": 1758,
    "benchmark_id": "tau-bench-retail",
    "model_id": "claude-3-7-sonnet-20250219",
    "score": 0.812,
    "normalized_score": 0.812,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-7-sonnet",
    "verified_by_llmstats": false,
    "analysis_method": "With prompt addendum to better utilize planning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.971988+00:00",
    "updated_at": "2025-07-19T19:56:14.971988+00:00",
    "benchmark_name": "TAU-bench Retail"
  },
  {
    "model_benchmark_id": 653,
    "benchmark_id": "terminal-bench",
    "model_id": "claude-3-7-sonnet-20250219",
    "score": 0.352,
    "normalized_score": 0.352,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Parallel test-time compute, Claude Code agent framework (footnotes 2, 5)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.350298+00:00",
    "updated_at": "2025-07-19T19:56:12.350298+00:00",
    "benchmark_name": "Terminal-bench"
  }
]

================================================
FILE: data/organizations/anthropic/models/claude-3-7-sonnet-20250219/model.json
================================================
{
  "model_id": "claude-3-7-sonnet-20250219",
  "name": "Claude 3.7 Sonnet",
  "organization_id": "anthropic",
  "fine_tuned_from_model_id": null,
  "description": "The most intelligent Claude model and the first hybrid reasoning model on the market. Claude 3.7 Sonnet can produce near-instant responses or extended, step-by-step thinking that is made visible to the user. Shows particularly strong improvements in coding and front-end web development.",
  "release_date": "2025-02-24",
  "announcement_date": "2025-02-24",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.anthropic.com/en/docs/about-claude/models/all-models",
  "source_playground": "https://claude.ai",
  "source_paper": null,
  "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-3-7-sonnet",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.747775+00:00",
  "updated_at": "2025-07-19T19:49:05.747775+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/anthropic/models/claude-3-haiku-20240307/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 27,
    "benchmark_id": "arc-c",
    "model_id": "claude-3-haiku-20240307",
    "score": 0.892,
    "normalized_score": 0.892,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "25-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.137830+00:00",
    "updated_at": "2025-07-19T19:56:11.137830+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1085,
    "benchmark_id": "big-bench-hard",
    "model_id": "claude-3-haiku-20240307",
    "score": 0.737,
    "normalized_score": 0.737,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.257814+00:00",
    "updated_at": "2025-07-19T19:56:13.257814+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 960,
    "benchmark_id": "drop",
    "model_id": "claude-3-haiku-20240307",
    "score": 0.784,
    "normalized_score": 0.784,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot, F1 score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.020609+00:00",
    "updated_at": "2025-07-19T19:56:13.020609+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 335,
    "benchmark_id": "gpqa",
    "model_id": "claude-3-haiku-20240307",
    "score": 0.333,
    "normalized_score": 0.333,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.731729+00:00",
    "updated_at": "2025-07-19T19:56:11.731729+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1009,
    "benchmark_id": "gsm8k",
    "model_id": "claude-3-haiku-20240307",
    "score": 0.889,
    "normalized_score": 0.889,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.105970+00:00",
    "updated_at": "2025-07-19T19:56:13.105970+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 53,
    "benchmark_id": "hellaswag",
    "model_id": "claude-3-haiku-20240307",
    "score": 0.859,
    "normalized_score": 0.859,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.195028+00:00",
    "updated_at": "2025-07-19T19:56:11.195028+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 803,
    "benchmark_id": "humaneval",
    "model_id": "claude-3-haiku-20240307",
    "score": 0.759,
    "normalized_score": 0.759,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.674804+00:00",
    "updated_at": "2025-07-19T19:56:12.674804+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 419,
    "benchmark_id": "math",
    "model_id": "claude-3-haiku-20240307",
    "score": 0.389,
    "normalized_score": 0.389,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.889123+00:00",
    "updated_at": "2025-07-19T19:56:11.889123+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1294,
    "benchmark_id": "mgsm",
    "model_id": "claude-3-haiku-20240307",
    "score": 0.751,
    "normalized_score": 0.751,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.709200+00:00",
    "updated_at": "2025-07-19T19:56:13.709200+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 106,
    "benchmark_id": "mmlu",
    "model_id": "claude-3-haiku-20240307",
    "score": 0.752,
    "normalized_score": 0.752,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-haiku",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.299416+00:00",
    "updated_at": "2025-07-19T19:56:11.299416+00:00",
    "benchmark_name": "MMLU"
  }
]

================================================
FILE: data/organizations/anthropic/models/claude-3-haiku-20240307/model.json
================================================
{
  "model_id": "claude-3-haiku-20240307",
  "name": "Claude 3 Haiku",
  "organization_id": "anthropic",
  "fine_tuned_from_model_id": null,
  "description": "Claude 3 Haiku is the fastest and most compact model in the Claude 3 family, designed for near-instant responsiveness. It excels at answering simple queries and requests with unmatched speed, making it ideal for seamless AI experiences that mimic human interactions.",
  "release_date": "2024-03-13",
  "announcement_date": "2024-03-13",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.anthropic.com/claude",
  "source_playground": "https://claude.ai",
  "source_paper": "https://www-cdn.anthropic.com/de8ba9b01c9ab7cbabf5c33b80b7bbc618857627/Model_Card_Claude_3.pdf",
  "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-3-haiku",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.755159+00:00",
  "updated_at": "2025-07-19T19:49:05.755159+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/anthropic/models/claude-3-opus-20240229/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 25,
    "benchmark_id": "arc-c",
    "model_id": "claude-3-opus-20240229",
    "score": 0.964,
    "normalized_score": 0.964,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "25-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.134917+00:00",
    "updated_at": "2025-07-19T19:56:11.134917+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1082,
    "benchmark_id": "big-bench-hard",
    "model_id": "claude-3-opus-20240229",
    "score": 0.868,
    "normalized_score": 0.868,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.252820+00:00",
    "updated_at": "2025-07-19T19:56:13.252820+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 956,
    "benchmark_id": "drop",
    "model_id": "claude-3-opus-20240229",
    "score": 0.831,
    "normalized_score": 0.831,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot, F1 Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.013702+00:00",
    "updated_at": "2025-07-19T19:56:13.013702+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 329,
    "benchmark_id": "gpqa",
    "model_id": "claude-3-opus-20240229",
    "score": 0.504,
    "normalized_score": 0.504,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT - Diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.722913+00:00",
    "updated_at": "2025-07-19T19:56:11.722913+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1006,
    "benchmark_id": "gsm8k",
    "model_id": "claude-3-opus-20240229",
    "score": 0.95,
    "normalized_score": 0.95,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.101310+00:00",
    "updated_at": "2025-07-19T19:56:13.101310+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 51,
    "benchmark_id": "hellaswag",
    "model_id": "claude-3-opus-20240229",
    "score": 0.954,
    "normalized_score": 0.954,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.190975+00:00",
    "updated_at": "2025-07-19T19:56:11.190975+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 799,
    "benchmark_id": "humaneval",
    "model_id": "claude-3-opus-20240229",
    "score": 0.849,
    "normalized_score": 0.849,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.668395+00:00",
    "updated_at": "2025-07-19T19:56:12.668395+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 415,
    "benchmark_id": "math",
    "model_id": "claude-3-opus-20240229",
    "score": 0.601,
    "normalized_score": 0.601,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.882261+00:00",
    "updated_at": "2025-07-19T19:56:11.882261+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1290,
    "benchmark_id": "mgsm",
    "model_id": "claude-3-opus-20240229",
    "score": 0.907,
    "normalized_score": 0.907,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.701952+00:00",
    "updated_at": "2025-07-19T19:56:13.701952+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 103,
    "benchmark_id": "mmlu",
    "model_id": "claude-3-opus-20240229",
    "score": 0.868,
    "normalized_score": 0.868,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.294591+00:00",
    "updated_at": "2025-07-19T19:56:11.294591+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 208,
    "benchmark_id": "mmlu-pro",
    "model_id": "claude-3-opus-20240229",
    "score": 0.685,
    "normalized_score": 0.685,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2406.01574",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.496438+00:00",
    "updated_at": "2025-07-19T19:56:11.496438+00:00",
    "benchmark_name": "MMLU-Pro"
  }
]

================================================
FILE: data/organizations/anthropic/models/claude-3-opus-20240229/model.json
================================================
{
  "model_id": "claude-3-opus-20240229",
  "name": "Claude 3 Opus",
  "organization_id": "anthropic",
  "fine_tuned_from_model_id": null,
  "description": "Claude 3 Opus is Anthropic's most intelligent model, with best-in-market performance on highly complex tasks. It can navigate open-ended prompts and sight-unseen scenarios with remarkable fluency and human-like understanding, showing the outer limits of what's possible with generative AI.",
  "release_date": "2024-02-29",
  "announcement_date": "2024-02-29",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.anthropic.com/claude",
  "source_playground": "https://claude.ai",
  "source_paper": "https://www-cdn.anthropic.com/de8ba9b01c9ab7cbabf5c33b80b7bbc618857627/Model_Card_Claude_3.pdf",
  "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-3-family",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.738279+00:00",
  "updated_at": "2025-07-19T19:49:05.738279+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/anthropic/models/claude-3-sonnet-20240229/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 26,
    "benchmark_id": "arc-c",
    "model_id": "claude-3-sonnet-20240229",
    "score": 0.932,
    "normalized_score": 0.932,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "25-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.136363+00:00",
    "updated_at": "2025-07-19T19:56:11.136363+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1083,
    "benchmark_id": "big-bench-hard",
    "model_id": "claude-3-sonnet-20240229",
    "score": 0.829,
    "normalized_score": 0.829,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.254531+00:00",
    "updated_at": "2025-07-19T19:56:13.254531+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 957,
    "benchmark_id": "drop",
    "model_id": "claude-3-sonnet-20240229",
    "score": 0.789,
    "normalized_score": 0.789,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot, F1 score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.015601+00:00",
    "updated_at": "2025-07-19T19:56:13.015601+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 330,
    "benchmark_id": "gpqa",
    "model_id": "claude-3-sonnet-20240229",
    "score": 0.404,
    "normalized_score": 0.404,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT - Diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.724379+00:00",
    "updated_at": "2025-07-19T19:56:11.724379+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1007,
    "benchmark_id": "gsm8k",
    "model_id": "claude-3-sonnet-20240229",
    "score": 0.923,
    "normalized_score": 0.923,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.102758+00:00",
    "updated_at": "2025-07-19T19:56:13.102758+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 52,
    "benchmark_id": "hellaswag",
    "model_id": "claude-3-sonnet-20240229",
    "score": 0.89,
    "normalized_score": 0.89,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.193193+00:00",
    "updated_at": "2025-07-19T19:56:11.193193+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 800,
    "benchmark_id": "humaneval",
    "model_id": "claude-3-sonnet-20240229",
    "score": 0.73,
    "normalized_score": 0.73,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.670119+00:00",
    "updated_at": "2025-07-19T19:56:12.670119+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 416,
    "benchmark_id": "math",
    "model_id": "claude-3-sonnet-20240229",
    "score": 0.431,
    "normalized_score": 0.431,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.884160+00:00",
    "updated_at": "2025-07-19T19:56:11.884160+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1291,
    "benchmark_id": "mgsm",
    "model_id": "claude-3-sonnet-20240229",
    "score": 0.835,
    "normalized_score": 0.835,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.703593+00:00",
    "updated_at": "2025-07-19T19:56:13.703593+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 104,
    "benchmark_id": "mmlu",
    "model_id": "claude-3-sonnet-20240229",
    "score": 0.79,
    "normalized_score": 0.79,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-3-family",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.296409+00:00",
    "updated_at": "2025-07-19T19:56:11.296409+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 209,
    "benchmark_id": "mmlu-pro",
    "model_id": "claude-3-sonnet-20240229",
    "score": 0.568,
    "normalized_score": 0.568,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2406.01574",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.498008+00:00",
    "updated_at": "2025-07-19T19:56:11.498008+00:00",
    "benchmark_name": "MMLU-Pro"
  }
]

================================================
FILE: data/organizations/anthropic/models/claude-3-sonnet-20240229/model.json
================================================
{
  "model_id": "claude-3-sonnet-20240229",
  "name": "Claude 3 Sonnet",
  "organization_id": "anthropic",
  "fine_tuned_from_model_id": null,
  "description": "Claude 3 Sonnet strikes the ideal balance between intelligence and speed\u2014particularly for enterprise workloads. It delivers strong performance at a lower cost compared to its peers, and is engineered for high endurance in large-scale AI deployments.",
  "release_date": "2024-02-29",
  "announcement_date": "2024-02-29",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.anthropic.com/claude",
  "source_playground": "https://claude.ai",
  "source_paper": "https://www-cdn.anthropic.com/de8ba9b01c9ab7cbabf5c33b80b7bbc618857627/Model_Card_Claude_3.pdf",
  "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-3-family",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.740647+00:00",
  "updated_at": "2025-07-19T19:49:05.740647+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/anthropic/models/claude-haiku-4-5-20251015/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 22228,
    "benchmark_id": "swe-bench-verified",
    "model_id": "claude-haiku-4-5-20251015",
    "score": 0.733,
    "normalized_score": 0.733,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-15T00:00:00.000000+00:00",
    "updated_at": "2025-10-15T00:00:00.000000+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 22229,
    "benchmark_id": "terminal-bench",
    "model_id": "claude-haiku-4-5-20251015",
    "score": 0.41,
    "normalized_score": 0.41,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-15T00:00:00.000000+00:00",
    "updated_at": "2025-10-15T00:00:00.000000+00:00",
    "benchmark_name": "Terminal-Bench"
  },
  {
    "model_benchmark_id": 22230,
    "benchmark_id": "tau2-retail",
    "model_id": "claude-haiku-4-5-20251015",
    "score": 0.832,
    "normalized_score": 0.832,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-15T00:00:00.000000+00:00",
    "updated_at": "2025-10-15T00:00:00.000000+00:00",
    "benchmark_name": "Tau2 Retail"
  },
  {
    "model_benchmark_id": 22231,
    "benchmark_id": "tau2-airline",
    "model_id": "claude-haiku-4-5-20251015",
    "score": 0.636,
    "normalized_score": 0.636,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-15T00:00:00.000000+00:00",
    "updated_at": "2025-10-15T00:00:00.000000+00:00",
    "benchmark_name": "Tau2 Airline"
  },
  {
    "model_benchmark_id": 22232,
    "benchmark_id": "tau2-telecom",
    "model_id": "claude-haiku-4-5-20251015",
    "score": 0.83,
    "normalized_score": 0.83,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-15T00:00:00.000000+00:00",
    "updated_at": "2025-10-15T00:00:00.000000+00:00",
    "benchmark_name": "Tau2 Telecom"
  },
  {
    "model_benchmark_id": 22233,
    "benchmark_id": "osworld",
    "model_id": "claude-haiku-4-5-20251015",
    "score": 0.507,
    "normalized_score": 0.507,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-15T00:00:00.000000+00:00",
    "updated_at": "2025-10-15T00:00:00.000000+00:00",
    "benchmark_name": "OSWorld"
  },
  {
    "model_benchmark_id": 22234,
    "benchmark_id": "aime-2025",
    "model_id": "claude-haiku-4-5-20251015",
    "score": 0.963,
    "normalized_score": 0.963,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "python",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-15T00:00:00.000000+00:00",
    "updated_at": "2025-10-15T00:00:00.000000+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 22235,
    "benchmark_id": "aime-2025",
    "model_id": "claude-haiku-4-5-20251015",
    "score": 0.807,
    "normalized_score": 0.807,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "no tools",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-15T00:00:00.000000+00:00",
    "updated_at": "2025-10-15T00:00:00.000000+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 22236,
    "benchmark_id": "gpqa",
    "model_id": "claude-haiku-4-5-20251015",
    "score": 0.73,
    "normalized_score": 0.73,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond subset",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-15T00:00:00.000000+00:00",
    "updated_at": "2025-10-15T00:00:00.000000+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 22237,
    "benchmark_id": "mmmlu",
    "model_id": "claude-haiku-4-5-20251015",
    "score": 0.83,
    "normalized_score": 0.83,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-15T00:00:00.000000+00:00",
    "updated_at": "2025-10-15T00:00:00.000000+00:00",
    "benchmark_name": "MMMLU"
  },
  {
    "model_benchmark_id": 22238,
    "benchmark_id": "mmmu-(validation)",
    "model_id": "claude-haiku-4-5-20251015",
    "score": 0.732,
    "normalized_score": 0.732,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-haiku-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-15T00:00:00.000000+00:00",
    "updated_at": "2025-10-15T00:00:00.000000+00:00",
    "benchmark_name": "MMMU (validation)"
  },
  {
    "model_benchmark_id": 22239,
    "benchmark_id": "cybersecurity-ctfs",
    "model_id": "claude-haiku-4-5-20251015",
    "score": 0.46875,
    "normalized_score": 0.46875,
    "is_self_reported": true,
    "self_reported_source_link": "https://assets.anthropic.com/m/99128ddd009bdcb/original/Claude-Haiku-4-5-System-Card.pdf",
    "verified_by_llmstats": false,
    "analysis_method": "32-challenge subset",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "15/32 challenges solved (pass@30)",
    "created_at": "2025-10-15T00:00:00.000000+00:00",
    "updated_at": "2025-10-15T00:00:00.000000+00:00",
    "benchmark_name": "Cybersecurity CTFs"
  }
]


================================================
FILE: data/organizations/anthropic/models/claude-haiku-4-5-20251015/model.json
================================================
{
  "model_id": "claude-haiku-4-5-20251015",
  "name": "Claude Haiku 4.5",
  "organization_id": "anthropic",
  "fine_tuned_from_model_id": null,
  "description": "Claude Haiku 4.5 is Anthropic's fastest, most cost-efficient model, matching Sonnet 4's performance on coding, computer use, and agent tasks. It offers similar performance to Sonnet 4 at one-third the cost and more than twice the speed, making it ideal for high-volume, latency-sensitive applications and multi-agent orchestration.",
  "release_date": "2025-10-15",
  "announcement_date": "2025-10-15",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2025-02-01",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.anthropic.com/en/docs/about-claude/models",
  "source_playground": "https://claude.ai",
  "source_paper": "https://assets.anthropic.com/m/99128ddd009bdcb/original/Claude-Haiku-4-5-System-Card.pdf",
  "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-haiku-4-5",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-10-15T00:00:00.000000+00:00",
  "updated_at": "2025-10-15T00:00:00.000000+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/anthropic/models/claude-opus-4-1-20250805/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 2001,
    "benchmark_id": "swe-bench-verified",
    "model_id": "claude-opus-4-1-20250805",
    "score": 0.745,
    "normalized_score": 0.745,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-opus-4-1",
    "verified_by_llmstats": false,
    "analysis_method": "No extended thinking. Simple scaffold with bash tool and file editing tool via string replacements. Scores reported out of full 500 problems.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T00:00:00.000000+00:00",
    "updated_at": "2025-08-05T00:00:00.000000+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 2002,
    "benchmark_id": "terminal-bench",
    "model_id": "claude-opus-4-1-20250805",
    "score": 0.433,
    "normalized_score": 0.433,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-opus-4-1",
    "verified_by_llmstats": false,
    "analysis_method": "No extended thinking. Terminus 1 averaged over 5 trials.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T00:00:00.000000+00:00",
    "updated_at": "2025-08-05T00:00:00.000000+00:00",
    "benchmark_name": "Terminal-bench"
  },
  {
    "model_benchmark_id": 2003,
    "benchmark_id": "gpqa",
    "model_id": "claude-opus-4-1-20250805",
    "score": 0.809,
    "normalized_score": 0.809,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-opus-4-1",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond: Extended thinking (up to 64K tokens)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T00:00:00.000000+00:00",
    "updated_at": "2025-08-05T00:00:00.000000+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 2004,
    "benchmark_id": "tau-bench-retail",
    "model_id": "claude-opus-4-1-20250805",
    "score": 0.824,
    "normalized_score": 0.824,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-opus-4-1",
    "verified_by_llmstats": false,
    "analysis_method": "Extended thinking with tool use (up to 64K tokens, prompt addendum, increased max steps from 30 to 100).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T00:00:00.000000+00:00",
    "updated_at": "2025-08-05T00:00:00.000000+00:00",
    "benchmark_name": "TAU-bench Retail"
  },
  {
    "model_benchmark_id": 2005,
    "benchmark_id": "tau-bench-airline",
    "model_id": "claude-opus-4-1-20250805",
    "score": 0.56,
    "normalized_score": 0.56,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-opus-4-1",
    "verified_by_llmstats": false,
    "analysis_method": "Extended thinking with tool use (up to 64K tokens, prompt addendum, increased max steps from 30 to 100).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T00:00:00.000000+00:00",
    "updated_at": "2025-08-05T00:00:00.000000+00:00",
    "benchmark_name": "TAU-bench Airline"
  },
  {
    "model_benchmark_id": 2006,
    "benchmark_id": "mmmlu",
    "model_id": "claude-opus-4-1-20250805",
    "score": 0.895,
    "normalized_score": 0.895,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-opus-4-1",
    "verified_by_llmstats": false,
    "analysis_method": "Extended thinking (up to 64K tokens). Average over 14 non-English languages.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T00:00:00.000000+00:00",
    "updated_at": "2025-08-05T00:00:00.000000+00:00",
    "benchmark_name": "MMMLU"
  },
  {
    "model_benchmark_id": 2007,
    "benchmark_id": "mmmu-(validation)",
    "model_id": "claude-opus-4-1-20250805",
    "score": 0.771,
    "normalized_score": 0.771,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-opus-4-1",
    "verified_by_llmstats": false,
    "analysis_method": "Extended thinking (up to 64K tokens)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T00:00:00.000000+00:00",
    "updated_at": "2025-08-05T00:00:00.000000+00:00",
    "benchmark_name": "MMMU (validation)"
  },
  {
    "model_benchmark_id": 2008,
    "benchmark_id": "aime-2025",
    "model_id": "claude-opus-4-1-20250805",
    "score": 0.78,
    "normalized_score": 0.78,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-opus-4-1",
    "verified_by_llmstats": false,
    "analysis_method": "Extended thinking (up to 64K tokens). AIME 2025 using nucleus sampling with a top_p of 0.95.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T00:00:00.000000+00:00",
    "updated_at": "2025-08-05T00:00:00.000000+00:00",
    "benchmark_name": "AIME 2025"
  }
]


================================================
FILE: data/organizations/anthropic/models/claude-opus-4-1-20250805/model.json
================================================
{
  "model_id": "claude-opus-4-1-20250805",
  "name": "Claude Opus 4.1",
  "organization_id": "anthropic",
  "fine_tuned_from_model_id": null,
  "description": "Claude Opus 4.1 is a hybrid reasoning model that pushes the frontier for coding and AI agents, featuring a 200K context window. It delivers superior performance and precision for real-world coding and agentic tasks, handling complex multi-step problems with rigor and attention to detail. With extended thinking capabilities, it offers instant responses or extended step-by-step thinking visible through user-friendly summaries. It advances state-of-the-art coding performance to 74.5% on SWE-bench Verified, excels at agentic search and research, and produces human-quality content with exceptional writing abilities. It supports 32K output tokens and adapts to specific coding styles while delivering exceptional quality for extensive generation and refactoring projects.",
  "release_date": "2025-08-05",
  "announcement_date": "2025-08-05",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.anthropic.com/en/docs/about-claude/models/all-models",
  "source_playground": "https://claude.ai",
  "source_paper": null,
  "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-opus-4-1",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-08-05T00:00:00.000000+00:00",
  "updated_at": "2025-08-05T00:00:00.000000+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/anthropic/models/claude-opus-4-20250514/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 702,
    "benchmark_id": "aime-2025",
    "model_id": "claude-opus-4-20250514",
    "score": 0.755,
    "normalized_score": 0.755,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Extended thinking (up to 64K tokens) with parallel test-time compute (multiple attempts, internal scoring model selection). Nucleus sampling (top_p 0.95). Based on footnotes 4, 5 and blog appendix.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.468994+00:00",
    "updated_at": "2025-07-19T19:56:12.468994+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1388,
    "benchmark_id": "arc-agi-v2",
    "model_id": "claude-opus-4-20250514",
    "score": 0.086,
    "normalized_score": 0.086,
    "is_self_reported": false,
    "self_reported_source_link": "https://x.com/xai/status/1943158495588815072",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.923803+00:00",
    "updated_at": "2025-07-19T19:56:13.923803+00:00",
    "benchmark_name": "ARC-AGI v2"
  },
  {
    "model_benchmark_id": 337,
    "benchmark_id": "gpqa",
    "model_id": "claude-opus-4-20250514",
    "score": 0.796,
    "normalized_score": 0.796,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond: Extended thinking (up to 64K tokens) with parallel test-time compute (multiple attempts, internal scoring model selection). Based on footnote 5 and blog appendix.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.734764+00:00",
    "updated_at": "2025-07-19T19:56:11.734764+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1480,
    "benchmark_id": "mmmlu",
    "model_id": "claude-opus-4-20250514",
    "score": 0.888,
    "normalized_score": 0.888,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Extended thinking (up to 64K tokens). Average over 14 non-English languages. Based on blog appendix and footnote 3.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.155829+00:00",
    "updated_at": "2025-07-19T19:56:14.155829+00:00",
    "benchmark_name": "MMMLU"
  },
  {
    "model_benchmark_id": 1815,
    "benchmark_id": "mmmu-(validation)",
    "model_id": "claude-opus-4-20250514",
    "score": 0.765,
    "normalized_score": 0.765,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Extended thinking (up to 64K tokens). Based on blog appendix.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.120938+00:00",
    "updated_at": "2025-07-19T19:56:15.120938+00:00",
    "benchmark_name": "MMMU (validation)"
  },
  {
    "model_benchmark_id": 1351,
    "benchmark_id": "swe-bench-verified",
    "model_id": "claude-opus-4-20250514",
    "score": 0.725,
    "normalized_score": 0.725,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Parallel test-time compute (multiple attempts, internal scoring model selection). No extended thinking. Based on footnote 5 and SWE-bench methodology for high compute.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.843719+00:00",
    "updated_at": "2025-07-19T19:56:13.843719+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1775,
    "benchmark_id": "tau-bench-airline",
    "model_id": "claude-opus-4-20250514",
    "score": 0.596,
    "normalized_score": 0.596,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Extended thinking with tool use (up to 64K tokens, prompt addendum, increased max steps). Based on blog appendix and TAU-bench methodology.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.005622+00:00",
    "updated_at": "2025-07-19T19:56:15.005622+00:00",
    "benchmark_name": "TAU-bench Airline"
  },
  {
    "model_benchmark_id": 1761,
    "benchmark_id": "tau-bench-retail",
    "model_id": "claude-opus-4-20250514",
    "score": 0.814,
    "normalized_score": 0.814,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Extended thinking with tool use (up to 64K tokens, prompt addendum, increased max steps). Based on blog appendix and TAU-bench methodology.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.977090+00:00",
    "updated_at": "2025-07-19T19:56:14.977090+00:00",
    "benchmark_name": "TAU-bench Retail"
  },
  {
    "model_benchmark_id": 655,
    "benchmark_id": "terminal-bench",
    "model_id": "claude-opus-4-20250514",
    "score": 0.392,
    "normalized_score": 0.392,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Parallel test-time compute (multiple attempts, internal scoring model selection). No extended thinking. Claude Code as agent framework. Based on footnotes 2 and 5.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.354970+00:00",
    "updated_at": "2025-07-19T19:56:12.354970+00:00",
    "benchmark_name": "Terminal-bench"
  }
]


================================================
FILE: data/organizations/anthropic/models/claude-opus-4-20250514/model.json
================================================
{
  "model_id": "claude-opus-4-20250514",
  "name": "Claude Opus 4",
  "organization_id": "anthropic",
  "fine_tuned_from_model_id": null,
  "description": "Claude Opus 4 is Anthropic's most powerful model and the world's best coding model, part of the Claude 4 family. It delivers sustained performance on complex, long-running tasks and agent workflows. Opus 4 excels at coding, advanced reasoning, and can use tools (like web search) during extended thinking. It supports parallel tool execution and has improved memory capabilities.",
  "release_date": "2025-05-22",
  "announcement_date": "2025-05-22",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.anthropic.com/en/docs/about-claude/models/all-models",
  "source_playground": "https://claude.ai",
  "source_paper": null,
  "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-4",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.760983+00:00",
  "updated_at": "2025-07-19T19:49:05.760983+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/anthropic/models/claude-sonnet-4-20250514/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 701,
    "benchmark_id": "aime-2025",
    "model_id": "claude-sonnet-4-20250514",
    "score": 0.705,
    "normalized_score": 0.705,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Extended thinking (up to 64K tokens) with parallel test-time compute (multiple attempts, internal scoring model selection). Nucleus sampling (top_p 0.95). Based on footnotes 4, 5 and blog appendix.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.466833+00:00",
    "updated_at": "2025-07-19T19:56:12.466833+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 333,
    "benchmark_id": "gpqa",
    "model_id": "claude-sonnet-4-20250514",
    "score": 0.754,
    "normalized_score": 0.754,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond: Extended thinking (up to 64K tokens) with parallel test-time compute (multiple attempts, internal scoring model selection). Based on footnote 5 and blog appendix.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.728759+00:00",
    "updated_at": "2025-07-19T19:56:11.728759+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1479,
    "benchmark_id": "mmmlu",
    "model_id": "claude-sonnet-4-20250514",
    "score": 0.865,
    "normalized_score": 0.865,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Extended thinking (up to 64K tokens). Average over 14 non-English languages. Based on blog appendix and footnote 3.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.154357+00:00",
    "updated_at": "2025-07-19T19:56:14.154357+00:00",
    "benchmark_name": "MMMLU"
  },
  {
    "model_benchmark_id": 583,
    "benchmark_id": "mmmu",
    "model_id": "claude-sonnet-4-20250514",
    "score": 0.744,
    "normalized_score": 0.744,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Extended thinking (up to 64K tokens). Based on blog appendix.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.199608+00:00",
    "updated_at": "2025-07-19T19:56:12.199608+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1349,
    "benchmark_id": "swe-bench-verified",
    "model_id": "claude-sonnet-4-20250514",
    "score": 0.727,
    "normalized_score": 0.727,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Parallel test-time compute (multiple attempts, internal scoring model selection). No extended thinking. Based on footnote 5 and SWE-bench methodology for high compute.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.840540+00:00",
    "updated_at": "2025-07-19T19:56:13.840540+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1773,
    "benchmark_id": "tau-bench-airline",
    "model_id": "claude-sonnet-4-20250514",
    "score": 0.6,
    "normalized_score": 0.6,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Extended thinking with tool use (up to 64K tokens, prompt addendum, increased max steps). Based on blog appendix and TAU-bench methodology.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.002282+00:00",
    "updated_at": "2025-07-19T19:56:15.002282+00:00",
    "benchmark_name": "TAU-bench Airline"
  },
  {
    "model_benchmark_id": 1759,
    "benchmark_id": "tau-bench-retail",
    "model_id": "claude-sonnet-4-20250514",
    "score": 0.805,
    "normalized_score": 0.805,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Extended thinking with tool use (up to 64K tokens, prompt addendum, increased max steps). Based on blog appendix and TAU-bench methodology.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.973668+00:00",
    "updated_at": "2025-07-19T19:56:14.973668+00:00",
    "benchmark_name": "TAU-bench Retail"
  },
  {
    "model_benchmark_id": 654,
    "benchmark_id": "terminal-bench",
    "model_id": "claude-sonnet-4-20250514",
    "score": 0.355,
    "normalized_score": 0.355,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-4",
    "verified_by_llmstats": false,
    "analysis_method": "Parallel test-time compute (multiple attempts, internal scoring model selection). No extended thinking. Claude Code as agent framework. Based on footnotes 2 and 5.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.353338+00:00",
    "updated_at": "2025-07-19T19:56:12.353338+00:00",
    "benchmark_name": "Terminal-bench"
  }
]


================================================
FILE: data/organizations/anthropic/models/claude-sonnet-4-20250514/model.json
================================================
{
  "model_id": "claude-sonnet-4-20250514",
  "name": "Claude Sonnet 4",
  "organization_id": "anthropic",
  "fine_tuned_from_model_id": null,
  "description": "Claude Sonnet 4, part of the Claude 4 family, is a significant upgrade to Claude Sonnet 3.7. It excels in coding (72.7% on SWE-bench) and reasoning, responding more precisely to instructions. Sonnet 4 offers an optimal mix of capability and practicality, with enhanced steerability, and supports extended thinking with tool use.",
  "release_date": "2025-05-22",
  "announcement_date": "2025-05-22",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.anthropic.com/en/docs/about-claude/models/all-models",
  "source_playground": "https://claude.ai",
  "source_paper": null,
  "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-4",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.750182+00:00",
  "updated_at": "2025-07-19T19:49:05.750182+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/anthropic/models/claude-sonnet-4-5-20250929/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 701,
    "benchmark_id": "swe-bench-verified-(agentic-coding)",
    "model_id": "claude-sonnet-4-5-20250929",
    "score": 0.772,
    "normalized_score": 0.772,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "Agentic coding",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T19:56:12.466833+00:00",
    "updated_at": "2025-09-29T19:56:12.466833+00:00",
    "benchmark_name": "SWE-bench Verified (Agentic Coding)"
  },
  {
    "model_benchmark_id": 702,
    "benchmark_id": "terminal-bench",
    "model_id": "claude-sonnet-4-5-20250929",
    "score": 0.5,
    "normalized_score": 0.5,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "Agentic terminal coding",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T19:56:12.466833+00:00",
    "updated_at": "2025-09-29T19:56:12.466833+00:00",
    "benchmark_name": "Terminal-Bench"
  },
  {
    "model_benchmark_id": 703,
    "benchmark_id": "osworld",
    "model_id": "claude-sonnet-4-5-20250929",
    "score": 0.614,
    "normalized_score": 0.614,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "Computer use",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T19:56:12.466833+00:00",
    "updated_at": "2025-09-29T19:56:12.466833+00:00",
    "benchmark_name": "OSWorld"
  },
  {
    "model_benchmark_id": 704,
    "benchmark_id": "aime-2025",
    "model_id": "claude-sonnet-4-5-20250929",
    "score": 0.87,
    "normalized_score": 0.87,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "High school math competition",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T19:56:12.466833+00:00",
    "updated_at": "2025-09-29T19:56:12.466833+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 705,
    "benchmark_id": "gpqa",
    "model_id": "claude-sonnet-4-5-20250929",
    "score": 0.834,
    "normalized_score": 0.834,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "Graduate-level reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T19:56:12.466833+00:00",
    "updated_at": "2025-09-29T19:56:12.466833+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 706,
    "benchmark_id": "mmmlu",
    "model_id": "claude-sonnet-4-5-20250929",
    "score": 0.891,
    "normalized_score": 0.891,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "Multilingual Q&A",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T19:56:12.466833+00:00",
    "updated_at": "2025-09-29T19:56:12.466833+00:00",
    "benchmark_name": "MMMLU"
  },
  {
    "model_benchmark_id": 707,
    "benchmark_id": "tau-bench-retail",
    "model_id": "claude-sonnet-4-5-20250929",
    "score": 0.862,
    "normalized_score": 0.862,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "Agentic tool use",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T19:56:12.466833+00:00",
    "updated_at": "2025-09-29T19:56:12.466833+00:00",
    "benchmark_name": "TAU-bench Retail"
  },
  {
    "model_benchmark_id": 708,
    "benchmark_id": "tau-bench-airline",
    "model_id": "claude-sonnet-4-5-20250929",
    "score": 0.7,
    "normalized_score": 0.7,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "Agentic tool use",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T19:56:12.466833+00:00",
    "updated_at": "2025-09-29T19:56:12.466833+00:00",
    "benchmark_name": "TAU-bench Airline"
  },
  {
    "model_benchmark_id": 710,
    "benchmark_id": "mmmuval",
    "model_id": "claude-sonnet-4-5-20250929",
    "score": 0.778,
    "normalized_score": 0.778,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.anthropic.com/news/claude-sonnet-4-5",
    "verified_by_llmstats": false,
    "analysis_method": "Visual reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T19:56:12.466833+00:00",
    "updated_at": "2025-09-29T19:56:12.466833+00:00",
    "benchmark_name": "MMMUval"
  }
]


================================================
FILE: data/organizations/anthropic/models/claude-sonnet-4-5-20250929/model.json
================================================
{
  "model_id": "claude-sonnet-4-5-20250929",
  "name": "Claude Sonnet 4.5",
  "organization_id": "anthropic",
  "fine_tuned_from_model_id": null,
  "description": "Claude Sonnet 4.5 is the best coding model in the world. It's the strongest model for building complex agents. It's the best model at using computers. And it shows substantial gains in reasoning and math. It offers the highest intelligence across most tasks, with exceptional agent and coding capabilities.",
  "release_date": "2025-09-29",
  "announcement_date": "2025-09-29",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2025-01-31",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.anthropic.com/en/docs/about-claude/models/all-models",
  "source_playground": "https://claude.ai",
  "source_paper": null,
  "source_scorecard_blog_link": "https://www.anthropic.com/news/claude-sonnet-4-5",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.750182+00:00",
  "updated_at": "2025-07-19T19:49:05.750182+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/anthropic/organization.json
================================================
{
  "organization_id": "anthropic",
  "name": "Anthropic",
  "website": "https://anthropic.com",
  "description": "AI safety company",
  "country": "US",
  "created_at": "2025-07-19T19:49:05.736520+00:00",
  "updated_at": "2025-07-19T19:49:05.736520+00:00"
}


================================================
FILE: data/organizations/cohere/models/command-r-plus-04-2024/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1,
    "benchmark_id": "arc-c",
    "model_id": "command-r-plus-04-2024",
    "score": 0.7099,
    "normalized_score": 0.7099,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Standardized Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.062949+00:00",
    "updated_at": "2025-07-19T19:56:11.062949+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 157,
    "benchmark_id": "gsm8k",
    "model_id": "command-r-plus-04-2024",
    "score": 0.707,
    "normalized_score": 0.707,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Standardized Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.401017+00:00",
    "updated_at": "2025-07-19T19:56:11.401017+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 32,
    "benchmark_id": "hellaswag",
    "model_id": "command-r-plus-04-2024",
    "score": 0.886,
    "normalized_score": 0.886,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Standardized Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.149067+00:00",
    "updated_at": "2025-07-19T19:56:11.149067+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 56,
    "benchmark_id": "mmlu",
    "model_id": "command-r-plus-04-2024",
    "score": 0.757,
    "normalized_score": 0.757,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Standardized Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.202939+00:00",
    "updated_at": "2025-07-19T19:56:11.202939+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 131,
    "benchmark_id": "truthfulqa",
    "model_id": "command-r-plus-04-2024",
    "score": 0.563,
    "normalized_score": 0.563,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Standardized Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.341733+00:00",
    "updated_at": "2025-07-19T19:56:11.341733+00:00",
    "benchmark_name": "TruthfulQA"
  },
  {
    "model_benchmark_id": 147,
    "benchmark_id": "winogrande",
    "model_id": "command-r-plus-04-2024",
    "score": 0.854,
    "normalized_score": 0.854,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Standardized Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.378573+00:00",
    "updated_at": "2025-07-19T19:56:11.378573+00:00",
    "benchmark_name": "Winogrande"
  }
]

================================================
FILE: data/organizations/cohere/models/command-r-plus-04-2024/model.json
================================================
{
  "model_id": "command-r-plus-04-2024",
  "name": "Command R+",
  "organization_id": "cohere",
  "fine_tuned_from_model_id": null,
  "description": "C4AI Command R+ is a 104 billion parameter model with advanced capabilities, including Retrieval Augmented Generation (RAG) and multi-step tool use, optimized for multilingual tasks.",
  "release_date": "2024-08-30",
  "announcement_date": "2024-08-30",
  "license_id": "cc_by_nc",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 104000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.cohere.com/v2/docs/command-r-plus",
  "source_playground": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
  "source_paper": null,
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
  "source_weights_link": "",
  "created_at": "2025-07-19T19:49:05.415748+00:00",
  "updated_at": "2025-07-19T19:49:05.415748+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/cohere/organization.json
================================================
{
  "organization_id": "cohere",
  "name": "Cohere",
  "website": "https://cohere.ai",
  "description": "Enterprise AI company",
  "country": "CA",
  "created_at": "2025-07-19T19:49:05.404836+00:00",
  "updated_at": "2025-07-19T19:49:05.404836+00:00"
}


================================================
FILE: data/organizations/deepseek/models/deepseek-r1/benchmarks.json
================================================
[]


================================================
FILE: data/organizations/deepseek/models/deepseek-r1/model.json
================================================
{
  "model_id": "deepseek-r1",
  "name": "DeepSeek-R1",
  "organization_id": "deepseek",
  "model_family_id": null,
  "fine_tuned_from_model_id": null,
  "description": "DeepSeek-R1 is a reasoning-focused language model from DeepSeek that features advanced thinking capabilities. It serves as the foundation for DeepSeek's reasoning model family and pioneered their thinking mode approach for complex problem-solving tasks.",
  "release_date": "2025-01-20",
  "announcement_date": "2025-01-20",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 671000000000,
  "training_tokens": null,
  "available_in_zeroeval": false,
  "source_api_ref": "https://api.deepseek.com/docs",
  "source_playground": "https://chat.deepseek.com/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://www.deepseek.com/",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1",
  "created_at": "2025-01-20T00:00:00.000000+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00"
}


================================================
FILE: data/organizations/deepseek/models/deepseek-r1-0528/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 9601,
    "benchmark_id": "mmlu-redux",
    "model_id": "deepseek-r1-0528",
    "score": 0.934,
    "normalized_score": 0.934,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-05-28T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-Redux"
  },
  {
    "model_benchmark_id": 9602,
    "benchmark_id": "mmlu-pro",
    "model_id": "deepseek-r1-0528",
    "score": 0.85,
    "normalized_score": 0.85,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-05-28T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 9603,
    "benchmark_id": "gpqa",
    "model_id": "deepseek-r1-0528",
    "score": 0.81,
    "normalized_score": 0.81,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-05-28T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 9604,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "deepseek-r1-0528",
    "score": 0.177,
    "normalized_score": 0.177,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Thinking mode, text-only subset",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Text-only subset evaluation",
    "created_at": "2025-05-28T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 9605,
    "benchmark_id": "browsecomp",
    "model_id": "deepseek-r1-0528",
    "score": 0.089,
    "normalized_score": 0.089,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Search agent with pre-defined workflow",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Evaluated with pre-defined workflow",
    "created_at": "2025-05-28T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "BrowseComp"
  },
  {
    "model_benchmark_id": 9606,
    "benchmark_id": "browsecomp-zh",
    "model_id": "deepseek-r1-0528",
    "score": 0.357,
    "normalized_score": 0.357,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Search agent with pre-defined workflow",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Evaluated with pre-defined workflow",
    "created_at": "2025-05-28T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "BrowseComp-zh"
  },
  {
    "model_benchmark_id": 9607,
    "benchmark_id": "simpleqa",
    "model_id": "deepseek-r1-0528",
    "score": 0.923,
    "normalized_score": 0.923,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Search agent evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-05-28T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 9608,
    "benchmark_id": "livecodebench",
    "model_id": "deepseek-r1-0528",
    "score": 0.733,
    "normalized_score": 0.733,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, 2408-2505, Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-05-28T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 9609,
    "benchmark_id": "codeforces",
    "model_id": "deepseek-r1-0528",
    "score": 0.6433,
    "normalized_score": 0.6433,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Div1 Rating, Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-05-28T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Codeforces"
  },
  {
    "model_benchmark_id": 9610,
    "benchmark_id": "aider-polyglot",
    "model_id": "deepseek-r1-0528",
    "score": 0.716,
    "normalized_score": 0.716,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-05-28T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 9611,
    "benchmark_id": "swe-bench-verified",
    "model_id": "deepseek-r1-0528",
    "score": 0.446,
    "normalized_score": 0.446,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Agent mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Evaluated with internal code agent framework",
    "created_at": "2025-05-28T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 9612,
    "benchmark_id": "swe-bench-multilingual",
    "model_id": "deepseek-r1-0528",
    "score": 0.305,
    "normalized_score": 0.305,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Agent mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Evaluated with internal code agent framework",
    "created_at": "2025-05-28T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "SWE-Bench Multilingual"
  },
  {
    "model_benchmark_id": 9613,
    "benchmark_id": "terminal-bench",
    "model_id": "deepseek-r1-0528",
    "score": 0.057,
    "normalized_score": 0.057,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Terminus 1 framework",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-05-28T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Terminal-Bench"
  },
  {
    "model_benchmark_id": 9614,
    "benchmark_id": "aime-2024",
    "model_id": "deepseek-r1-0528",
    "score": 0.914,
    "normalized_score": 0.914,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-05-28T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 9615,
    "benchmark_id": "aime-2025",
    "model_id": "deepseek-r1-0528",
    "score": 0.875,
    "normalized_score": 0.875,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-05-28T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 9616,
    "benchmark_id": "hmmt-2025",
    "model_id": "deepseek-r1-0528",
    "score": 0.794,
    "normalized_score": 0.794,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-05-28T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "HMMT 2025"
  }
]

================================================
FILE: data/organizations/deepseek/models/deepseek-r1-0528/model.json
================================================
{
  "model_id": "deepseek-r1-0528",
  "name": "DeepSeek-R1-0528",
  "organization_id": "deepseek",
  "model_family_id": null,
  "fine_tuned_from_model_id": "deepseek-r1",
  "description": "DeepSeek-R1-0528 is the May 28, 2025 version of DeepSeek's reasoning model. It features advanced thinking capabilities and serves as a benchmark comparison for newer models like DeepSeek-V3.1. This model excels in complex reasoning tasks, mathematical problem-solving, and code generation through its thinking mode approach.",
  "release_date": "2025-05-28",
  "announcement_date": "2025-05-28",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 671000000000,
  "training_tokens": null,
  "available_in_zeroeval": false,
  "source_api_ref": "https://api.deepseek.com/docs",
  "source_playground": "https://chat.deepseek.com/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://www.deepseek.com/",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1",
  "created_at": "2025-05-28T00:00:00.000000+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00"
}


================================================
FILE: data/organizations/deepseek/models/deepseek-r1-distill-llama-70b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 467,
    "benchmark_id": "aime-2024",
    "model_id": "deepseek-r1-distill-llama-70b",
    "score": 0.867,
    "normalized_score": 0.867,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
    "verified_by_llmstats": false,
    "analysis_method": "Cons@64",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.987242+00:00",
    "updated_at": "2025-07-19T19:56:11.989505+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 315,
    "benchmark_id": "gpqa",
    "model_id": "deepseek-r1-distill-llama-70b",
    "score": 0.652,
    "normalized_score": 0.652,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond, Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.700874+00:00",
    "updated_at": "2025-07-19T19:56:11.700874+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1135,
    "benchmark_id": "livecodebench",
    "model_id": "deepseek-r1-distill-llama-70b",
    "score": 0.575,
    "normalized_score": 0.575,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.386337+00:00",
    "updated_at": "2025-07-19T19:56:13.386337+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 503,
    "benchmark_id": "math-500",
    "model_id": "deepseek-r1-distill-llama-70b",
    "score": 0.945,
    "normalized_score": 0.945,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.048302+00:00",
    "updated_at": "2025-07-19T19:56:12.048302+00:00",
    "benchmark_name": "MATH-500"
  }
]

================================================
FILE: data/organizations/deepseek/models/deepseek-r1-distill-llama-70b/model.json
================================================
{
  "model_id": "deepseek-r1-distill-llama-70b",
  "name": "DeepSeek R1 Distill Llama 70B",
  "organization_id": "deepseek",
  "fine_tuned_from_model_id": null,
  "description": "DeepSeek-R1-Distill-Llama-70B is a 70B-parameter, Llama-based model distilled from DeepSeek-R1. DeepSeek-R1 itself is the first-generation reasoning model built atop DeepSeek-V3 (671B total parameters, 37B activated per token); it incorporates large-scale reinforcement learning (RL) to enhance its chain-of-thought and reasoning capabilities, delivering strong performance in math, code, and multi-step reasoning tasks.",
  "release_date": "2025-01-20",
  "announcement_date": "2025-01-20",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 70600000000,
  "training_tokens": 14800000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://api-docs.deepseek.com/news/news250120",
  "source_playground": "https://chat.deepseek.com",
  "source_paper": "https://arxiv.org/pdf/2501.12948",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-R1",
  "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
  "created_at": "2025-07-19T19:49:05.685839+00:00",
  "updated_at": "2025-07-19T19:49:05.685839+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/deepseek/models/deepseek-r1-distill-llama-8b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 465,
    "benchmark_id": "aime-2024",
    "model_id": "deepseek-r1-distill-llama-8b",
    "score": 0.8,
    "normalized_score": 0.8,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "verified_by_llmstats": false,
    "analysis_method": "Cons@64",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.984093+00:00",
    "updated_at": "2025-07-19T19:56:11.985582+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 314,
    "benchmark_id": "gpqa",
    "model_id": "deepseek-r1-distill-llama-8b",
    "score": 0.49,
    "normalized_score": 0.49,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond, Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.699365+00:00",
    "updated_at": "2025-07-19T19:56:11.699365+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1134,
    "benchmark_id": "livecodebench",
    "model_id": "deepseek-r1-distill-llama-8b",
    "score": 0.396,
    "normalized_score": 0.396,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.384499+00:00",
    "updated_at": "2025-07-19T19:56:13.384499+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 502,
    "benchmark_id": "math-500",
    "model_id": "deepseek-r1-distill-llama-8b",
    "score": 0.891,
    "normalized_score": 0.891,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.046427+00:00",
    "updated_at": "2025-07-19T19:56:12.046427+00:00",
    "benchmark_name": "MATH-500"
  }
]

================================================
FILE: data/organizations/deepseek/models/deepseek-r1-distill-llama-8b/model.json
================================================
{
  "model_id": "deepseek-r1-distill-llama-8b",
  "name": "DeepSeek R1 Distill Llama 8B",
  "organization_id": "deepseek",
  "fine_tuned_from_model_id": null,
  "description": "DeepSeek-R1-Distill-Llama-8B is an 8B-parameter, Llama-based model distilled from DeepSeek-R1. DeepSeek-R1 itself is the first-generation reasoning model built atop DeepSeek-V3 (671B total parameters, 37B activated per token); it incorporates large-scale reinforcement learning (RL) to enhance its chain-of-thought and reasoning capabilities, delivering strong performance in math, code, and multi-step reasoning tasks.",
  "release_date": "2025-01-20",
  "announcement_date": "2025-01-20",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 8030000000,
  "training_tokens": 14800000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://api-docs.deepseek.com/news/news250120",
  "source_playground": "https://chat.deepseek.com",
  "source_paper": "https://arxiv.org/pdf/2501.12948",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-R1",
  "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
  "created_at": "2025-07-19T19:49:05.683265+00:00",
  "updated_at": "2025-07-19T19:49:05.683265+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/deepseek/models/deepseek-r1-distill-qwen-1.5b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 461,
    "benchmark_id": "aime-2024",
    "model_id": "deepseek-r1-distill-qwen-1.5b",
    "score": 0.527,
    "normalized_score": 0.527,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "verified_by_llmstats": false,
    "analysis_method": "Cons@64",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.976978+00:00",
    "updated_at": "2025-07-19T19:56:11.978475+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 311,
    "benchmark_id": "gpqa",
    "model_id": "deepseek-r1-distill-qwen-1.5b",
    "score": 0.338,
    "normalized_score": 0.338,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond, Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.694071+00:00",
    "updated_at": "2025-07-19T19:56:11.694071+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1130,
    "benchmark_id": "livecodebench",
    "model_id": "deepseek-r1-distill-qwen-1.5b",
    "score": 0.169,
    "normalized_score": 0.169,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.362673+00:00",
    "updated_at": "2025-07-19T19:56:13.362673+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 499,
    "benchmark_id": "math-500",
    "model_id": "deepseek-r1-distill-qwen-1.5b",
    "score": 0.839,
    "normalized_score": 0.839,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.041592+00:00",
    "updated_at": "2025-07-19T19:56:12.041592+00:00",
    "benchmark_name": "MATH-500"
  }
]

================================================
FILE: data/organizations/deepseek/models/deepseek-r1-distill-qwen-1.5b/model.json
================================================
{
  "model_id": "deepseek-r1-distill-qwen-1.5b",
  "name": "DeepSeek R1 Distill Qwen 1.5B",
  "organization_id": "deepseek",
  "fine_tuned_from_model_id": null,
  "description": "DeepSeek-R1-Distill-Qwen-1.5B is a smaller dense model distilled from DeepSeek-R1 onto a Qwen base; it is not the 671B-parameter model itself. Its teacher, DeepSeek-R1, is the first-generation reasoning model built atop DeepSeek-V3 (671B total parameters, 37B activated per token). DeepSeek-R1 incorporates large-scale reinforcement learning (RL) to enhance its chain-of-thought and reasoning capabilities, delivering strong performance in math, code, and multi-step reasoning tasks.",
  "release_date": "2025-01-20",
  "announcement_date": "2025-01-20",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 1780000000,
  "training_tokens": 14800000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://api-docs.deepseek.com/news/news250120",
  "source_playground": "https://chat.deepseek.com",
  "source_paper": "https://arxiv.org/pdf/2501.12948",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-R1",
  "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
  "created_at": "2025-07-19T19:49:05.672853+00:00",
  "updated_at": "2025-07-19T19:49:05.672853+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/deepseek/models/deepseek-r1-distill-qwen-14b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 469,
    "benchmark_id": "aime-2024",
    "model_id": "deepseek-r1-distill-qwen-14b",
    "score": 0.8,
    "normalized_score": 0.8,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
    "verified_by_llmstats": false,
    "analysis_method": "Cons@64",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.991646+00:00",
    "updated_at": "2025-07-19T19:56:11.993518+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 316,
    "benchmark_id": "gpqa",
    "model_id": "deepseek-r1-distill-qwen-14b",
    "score": 0.591,
    "normalized_score": 0.591,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond, Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.702334+00:00",
    "updated_at": "2025-07-19T19:56:11.702334+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1136,
    "benchmark_id": "livecodebench",
    "model_id": "deepseek-r1-distill-qwen-14b",
    "score": 0.531,
    "normalized_score": 0.531,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.387993+00:00",
    "updated_at": "2025-07-19T19:56:13.387993+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 504,
    "benchmark_id": "math-500",
    "model_id": "deepseek-r1-distill-qwen-14b",
    "score": 0.939,
    "normalized_score": 0.939,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.050287+00:00",
    "updated_at": "2025-07-19T19:56:12.050287+00:00",
    "benchmark_name": "MATH-500"
  }
]

================================================
FILE: data/organizations/deepseek/models/deepseek-r1-distill-qwen-14b/model.json
================================================
{
  "model_id": "deepseek-r1-distill-qwen-14b",
  "name": "DeepSeek R1 Distill Qwen 14B",
  "organization_id": "deepseek",
  "fine_tuned_from_model_id": null,
  "description": "DeepSeek-R1-Distill-Qwen-14B is a smaller dense model distilled from DeepSeek-R1 onto a Qwen base; it is not the 671B-parameter model itself. Its teacher, DeepSeek-R1, is the first-generation reasoning model built atop DeepSeek-V3 (671B total parameters, 37B activated per token). DeepSeek-R1 incorporates large-scale reinforcement learning (RL) to enhance its chain-of-thought and reasoning capabilities, delivering strong performance in math, code, and multi-step reasoning tasks.",
  "release_date": "2025-01-20",
  "announcement_date": "2025-01-20",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 14800000000,
  "training_tokens": 14800000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://api-docs.deepseek.com/news/news250120",
  "source_playground": "https://chat.deepseek.com",
  "source_paper": "https://arxiv.org/pdf/2501.12948",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-R1",
  "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
  "created_at": "2025-07-19T19:49:05.688267+00:00",
  "updated_at": "2025-07-19T19:49:05.688267+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/deepseek/models/deepseek-r1-distill-qwen-32b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 471,
    "benchmark_id": "aime-2024",
    "model_id": "deepseek-r1-distill-qwen-32b",
    "score": 0.833,
    "normalized_score": 0.833,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
    "verified_by_llmstats": false,
    "analysis_method": "Cons@64",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.995645+00:00",
    "updated_at": "2025-07-19T19:56:11.997517+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 317,
    "benchmark_id": "gpqa",
    "model_id": "deepseek-r1-distill-qwen-32b",
    "score": 0.621,
    "normalized_score": 0.621,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond, Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.703902+00:00",
    "updated_at": "2025-07-19T19:56:11.703902+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1137,
    "benchmark_id": "livecodebench",
    "model_id": "deepseek-r1-distill-qwen-32b",
    "score": 0.572,
    "normalized_score": 0.572,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.389729+00:00",
    "updated_at": "2025-07-19T19:56:13.389729+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 505,
    "benchmark_id": "math-500",
    "model_id": "deepseek-r1-distill-qwen-32b",
    "score": 0.943,
    "normalized_score": 0.943,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.051744+00:00",
    "updated_at": "2025-07-19T19:56:12.051744+00:00",
    "benchmark_name": "MATH-500"
  }
]

================================================
FILE: data/organizations/deepseek/models/deepseek-r1-distill-qwen-32b/model.json
================================================
{
  "model_id": "deepseek-r1-distill-qwen-32b",
  "name": "DeepSeek R1 Distill Qwen 32B",
  "organization_id": "deepseek",
  "fine_tuned_from_model_id": null,
  "description": "DeepSeek-R1-Distill-Qwen-32B is a smaller dense model distilled from DeepSeek-R1 onto a Qwen base; it is not the 671B-parameter model itself. Its teacher, DeepSeek-R1, is the first-generation reasoning model built atop DeepSeek-V3 (671B total parameters, 37B activated per token). DeepSeek-R1 incorporates large-scale reinforcement learning (RL) to enhance its chain-of-thought and reasoning capabilities, delivering strong performance in math, code, and multi-step reasoning tasks.",
  "release_date": "2025-01-20",
  "announcement_date": "2025-01-20",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 32800000000,
  "training_tokens": 14800000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://api-docs.deepseek.com/news/news250120",
  "source_playground": "https://chat.deepseek.com",
  "source_paper": "https://arxiv.org/pdf/2501.12948",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-R1",
  "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
  "created_at": "2025-07-19T19:49:05.690560+00:00",
  "updated_at": "2025-07-19T19:49:05.690560+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/deepseek/models/deepseek-r1-distill-qwen-7b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 459,
    "benchmark_id": "aime-2024",
    "model_id": "deepseek-r1-distill-qwen-7b",
    "score": 0.833,
    "normalized_score": 0.833,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "verified_by_llmstats": false,
    "analysis_method": "Cons@64",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.973870+00:00",
    "updated_at": "2025-07-19T19:56:11.975371+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 310,
    "benchmark_id": "gpqa",
    "model_id": "deepseek-r1-distill-qwen-7b",
    "score": 0.491,
    "normalized_score": 0.491,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond, Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.692702+00:00",
    "updated_at": "2025-07-19T19:56:11.692702+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1129,
    "benchmark_id": "livecodebench",
    "model_id": "deepseek-r1-distill-qwen-7b",
    "score": 0.376,
    "normalized_score": 0.376,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.360567+00:00",
    "updated_at": "2025-07-19T19:56:13.360567+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 498,
    "benchmark_id": "math-500",
    "model_id": "deepseek-r1-distill-qwen-7b",
    "score": 0.928,
    "normalized_score": 0.928,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.039853+00:00",
    "updated_at": "2025-07-19T19:56:12.039853+00:00",
    "benchmark_name": "MATH-500"
  }
]

================================================
FILE: data/organizations/deepseek/models/deepseek-r1-distill-qwen-7b/model.json
================================================
{
  "model_id": "deepseek-r1-distill-qwen-7b",
  "name": "DeepSeek R1 Distill Qwen 7B",
  "organization_id": "deepseek",
  "fine_tuned_from_model_id": null,
  "description": "DeepSeek-R1-Distill-Qwen-7B is a smaller dense model distilled from DeepSeek-R1 onto a Qwen base; it is not the 671B-parameter model itself. Its teacher, DeepSeek-R1, is the first-generation reasoning model built atop DeepSeek-V3 (671B total parameters, 37B activated per token). DeepSeek-R1 incorporates large-scale reinforcement learning (RL) to enhance its chain-of-thought and reasoning capabilities, delivering strong performance in math, code, and multi-step reasoning tasks.",
  "release_date": "2025-01-20",
  "announcement_date": "2025-01-20",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 7620000000,
  "training_tokens": 14800000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://api-docs.deepseek.com/news/news250120",
  "source_playground": "https://chat.deepseek.com",
  "source_paper": "https://arxiv.org/pdf/2501.12948",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-R1",
  "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
  "created_at": "2025-07-19T19:49:05.669926+00:00",
  "updated_at": "2025-07-19T19:49:05.669926+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/deepseek/models/deepseek-r1-zero/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 457,
    "benchmark_id": "aime-2024",
    "model_id": "deepseek-r1-zero",
    "score": 0.867,
    "normalized_score": 0.867,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2501.12948",
    "verified_by_llmstats": false,
    "analysis_method": "Cons@64",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.970600+00:00",
    "updated_at": "2025-07-19T19:56:11.972162+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 309,
    "benchmark_id": "gpqa",
    "model_id": "deepseek-r1-zero",
    "score": 0.733,
    "normalized_score": 0.733,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2501.12948",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond, Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.691175+00:00",
    "updated_at": "2025-07-19T19:56:11.691175+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1128,
    "benchmark_id": "livecodebench",
    "model_id": "deepseek-r1-zero",
    "score": 0.5,
    "normalized_score": 0.5,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2501.12948",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.357962+00:00",
    "updated_at": "2025-07-19T19:56:13.357962+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 497,
    "benchmark_id": "math-500",
    "model_id": "deepseek-r1-zero",
    "score": 0.959,
    "normalized_score": 0.959,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2501.12948",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.038172+00:00",
    "updated_at": "2025-07-19T19:56:12.038172+00:00",
    "benchmark_name": "MATH-500"
  }
]

================================================
FILE: data/organizations/deepseek/models/deepseek-r1-zero/model.json
================================================
{
  "model_id": "deepseek-r1-zero",
  "name": "DeepSeek R1 Zero",
  "organization_id": "deepseek",
  "fine_tuned_from_model_id": "deepseek-v3",
  "description": "DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrated remarkable performance on reasoning. With RL, DeepSeek-R1-Zero naturally emerged with numerous powerful and interesting reasoning behaviors. However, DeepSeek-R1-Zero encounters challenges such as endless repetition, poor readability, and language mixing. To address these issues and further enhance reasoning performance, we introduce DeepSeek-R1, which incorporates cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1 across math, code, and reasoning tasks.",
  "release_date": "2025-01-20",
  "announcement_date": "2025-01-20",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 671000000000,
  "training_tokens": 14800000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://api-docs.deepseek.com/news/news250120",
  "source_playground": "https://chat.deepseek.com",
  "source_paper": "https://arxiv.org/abs/2501.12948",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-R1",
  "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-R1",
  "created_at": "2025-07-19T19:49:05.902496+00:00",
  "updated_at": "2025-07-19T19:49:05.902496+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/deepseek/models/deepseek-v2.5/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1627,
    "benchmark_id": "aider",
    "model_id": "deepseek-v2.5",
    "score": 0.722,
    "normalized_score": 0.722,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.574890+00:00",
    "updated_at": "2025-07-19T19:56:14.574890+00:00",
    "benchmark_name": "Aider"
  },
  {
    "model_benchmark_id": 1619,
    "benchmark_id": "alignbench",
    "model_id": "deepseek-v2.5",
    "score": 0.804,
    "normalized_score": 0.804,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.deepseek.com/",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.550691+00:00",
    "updated_at": "2025-07-19T19:56:14.550691+00:00",
    "benchmark_name": "AlignBench"
  },
  {
    "model_benchmark_id": 1790,
    "benchmark_id": "alpacaeval-2.0",
    "model_id": "deepseek-v2.5",
    "score": 0.505,
    "normalized_score": 0.505,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.041535+00:00",
    "updated_at": "2025-07-19T19:56:15.041535+00:00",
    "benchmark_name": "AlpacaEval 2.0"
  },
  {
    "model_benchmark_id": 1456,
    "benchmark_id": "arena-hard",
    "model_id": "deepseek-v2.5",
    "score": 0.762,
    "normalized_score": 0.762,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.104170+00:00",
    "updated_at": "2025-07-19T19:56:14.104170+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 974,
    "benchmark_id": "bbh",
    "model_id": "deepseek-v2.5",
    "score": 0.843,
    "normalized_score": 0.843,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.deepseek.com/",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.046694+00:00",
    "updated_at": "2025-07-19T19:56:13.046694+00:00",
    "benchmark_name": "BBH"
  },
  {
    "model_benchmark_id": 1797,
    "benchmark_id": "ds-arena-code",
    "model_id": "deepseek-v2.5",
    "score": 0.631,
    "normalized_score": 0.631,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.060324+00:00",
    "updated_at": "2025-07-19T19:56:15.060324+00:00",
    "benchmark_name": "DS-Arena-Code"
  },
  {
    "model_benchmark_id": 1796,
    "benchmark_id": "ds-fim-eval",
    "model_id": "deepseek-v2.5",
    "score": 0.783,
    "normalized_score": 0.783,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.056487+00:00",
    "updated_at": "2025-07-19T19:56:15.056487+00:00",
    "benchmark_name": "DS-FIM-Eval"
  },
  {
    "model_benchmark_id": 1000,
    "benchmark_id": "gsm8k",
    "model_id": "deepseek-v2.5",
    "score": 0.951,
    "normalized_score": 0.951,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.deepseek.com/",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.091340+00:00",
    "updated_at": "2025-07-19T19:56:13.091340+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 792,
    "benchmark_id": "humaneval",
    "model_id": "deepseek-v2.5",
    "score": 0.89,
    "normalized_score": 0.89,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.deepseek.com/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.656959+00:00",
    "updated_at": "2025-07-19T19:56:12.656959+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1789,
    "benchmark_id": "humaneval-mul",
    "model_id": "deepseek-v2.5",
    "score": 0.738,
    "normalized_score": 0.738,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.037209+00:00",
    "updated_at": "2025-07-19T19:56:15.037209+00:00",
    "benchmark_name": "HumanEval-Mul"
  },
  {
    "model_benchmark_id": 1795,
    "benchmark_id": "livecodebench(01-09)",
    "model_id": "deepseek-v2.5",
    "score": 0.418,
    "normalized_score": 0.418,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.052983+00:00",
    "updated_at": "2025-07-19T19:56:15.052983+00:00",
    "benchmark_name": "LiveCodeBench(01-09)"
  },
  {
    "model_benchmark_id": 411,
    "benchmark_id": "math",
    "model_id": "deepseek-v2.5",
    "score": 0.747,
    "normalized_score": 0.747,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.deepseek.com/",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.874944+00:00",
    "updated_at": "2025-07-19T19:56:11.874944+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 94,
    "benchmark_id": "mmlu",
    "model_id": "deepseek-v2.5",
    "score": 0.804,
    "normalized_score": 0.804,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.deepseek.com/",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.277903+00:00",
    "updated_at": "2025-07-19T19:56:11.277903+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1608,
    "benchmark_id": "mt-bench",
    "model_id": "deepseek-v2.5",
    "score": 0.902,
    "normalized_score": 0.902,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.deepseek.com/",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.525856+00:00",
    "updated_at": "2025-07-19T19:56:14.525856+00:00",
    "benchmark_name": "MT-Bench"
  },
  {
    "model_benchmark_id": 1345,
    "benchmark_id": "swe-bench-verified",
    "model_id": "deepseek-v2.5",
    "score": 0.168,
    "normalized_score": 0.168,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.830793+00:00",
    "updated_at": "2025-07-19T19:56:13.830793+00:00",
    "benchmark_name": "SWE-Bench Verified"
  }
]

================================================
FILE: data/organizations/deepseek/models/deepseek-v2.5/model.json
================================================
{
  "model_id": "deepseek-v2.5",
  "name": "DeepSeek-V2.5",
  "organization_id": "deepseek",
  "fine_tuned_from_model_id": null,
  "description": "DeepSeek-V2.5 is an upgraded version that combines DeepSeek-V2-Chat and DeepSeek-Coder-V2-Instruct, integrating general and coding abilities. It better aligns with human preferences and has been optimized in various aspects, including writing and instruction following.",
  "release_date": "2024-05-08",
  "announcement_date": "2024-05-08",
  "license_id": "deepseek",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 236000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.deepseek.com/",
  "source_playground": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5",
  "source_paper": "https://arxiv.org/abs/2405.04434",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5",
  "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-V2.5",
  "created_at": "2025-07-19T19:49:05.680851+00:00",
  "updated_at": "2025-07-19T19:49:05.680851+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/deepseek/models/deepseek-v3/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 663,
    "benchmark_id": "aider-polyglot",
    "model_id": "deepseek-v3",
    "score": 0.496,
    "normalized_score": 0.496,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.374175+00:00",
    "updated_at": "2025-07-19T19:56:12.374175+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 1330,
    "benchmark_id": "aider-polyglot-edit",
    "model_id": "deepseek-v3",
    "score": 0.797,
    "normalized_score": 0.797,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.796886+00:00",
    "updated_at": "2025-07-19T19:56:13.796886+00:00",
    "benchmark_name": "Aider-Polyglot Edit"
  },
  {
    "model_benchmark_id": 463,
    "benchmark_id": "aime-2024",
    "model_id": "deepseek-v3",
    "score": 0.392,
    "normalized_score": 0.392,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.980196+00:00",
    "updated_at": "2025-07-19T19:56:11.980196+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 438,
    "benchmark_id": "c-eval",
    "model_id": "deepseek-v3",
    "score": 0.865,
    "normalized_score": 0.865,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Exact Match",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.928060+00:00",
    "updated_at": "2025-07-19T19:56:11.928060+00:00",
    "benchmark_name": "C-Eval"
  },
  {
    "model_benchmark_id": 600,
    "benchmark_id": "cluewsc",
    "model_id": "deepseek-v3",
    "score": 0.909,
    "normalized_score": 0.909,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Exact Match",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.237991+00:00",
    "updated_at": "2025-07-19T19:56:12.237991+00:00",
    "benchmark_name": "CLUEWSC"
  },
  {
    "model_benchmark_id": 711,
    "benchmark_id": "cnmo-2024",
    "model_id": "deepseek-v3",
    "score": 0.432,
    "normalized_score": 0.432,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.493124+00:00",
    "updated_at": "2025-07-19T19:56:12.493124+00:00",
    "benchmark_name": "CNMO 2024"
  },
  {
    "model_benchmark_id": 442,
    "benchmark_id": "csimpleqa",
    "model_id": "deepseek-v3",
    "score": 0.648,
    "normalized_score": 0.648,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Correct",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.937598+00:00",
    "updated_at": "2025-07-19T19:56:11.937598+00:00",
    "benchmark_name": "CSimpleQA"
  },
  {
    "model_benchmark_id": 951,
    "benchmark_id": "drop",
    "model_id": "deepseek-v3",
    "score": 0.916,
    "normalized_score": 0.916,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot F1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.005931+00:00",
    "updated_at": "2025-07-19T19:56:13.005931+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 1753,
    "benchmark_id": "frames",
    "model_id": "deepseek-v3",
    "score": 0.733,
    "normalized_score": 0.733,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.958906+00:00",
    "updated_at": "2025-07-19T19:56:14.958906+00:00",
    "benchmark_name": "FRAMES"
  },
  {
    "model_benchmark_id": 312,
    "benchmark_id": "gpqa",
    "model_id": "deepseek-v3",
    "score": 0.591,
    "normalized_score": 0.591,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.695757+00:00",
    "updated_at": "2025-07-19T19:56:11.695757+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1788,
    "benchmark_id": "humaneval-mul",
    "model_id": "deepseek-v3",
    "score": 0.826,
    "normalized_score": 0.826,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.035409+00:00",
    "updated_at": "2025-07-19T19:56:15.035409+00:00",
    "benchmark_name": "HumanEval-Mul"
  },
  {
    "model_benchmark_id": 622,
    "benchmark_id": "ifeval",
    "model_id": "deepseek-v3",
    "score": 0.861,
    "normalized_score": 0.861,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Prompt Strict",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.280659+00:00",
    "updated_at": "2025-07-19T19:56:12.280659+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1131,
    "benchmark_id": "livecodebench",
    "model_id": "deepseek-v3",
    "score": 0.376,
    "normalized_score": 0.376,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.364940+00:00",
    "updated_at": "2025-07-19T19:56:13.372242+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 1787,
    "benchmark_id": "longbench-v2",
    "model_id": "deepseek-v3",
    "score": 0.487,
    "normalized_score": 0.487,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.031520+00:00",
    "updated_at": "2025-07-19T19:56:15.031520+00:00",
    "benchmark_name": "LongBench v2"
  },
  {
    "model_benchmark_id": 500,
    "benchmark_id": "math-500",
    "model_id": "deepseek-v3",
    "score": 0.902,
    "normalized_score": 0.902,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Exact Match",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.043125+00:00",
    "updated_at": "2025-07-19T19:56:12.043125+00:00",
    "benchmark_name": "MATH-500"
  },
  {
    "model_benchmark_id": 93,
    "benchmark_id": "mmlu",
    "model_id": "deepseek-v3",
    "score": 0.885,
    "normalized_score": 0.885,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Exact Match",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.275957+00:00",
    "updated_at": "2025-07-19T19:56:11.275957+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 202,
    "benchmark_id": "mmlu-pro",
    "model_id": "deepseek-v3",
    "score": 0.759,
    "normalized_score": 0.759,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Exact Match",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.485394+00:00",
    "updated_at": "2025-07-19T19:56:11.485394+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 737,
    "benchmark_id": "mmlu-redux",
    "model_id": "deepseek-v3",
    "score": 0.891,
    "normalized_score": 0.891,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Exact Match",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.548864+00:00",
    "updated_at": "2025-07-19T19:56:12.548864+00:00",
    "benchmark_name": "MMLU-Redux"
  },
  {
    "model_benchmark_id": 235,
    "benchmark_id": "simpleqa",
    "model_id": "deepseek-v3",
    "score": 0.249,
    "normalized_score": 0.249,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Correct",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.549943+00:00",
    "updated_at": "2025-07-19T19:56:11.549943+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 1344,
    "benchmark_id": "swe-bench-verified",
    "model_id": "deepseek-v3",
    "score": 0.42,
    "normalized_score": 0.42,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/deepseek-ai/DeepSeek-V3",
    "verified_by_llmstats": false,
    "analysis_method": "Resolved",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.828562+00:00",
    "updated_at": "2025-07-19T19:56:13.828562+00:00",
    "benchmark_name": "SWE-Bench Verified"
  }
]

================================================
FILE: data/organizations/deepseek/models/deepseek-v3/model.json
================================================
{
  "model_id": "deepseek-v3",
  "name": "DeepSeek-V3",
  "organization_id": "deepseek",
  "fine_tuned_from_model_id": null,
  "description": "A powerful Mixture-of-Experts (MoE) language model with 671B total parameters (37B activated per token). Features Multi-head Latent Attention (MLA), auxiliary-loss-free load balancing, and multi-token prediction training. Pre-trained on 14.8T tokens with strong performance in reasoning, math, and code tasks.",
  "release_date": "2024-12-25",
  "announcement_date": "2024-12-25",
  "license_id": "mit_+_model_license_(commercial_use_allowed)",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 671000000000,
  "training_tokens": 14800000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.deepseek.com",
  "source_playground": "https://chat.deepseek.com",
  "source_paper": "https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-V3",
  "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3",
  "created_at": "2025-07-19T19:49:05.677307+00:00",
  "updated_at": "2025-07-19T19:49:05.677307+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/deepseek/models/deepseek-v3-0324/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 473,
    "benchmark_id": "aime-2024",
    "model_id": "deepseek-v3-0324",
    "score": 0.594,
    "normalized_score": 0.594,
    "is_self_reported": true,
    "self_reported_source_link": "https://api-docs.deepseek.com/news/news250325",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.999879+00:00",
    "updated_at": "2025-07-19T19:56:11.999879+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 318,
    "benchmark_id": "gpqa",
    "model_id": "deepseek-v3-0324",
    "score": 0.684,
    "normalized_score": 0.684,
    "is_self_reported": true,
    "self_reported_source_link": "https://api-docs.deepseek.com/news/news250325",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.705537+00:00",
    "updated_at": "2025-07-19T19:56:11.705537+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1138,
    "benchmark_id": "livecodebench",
    "model_id": "deepseek-v3-0324",
    "score": 0.492,
    "normalized_score": 0.492,
    "is_self_reported": true,
    "self_reported_source_link": "https://api-docs.deepseek.com/news/news250325",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.392232+00:00",
    "updated_at": "2025-07-19T19:56:13.392232+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 506,
    "benchmark_id": "math-500",
    "model_id": "deepseek-v3-0324",
    "score": 0.94,
    "normalized_score": 0.94,
    "is_self_reported": true,
    "self_reported_source_link": "https://api-docs.deepseek.com/news/news250325",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.053333+00:00",
    "updated_at": "2025-07-19T19:56:12.053333+00:00",
    "benchmark_name": "MATH-500"
  },
  {
    "model_benchmark_id": 204,
    "benchmark_id": "mmlu-pro",
    "model_id": "deepseek-v3-0324",
    "score": 0.812,
    "normalized_score": 0.812,
    "is_self_reported": true,
    "self_reported_source_link": "https://api-docs.deepseek.com/news/news250325",
    "verified_by_llmstats": false,
    "analysis_method": "Exact Match",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.488686+00:00",
    "updated_at": "2025-07-19T19:56:11.488686+00:00",
    "benchmark_name": "MMLU-Pro"
  }
]

================================================
FILE: data/organizations/deepseek/models/deepseek-v3-0324/model.json
================================================
{
  "model_id": "deepseek-v3-0324",
  "name": "DeepSeek-V3 0324",
  "organization_id": "deepseek",
  "fine_tuned_from_model_id": null,
  "description": "A powerful Mixture-of-Experts (MoE) language model with 671B total parameters (37B activated per token). Features Multi-head Latent Attention (MLA), auxiliary-loss-free load balancing, and multi-token prediction training. Pre-trained on 14.8T tokens with strong performance in reasoning, math, and code tasks.",
  "release_date": "2025-03-25",
  "announcement_date": "2025-03-25",
  "license_id": "mit_+_model_license_(commercial_use_allowed)",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 671000000000,
  "training_tokens": 14800000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.deepseek.com",
  "source_playground": "https://chat.deepseek.com",
  "source_paper": "https://arxiv.org/abs/2412.19437",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-V3",
  "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324",
  "created_at": "2025-07-19T19:49:05.693499+00:00",
  "updated_at": "2025-07-19T19:49:05.693499+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/deepseek/models/deepseek-v3.1/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 9501,
    "benchmark_id": "mmlu-redux",
    "model_id": "deepseek-v3.1",
    "score": 0.918,
    "normalized_score": 0.918,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Non-Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Non-thinking: 91.8%, Thinking: 93.7%",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-Redux"
  },
  {
    "model_benchmark_id": 9502,
    "benchmark_id": "mmlu-pro",
    "model_id": "deepseek-v3.1",
    "score": 0.837,
    "normalized_score": 0.837,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Non-Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Non-thinking: 83.7%, Thinking: 84.8%",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 9503,
    "benchmark_id": "gpqa",
    "model_id": "deepseek-v3.1",
    "score": 0.749,
    "normalized_score": 0.749,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Non-Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Non-thinking: 74.9%, Thinking: 80.1%",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 9504,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "deepseek-v3.1",
    "score": 0.159,
    "normalized_score": 0.159,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Thinking mode, text-only subset",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Thinking mode only, text-only subset",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 9505,
    "benchmark_id": "browsecomp",
    "model_id": "deepseek-v3.1",
    "score": 0.3,
    "normalized_score": 0.3,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Thinking mode with search agent",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Search agent with commercial API + webpage filter + 128K context",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "BrowseComp"
  },
  {
    "model_benchmark_id": 9506,
    "benchmark_id": "browsecomp-zh",
    "model_id": "deepseek-v3.1",
    "score": 0.492,
    "normalized_score": 0.492,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Thinking mode with search agent",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Search agent with commercial API + webpage filter + 128K context",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "BrowseComp-zh"
  },
  {
    "model_benchmark_id": 9507,
    "benchmark_id": "simpleqa",
    "model_id": "deepseek-v3.1",
    "score": 0.934,
    "normalized_score": 0.934,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Thinking mode with search agent",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Search agent evaluation",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 9508,
    "benchmark_id": "livecodebench",
    "model_id": "deepseek-v3.1",
    "score": 0.564,
    "normalized_score": 0.564,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, 2408-2505, Non-Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Non-thinking: 56.4%, Thinking: 74.8%",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 9509,
    "benchmark_id": "codeforces",
    "model_id": "deepseek-v3.1",
    "score": 0.697,
    "normalized_score": 0.697,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Div1 Rating, Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Codeforces Div1 rating in thinking mode",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Codeforces"
  },
  {
    "model_benchmark_id": 9510,
    "benchmark_id": "aider-polyglot",
    "model_id": "deepseek-v3.1",
    "score": 0.684,
    "normalized_score": 0.684,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Non-Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Non-thinking: 68.4%, Thinking: 76.3%",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 9511,
    "benchmark_id": "swe-bench-verified",
    "model_id": "deepseek-v3.1",
    "score": 0.66,
    "normalized_score": 0.66,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Agent mode, Non-Thinking",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Evaluated with internal code agent framework",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 9512,
    "benchmark_id": "swe-bench-multilingual",
    "model_id": "deepseek-v3.1",
    "score": 0.545,
    "normalized_score": 0.545,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Agent mode, Non-Thinking",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Evaluated with internal code agent framework",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "SWE-Bench Multilingual"
  },
  {
    "model_benchmark_id": 9513,
    "benchmark_id": "terminal-bench",
    "model_id": "deepseek-v3.1",
    "score": 0.313,
    "normalized_score": 0.313,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Terminus 1 framework, Non-Thinking",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Terminal-Bench"
  },
  {
    "model_benchmark_id": 9514,
    "benchmark_id": "aime-2024",
    "model_id": "deepseek-v3.1",
    "score": 0.663,
    "normalized_score": 0.663,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Non-Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Non-thinking: 66.3%, Thinking: 93.1%",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 9515,
    "benchmark_id": "aime-2025",
    "model_id": "deepseek-v3.1",
    "score": 0.498,
    "normalized_score": 0.498,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Non-Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Non-thinking: 49.8%, Thinking: 88.4%",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 9516,
    "benchmark_id": "hmmt-2025",
    "model_id": "deepseek-v3.1",
    "score": 0.335,
    "normalized_score": 0.335,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Non-Thinking mode",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Non-thinking: 33.5%, Thinking: 84.2%",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "HMMT 2025"
  }
]

================================================
FILE: data/organizations/deepseek/models/deepseek-v3.1/model.json
================================================
{
  "model_id": "deepseek-v3.1",
  "name": "DeepSeek-V3.1",
  "organization_id": "deepseek",
  "model_family_id": null,
  "fine_tuned_from_model_id": "deepseek-v3",
  "description": "DeepSeek-V3.1 is a hybrid model supporting both thinking and non-thinking modes through different chat templates. Built on DeepSeek-V3.1-Base with a two-phase long context extension (32K phase: 630B tokens, 128K phase: 209B tokens), it features 671B total parameters with 37B activated. Key improvements include smarter tool calling through post-training optimization, higher thinking efficiency achieving comparable quality to DeepSeek-R1-0528 while responding more quickly, and UE8M0 FP8 scale data format for model weights and activations. The model excels in both reasoning tasks (thinking mode) and practical applications (non-thinking mode), with particularly strong performance in code agent tasks, math competitions, and search-based problem solving.",
  "release_date": "2025-01-10",
  "announcement_date": "2025-01-10",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 671000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://api.deepseek.com/docs",
  "source_playground": "https://chat.deepseek.com/",
  "source_paper": "https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf",
  "source_scorecard_blog_link": "https://www.deepseek.com/news/deepseek-v3-1",
  "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-V3",
  "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
  "created_at": "2025-01-10T00:00:00.000000+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00"
}


================================================
FILE: data/organizations/deepseek/models/deepseek-v3.2-exp/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 9521,
    "benchmark_id": "mmlu-pro",
    "model_id": "deepseek-v3.2-exp",
    "score": 0.85,
    "normalized_score": 0.85,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp",
    "verified_by_llmstats": false,
    "analysis_method": "Reasoning Mode (w/o Tool Use)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 9522,
    "benchmark_id": "gpqa",
    "model_id": "deepseek-v3.2-exp",
    "score": 0.799,
    "normalized_score": 0.799,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp",
    "verified_by_llmstats": false,
    "analysis_method": "Reasoning Mode (w/o Tool Use)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 9523,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "deepseek-v3.2-exp",
    "score": 0.198,
    "normalized_score": 0.198,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp",
    "verified_by_llmstats": false,
    "analysis_method": "Reasoning Mode (w/o Tool Use)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Text-only subset where applicable",
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 9524,
    "benchmark_id": "livecodebench",
    "model_id": "deepseek-v3.2-exp",
    "score": 0.741,
    "normalized_score": 0.741,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1 (Reasoning Mode w/o Tool Use)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 9525,
    "benchmark_id": "aime-2025",
    "model_id": "deepseek-v3.2-exp",
    "score": 0.893,
    "normalized_score": 0.893,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1 (Reasoning Mode w/o Tool Use)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 9526,
    "benchmark_id": "hmmt-2025",
    "model_id": "deepseek-v3.2-exp",
    "score": 0.836,
    "normalized_score": 0.836,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1 (Reasoning Mode w/o Tool Use)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "benchmark_name": "HMMT 2025"
  },
  {
    "model_benchmark_id": 9527,
    "benchmark_id": "codeforces",
    "model_id": "deepseek-v3.2-exp",
    "score": 0.707,
    "normalized_score": 0.707,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp",
    "verified_by_llmstats": false,
    "analysis_method": "Div1 rating (Reasoning Mode)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Raw rating ≈ 2121; normalized by 3000 max",
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "benchmark_name": "Codeforces"
  },
  {
    "model_benchmark_id": 9528,
    "benchmark_id": "aider-polyglot",
    "model_id": "deepseek-v3.2-exp",
    "score": 0.745,
    "normalized_score": 0.745,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp",
    "verified_by_llmstats": false,
    "analysis_method": "Reasoning Mode (w/o Tool Use)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 9529,
    "benchmark_id": "browsecomp",
    "model_id": "deepseek-v3.2-exp",
    "score": 0.401,
    "normalized_score": 0.401,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp",
    "verified_by_llmstats": false,
    "analysis_method": "Agentic Tool Use",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "benchmark_name": "BrowseComp"
  },
  {
    "model_benchmark_id": 9530,
    "benchmark_id": "browsecomp-zh",
    "model_id": "deepseek-v3.2-exp",
    "score": 0.479,
    "normalized_score": 0.479,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp",
    "verified_by_llmstats": false,
    "analysis_method": "Agentic Tool Use",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "benchmark_name": "BrowseComp-zh"
  },
  {
    "model_benchmark_id": 9531,
    "benchmark_id": "simpleqa",
    "model_id": "deepseek-v3.2-exp",
    "score": 0.971,
    "normalized_score": 0.971,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp",
    "verified_by_llmstats": false,
    "analysis_method": "Agentic Tool Use",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 9532,
    "benchmark_id": "swe-bench-verified",
    "model_id": "deepseek-v3.2-exp",
    "score": 0.678,
    "normalized_score": 0.678,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp",
    "verified_by_llmstats": false,
    "analysis_method": "Agentic Tool Use",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 9533,
    "benchmark_id": "swe-bench-multilingual",
    "model_id": "deepseek-v3.2-exp",
    "score": 0.579,
    "normalized_score": 0.579,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp",
    "verified_by_llmstats": false,
    "analysis_method": "Agentic Tool Use",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "benchmark_name": "SWE-Bench Multilingual"
  },
  {
    "model_benchmark_id": 9534,
    "benchmark_id": "terminal-bench",
    "model_id": "deepseek-v3.2-exp",
    "score": 0.377,
    "normalized_score": 0.377,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp",
    "verified_by_llmstats": false,
    "analysis_method": "Agentic Tool Use",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "benchmark_name": "Terminal-Bench"
  }
]


================================================
FILE: data/organizations/deepseek/models/deepseek-v3.2-exp/model.json
================================================
{
  "model_id": "deepseek-v3.2-exp",
  "name": "DeepSeek-V3.2-Exp",
  "organization_id": "deepseek",
  "model_family_id": null,
  "fine_tuned_from_model_id": null,
  "description": "DeepSeek-V3.2-Exp is an experimental iteration introducing DeepSeek Sparse Attention (DSA) to improve long-context training and inference efficiency while keeping output quality on par with V3.1. It explores fine-grained sparse attention for extended sequence processing.",
  "release_date": "2025-09-29",
  "announcement_date": "2025-09-29",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 685000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://api.deepseek.com/docs",
  "source_playground": "https://chat.deepseek.com/",
  "source_paper": "https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/DeepSeek_V3_2.pdf",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-V3.2-Exp",
  "source_weights_link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp",
  "created_at": "2025-09-29T00:00:00.000000+00:00",
  "updated_at": "2025-09-29T00:00:00.000000+00:00"
}


================================================
FILE: data/organizations/deepseek/models/deepseek-vl2/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1256,
    "benchmark_id": "ai2d",
    "model_id": "deepseek-vl2",
    "score": 0.814,
    "normalized_score": 0.814,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.636398+00:00",
    "updated_at": "2025-07-19T19:56:13.636398+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 868,
    "benchmark_id": "chartqa",
    "model_id": "deepseek-vl2",
    "score": 0.86,
    "normalized_score": 0.86,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.812840+00:00",
    "updated_at": "2025-07-19T19:56:12.812840+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 890,
    "benchmark_id": "docvqa",
    "model_id": "deepseek-vl2",
    "score": 0.933,
    "normalized_score": 0.933,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.852402+00:00",
    "updated_at": "2025-07-19T19:56:12.852402+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 1244,
    "benchmark_id": "infovqa",
    "model_id": "deepseek-vl2",
    "score": 0.781,
    "normalized_score": 0.781,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.614094+00:00",
    "updated_at": "2025-07-19T19:56:13.614094+00:00",
    "benchmark_name": "InfoVQA"
  },
  {
    "model_benchmark_id": 528,
    "benchmark_id": "mathvista",
    "model_id": "deepseek-vl2",
    "score": 0.628,
    "normalized_score": 0.628,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "testmini",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.096047+00:00",
    "updated_at": "2025-07-19T19:56:12.096047+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1513,
    "benchmark_id": "mmbench",
    "model_id": "deepseek-vl2",
    "score": 0.796,
    "normalized_score": 0.796,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "en test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.245378+00:00",
    "updated_at": "2025-07-19T19:56:14.247008+00:00",
    "benchmark_name": "MMBench"
  },
  {
    "model_benchmark_id": 1727,
    "benchmark_id": "mmbench-v1.1",
    "model_id": "deepseek-vl2",
    "score": 0.792,
    "normalized_score": 0.792,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "cn test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.873346+00:00",
    "updated_at": "2025-07-19T19:56:14.873346+00:00",
    "benchmark_name": "MMBench-V1.1"
  },
  {
    "model_benchmark_id": 1784,
    "benchmark_id": "mme",
    "model_id": "deepseek-vl2",
    "score": 0.2253,
    "normalized_score": 0.2253,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.025040+00:00",
    "updated_at": "2025-07-19T19:56:15.025040+00:00",
    "benchmark_name": "MME"
  },
  {
    "model_benchmark_id": 574,
    "benchmark_id": "mmmu",
    "model_id": "deepseek-vl2",
    "score": 0.511,
    "normalized_score": 0.511,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "val",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.181251+00:00",
    "updated_at": "2025-07-19T19:56:12.181251+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1663,
    "benchmark_id": "mmstar",
    "model_id": "deepseek-vl2",
    "score": 0.613,
    "normalized_score": 0.613,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.669907+00:00",
    "updated_at": "2025-07-19T19:56:14.669907+00:00",
    "benchmark_name": "MMStar"
  },
  {
    "model_benchmark_id": 1667,
    "benchmark_id": "mmt-bench",
    "model_id": "deepseek-vl2",
    "score": 0.636,
    "normalized_score": 0.636,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.678247+00:00",
    "updated_at": "2025-07-19T19:56:14.678247+00:00",
    "benchmark_name": "MMT-Bench"
  },
  {
    "model_benchmark_id": 1542,
    "benchmark_id": "ocrbench",
    "model_id": "deepseek-vl2",
    "score": 0.811,
    "normalized_score": 0.811,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.320020+00:00",
    "updated_at": "2025-07-19T19:56:14.320020+00:00",
    "benchmark_name": "OCRBench"
  },
  {
    "model_benchmark_id": 1635,
    "benchmark_id": "realworldqa",
    "model_id": "deepseek-vl2",
    "score": 0.684,
    "normalized_score": 0.684,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.601290+00:00",
    "updated_at": "2025-07-19T19:56:14.601290+00:00",
    "benchmark_name": "RealWorldQA"
  },
  {
    "model_benchmark_id": 912,
    "benchmark_id": "textvqa",
    "model_id": "deepseek-vl2",
    "score": 0.842,
    "normalized_score": 0.842,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "val",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.902069+00:00",
    "updated_at": "2025-07-19T19:56:12.902069+00:00",
    "benchmark_name": "TextVQA"
  }
]

================================================
FILE: data/organizations/deepseek/models/deepseek-vl2/model.json
================================================
{
  "model_id": "deepseek-vl2",
  "name": "DeepSeek VL2",
  "organization_id": "deepseek",
  "fine_tuned_from_model_id": null,
  "description": "An advanced series of large Mixture-of-Experts (MoE) Vision-Language Models that significantly improves upon its predecessor, DeepSeek-VL. DeepSeek-VL2 demonstrates superior capabilities across various tasks, including but not limited to visual question answering, optical character recognition, document/table/chart understanding, and visual grounding.",
  "release_date": "2024-12-13",
  "announcement_date": "2024-12-13",
  "license_id": "deepseek",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 27000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.deepseek.com/",
  "source_playground": "https://huggingface.co/deepseek-ai/deepseek-vl2",
  "source_paper": "https://arxiv.org/pdf/2412.10302",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-VL2?tab=readme-ov-file",
  "source_weights_link": "https://huggingface.co/deepseek-ai/deepseek-vl2",
  "created_at": "2025-07-19T19:49:05.658016+00:00",
  "updated_at": "2025-07-19T19:49:05.658016+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/deepseek/models/deepseek-vl2-small/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1258,
    "benchmark_id": "ai2d",
    "model_id": "deepseek-vl2-small",
    "score": 0.8,
    "normalized_score": 0.8,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.640145+00:00",
    "updated_at": "2025-07-19T19:56:13.640145+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 870,
    "benchmark_id": "chartqa",
    "model_id": "deepseek-vl2-small",
    "score": 0.845,
    "normalized_score": 0.845,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.816278+00:00",
    "updated_at": "2025-07-19T19:56:12.816278+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 892,
    "benchmark_id": "docvqa",
    "model_id": "deepseek-vl2-small",
    "score": 0.923,
    "normalized_score": 0.923,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.857733+00:00",
    "updated_at": "2025-07-19T19:56:12.857733+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 1246,
    "benchmark_id": "infovqa",
    "model_id": "deepseek-vl2-small",
    "score": 0.758,
    "normalized_score": 0.758,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.617970+00:00",
    "updated_at": "2025-07-19T19:56:13.617970+00:00",
    "benchmark_name": "InfoVQA"
  },
  {
    "model_benchmark_id": 530,
    "benchmark_id": "mathvista",
    "model_id": "deepseek-vl2-small",
    "score": 0.607,
    "normalized_score": 0.607,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "testmini",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.100314+00:00",
    "updated_at": "2025-07-19T19:56:12.100314+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1517,
    "benchmark_id": "mmbench",
    "model_id": "deepseek-vl2-small",
    "score": 0.803,
    "normalized_score": 0.803,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "en test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.252930+00:00",
    "updated_at": "2025-07-19T19:56:14.254459+00:00",
    "benchmark_name": "MMBench"
  },
  {
    "model_benchmark_id": 1729,
    "benchmark_id": "mmbench-v1.1",
    "model_id": "deepseek-vl2-small",
    "score": 0.793,
    "normalized_score": 0.793,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "cn test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.876824+00:00",
    "updated_at": "2025-07-19T19:56:14.876824+00:00",
    "benchmark_name": "MMBench-V1.1"
  },
  {
    "model_benchmark_id": 1786,
    "benchmark_id": "mme",
    "model_id": "deepseek-vl2-small",
    "score": 0.2123,
    "normalized_score": 0.2123,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.028315+00:00",
    "updated_at": "2025-07-19T19:56:15.028315+00:00",
    "benchmark_name": "MME"
  },
  {
    "model_benchmark_id": 576,
    "benchmark_id": "mmmu",
    "model_id": "deepseek-vl2-small",
    "score": 0.48,
    "normalized_score": 0.48,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "val",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.184966+00:00",
    "updated_at": "2025-07-19T19:56:12.184966+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1665,
    "benchmark_id": "mmstar",
    "model_id": "deepseek-vl2-small",
    "score": 0.57,
    "normalized_score": 0.57,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.672978+00:00",
    "updated_at": "2025-07-19T19:56:14.672978+00:00",
    "benchmark_name": "MMStar"
  },
  {
    "model_benchmark_id": 1669,
    "benchmark_id": "mmt-bench",
    "model_id": "deepseek-vl2-small",
    "score": 0.629,
    "normalized_score": 0.629,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.683443+00:00",
    "updated_at": "2025-07-19T19:56:14.683443+00:00",
    "benchmark_name": "MMT-Bench"
  },
  {
    "model_benchmark_id": 1544,
    "benchmark_id": "ocrbench",
    "model_id": "deepseek-vl2-small",
    "score": 0.834,
    "normalized_score": 0.834,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.324965+00:00",
    "updated_at": "2025-07-19T19:56:14.324965+00:00",
    "benchmark_name": "OCRBench"
  },
  {
    "model_benchmark_id": 1637,
    "benchmark_id": "realworldqa",
    "model_id": "deepseek-vl2-small",
    "score": 0.654,
    "normalized_score": 0.654,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.604508+00:00",
    "updated_at": "2025-07-19T19:56:14.604508+00:00",
    "benchmark_name": "RealWorldQA"
  },
  {
    "model_benchmark_id": 914,
    "benchmark_id": "textvqa",
    "model_id": "deepseek-vl2-small",
    "score": 0.834,
    "normalized_score": 0.834,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "val",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.906237+00:00",
    "updated_at": "2025-07-19T19:56:12.906237+00:00",
    "benchmark_name": "TextVQA"
  }
]

================================================
FILE: data/organizations/deepseek/models/deepseek-vl2-small/model.json
================================================
{
  "model_id": "deepseek-vl2-small",
  "name": "DeepSeek VL2 Small",
  "organization_id": "deepseek",
  "fine_tuned_from_model_id": null,
  "description": "An advanced series of large Mixture-of-Experts (MoE) Vision-Language Models that significantly improves upon its predecessor, DeepSeek-VL. DeepSeek-VL2 demonstrates superior capabilities across various tasks, including but not limited to visual question answering, optical character recognition, document/table/chart understanding, and visual grounding.",
  "release_date": "2024-12-13",
  "announcement_date": "2024-12-13",
  "license_id": "deepseek",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 16000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.deepseek.com/",
  "source_playground": "https://huggingface.co/deepseek-ai/deepseek-vl2-small",
  "source_paper": "https://arxiv.org/pdf/2412.10302",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-VL2",
  "source_weights_link": "https://huggingface.co/deepseek-ai/deepseek-vl2-small",
  "created_at": "2025-07-19T19:49:05.666424+00:00",
  "updated_at": "2025-07-19T19:49:05.666424+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/deepseek/models/deepseek-vl2-tiny/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1257,
    "benchmark_id": "ai2d",
    "model_id": "deepseek-vl2-tiny",
    "score": 0.716,
    "normalized_score": 0.716,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.638556+00:00",
    "updated_at": "2025-07-19T19:56:13.638556+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 869,
    "benchmark_id": "chartqa",
    "model_id": "deepseek-vl2-tiny",
    "score": 0.81,
    "normalized_score": 0.81,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.814592+00:00",
    "updated_at": "2025-07-19T19:56:12.814592+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 891,
    "benchmark_id": "docvqa",
    "model_id": "deepseek-vl2-tiny",
    "score": 0.889,
    "normalized_score": 0.889,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.854588+00:00",
    "updated_at": "2025-07-19T19:56:12.854588+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 1245,
    "benchmark_id": "infovqa",
    "model_id": "deepseek-vl2-tiny",
    "score": 0.661,
    "normalized_score": 0.661,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.616113+00:00",
    "updated_at": "2025-07-19T19:56:13.616113+00:00",
    "benchmark_name": "InfoVQA"
  },
  {
    "model_benchmark_id": 529,
    "benchmark_id": "mathvista",
    "model_id": "deepseek-vl2-tiny",
    "score": 0.536,
    "normalized_score": 0.536,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "testmini",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.098477+00:00",
    "updated_at": "2025-07-19T19:56:12.098477+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1515,
    "benchmark_id": "mmbench",
    "model_id": "deepseek-vl2-tiny",
    "score": 0.692,
    "normalized_score": 0.692,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "en test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.249349+00:00",
    "updated_at": "2025-07-19T19:56:14.251060+00:00",
    "benchmark_name": "MMBench"
  },
  {
    "model_benchmark_id": 1728,
    "benchmark_id": "mmbench-v1.1",
    "model_id": "deepseek-vl2-tiny",
    "score": 0.683,
    "normalized_score": 0.683,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "cn test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.875207+00:00",
    "updated_at": "2025-07-19T19:56:14.875207+00:00",
    "benchmark_name": "MMBench-V1.1"
  },
  {
    "model_benchmark_id": 1785,
    "benchmark_id": "mme",
    "model_id": "deepseek-vl2-tiny",
    "score": 0.1915,
    "normalized_score": 0.1915,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.026734+00:00",
    "updated_at": "2025-07-19T19:56:15.026734+00:00",
    "benchmark_name": "MME"
  },
  {
    "model_benchmark_id": 575,
    "benchmark_id": "mmmu",
    "model_id": "deepseek-vl2-tiny",
    "score": 0.407,
    "normalized_score": 0.407,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "val",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.183016+00:00",
    "updated_at": "2025-07-19T19:56:12.183016+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1664,
    "benchmark_id": "mmstar",
    "model_id": "deepseek-vl2-tiny",
    "score": 0.459,
    "normalized_score": 0.459,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.671412+00:00",
    "updated_at": "2025-07-19T19:56:14.671412+00:00",
    "benchmark_name": "MMStar"
  },
  {
    "model_benchmark_id": 1668,
    "benchmark_id": "mmt-bench",
    "model_id": "deepseek-vl2-tiny",
    "score": 0.532,
    "normalized_score": 0.532,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.681683+00:00",
    "updated_at": "2025-07-19T19:56:14.681683+00:00",
    "benchmark_name": "MMT-Bench"
  },
  {
    "model_benchmark_id": 1543,
    "benchmark_id": "ocrbench",
    "model_id": "deepseek-vl2-tiny",
    "score": 0.809,
    "normalized_score": 0.809,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.321888+00:00",
    "updated_at": "2025-07-19T19:56:14.321888+00:00",
    "benchmark_name": "OCRBench"
  },
  {
    "model_benchmark_id": 1636,
    "benchmark_id": "realworldqa",
    "model_id": "deepseek-vl2-tiny",
    "score": 0.642,
    "normalized_score": 0.642,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.602948+00:00",
    "updated_at": "2025-07-19T19:56:14.602948+00:00",
    "benchmark_name": "RealWorldQA"
  },
  {
    "model_benchmark_id": 913,
    "benchmark_id": "textvqa",
    "model_id": "deepseek-vl2-tiny",
    "score": 0.807,
    "normalized_score": 0.807,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.10302",
    "verified_by_llmstats": false,
    "analysis_method": "val",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.904238+00:00",
    "updated_at": "2025-07-19T19:56:12.904238+00:00",
    "benchmark_name": "TextVQA"
  }
]

================================================
FILE: data/organizations/deepseek/models/deepseek-vl2-tiny/model.json
================================================
{
  "model_id": "deepseek-vl2-tiny",
  "name": "DeepSeek VL2 Tiny",
  "organization_id": "deepseek",
  "fine_tuned_from_model_id": null,
  "description": "An advanced series of large Mixture-of-Experts (MoE) Vision-Language Models that significantly improves upon its predecessor, DeepSeek-VL. DeepSeek-VL2 demonstrates superior capabilities across various tasks, including but not limited to visual question answering, optical character recognition, document/table/chart understanding, and visual grounding.",
  "release_date": "2024-12-13",
  "announcement_date": "2024-12-13",
  "license_id": "deepseek",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 3000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.deepseek.com/",
  "source_playground": "https://huggingface.co/deepseek-ai/deepseek-vl2-tiny",
  "source_paper": "https://arxiv.org/pdf/2412.10302",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/deepseek-ai/DeepSeek-VL2",
  "source_weights_link": "https://huggingface.co/deepseek-ai/deepseek-vl2-tiny",
  "created_at": "2025-07-19T19:49:05.662552+00:00",
  "updated_at": "2025-07-19T19:49:05.662552+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/deepseek/organization.json
================================================
{
  "organization_id": "deepseek",
  "name": "DeepSeek",
  "website": "https://deepseek.com",
  "description": "Chinese AI company developing state-of-the-art large language models including the DeepSeek-V3 series with mixture-of-experts architecture and hybrid thinking/non-thinking capabilities",
  "country": "CN",
  "created_at": "2025-07-19T19:49:05.655332+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00"
}


================================================
FILE: data/organizations/google/models/gemini-1.0-pro/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1390,
    "benchmark_id": "big-bench",
    "model_id": "gemini-1.0-pro",
    "score": 0.75,
    "normalized_score": 0.75,
    "is_self_reported": false,
    "self_reported_source_link": "https://example.com/benchmark-image",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.928761+00:00",
    "updated_at": "2025-07-19T19:56:13.928761+00:00",
    "benchmark_name": "BIG-Bench"
  },
  {
    "model_benchmark_id": 920,
    "benchmark_id": "egoschema",
    "model_id": "gemini-1.0-pro",
    "score": 0.557,
    "normalized_score": 0.557,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.922622+00:00",
    "updated_at": "2025-07-19T19:56:12.922622+00:00",
    "benchmark_name": "EgoSchema"
  },
  {
    "model_benchmark_id": 1397,
    "benchmark_id": "fleurs",
    "model_id": "gemini-1.0-pro",
    "score": 0.064,
    "normalized_score": 0.064,
    "is_self_reported": false,
    "self_reported_source_link": "https://example.com/benchmark-image",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.946039+00:00",
    "updated_at": "2025-07-19T19:56:13.946039+00:00",
    "benchmark_name": "FLEURS"
  },
  {
    "model_benchmark_id": 264,
    "benchmark_id": "gpqa",
    "model_id": "gemini-1.0-pro",
    "score": 0.279,
    "normalized_score": 0.279,
    "is_self_reported": false,
    "self_reported_source_link": "https://example.com/benchmark-image",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.607534+00:00",
    "updated_at": "2025-07-19T19:56:11.607534+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 378,
    "benchmark_id": "math",
    "model_id": "gemini-1.0-pro",
    "score": 0.326,
    "normalized_score": 0.326,
    "is_self_reported": false,
    "self_reported_source_link": "https://example.com/benchmark-image",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.817378+00:00",
    "updated_at": "2025-07-19T19:56:11.817378+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 516,
    "benchmark_id": "mathvista",
    "model_id": "gemini-1.0-pro",
    "score": 0.466,
    "normalized_score": 0.466,
    "is_self_reported": false,
    "self_reported_source_link": "https://example.com/benchmark-image",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.073663+00:00",
    "updated_at": "2025-07-19T19:56:12.073663+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 64,
    "benchmark_id": "mmlu",
    "model_id": "gemini-1.0-pro",
    "score": 0.718,
    "normalized_score": 0.718,
    "is_self_reported": true,
    "self_reported_source_link": "https://example.com/benchmark-image",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.221259+00:00",
    "updated_at": "2025-07-19T19:56:11.221259+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 553,
    "benchmark_id": "mmmu",
    "model_id": "gemini-1.0-pro",
    "score": 0.479,
    "normalized_score": 0.479,
    "is_self_reported": false,
    "self_reported_source_link": "https://example.com/benchmark-image",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.139083+00:00",
    "updated_at": "2025-07-19T19:56:12.139083+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1393,
    "benchmark_id": "wmt23",
    "model_id": "gemini-1.0-pro",
    "score": 0.717,
    "normalized_score": 0.717,
    "is_self_reported": false,
    "self_reported_source_link": "https://example.com/benchmark-image",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.937549+00:00",
    "updated_at": "2025-07-19T19:56:13.937549+00:00",
    "benchmark_name": "WMT23"
  }
]

================================================
FILE: data/organizations/google/models/gemini-1.0-pro/model.json
================================================
{
  "model_id": "gemini-1.0-pro",
  "name": "Gemini 1.0 Pro",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemini 1.0 Pro is a Natural Language Processing (NLP) model designed for tasks such as multi-turn text and code chat, and code generation. It supports text input and output, making it ideal for natural language tasks. The model is optimized for handling complex conversations and generating code snippets. It offers adjustable safety settings and supports function calling, but does not support JSON mode, JSON schema, or system instructions. The latest stable version is gemini-1.0-pro-001, and it was last updated in February 2024.",
  "release_date": "2024-02-15",
  "announcement_date": "2024-02-15",
  "license_id": "proprietary",
  "multimodal": false,
  "knowledge_cutoff": "2024-02-01",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.0-pro",
  "source_playground": "https://gemini.google/advanced/",
  "source_paper": "https://arxiv.org/pdf/2312.11805",
  "source_scorecard_blog_link": "https://blog.google/technology/ai/google-gemini-ai/#scalable-efficient",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.461784+00:00",
  "updated_at": "2025-07-19T19:49:05.461784+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemini-1.5-flash/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1417,
    "benchmark_id": "amc-2022-23",
    "model_id": "gemini-1.5-flash",
    "score": 0.348,
    "normalized_score": 0.348,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.maa.org/math-competitions/amc-1012",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy (4-shot)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.997413+00:00",
    "updated_at": "2025-07-19T19:56:13.997413+00:00",
    "benchmark_name": "AMC_2022_23"
  },
  {
    "model_benchmark_id": 1072,
    "benchmark_id": "big-bench-hard",
    "model_id": "gemini-1.5-flash",
    "score": 0.855,
    "normalized_score": 0.855,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2206.04615",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy (3-shot)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.235605+00:00",
    "updated_at": "2025-07-19T19:56:13.235605+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 1399,
    "benchmark_id": "fleurs",
    "model_id": "gemini-1.5-flash",
    "score": 0.096,
    "normalized_score": 0.096,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Word Error Rate",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.949679+00:00",
    "updated_at": "2025-07-19T19:56:13.949679+00:00",
    "benchmark_name": "FLEURS"
  },
  {
    "model_benchmark_id": 1415,
    "benchmark_id": "functionalmath",
    "model_id": "gemini-1.5-flash",
    "score": 0.536,
    "normalized_score": 0.536,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2201.04723",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy (0-shot)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.991969+00:00",
    "updated_at": "2025-07-19T19:56:13.991969+00:00",
    "benchmark_name": "FunctionalMATH"
  },
  {
    "model_benchmark_id": 272,
    "benchmark_id": "gpqa",
    "model_id": "gemini-1.5-flash",
    "score": 0.51,
    "normalized_score": 0.51,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.622361+00:00",
    "updated_at": "2025-07-19T19:56:11.622361+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 981,
    "benchmark_id": "gsm8k",
    "model_id": "gemini-1.5-flash",
    "score": 0.862,
    "normalized_score": 0.862,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2110.14168",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy (11-shot)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.060014+00:00",
    "updated_at": "2025-07-19T19:56:13.060014+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 40,
    "benchmark_id": "hellaswag",
    "model_id": "gemini-1.5-flash",
    "score": 0.865,
    "normalized_score": 0.865,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/1905.07830",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy (10-shot)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.168455+00:00",
    "updated_at": "2025-07-19T19:56:11.168455+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 1158,
    "benchmark_id": "hiddenmath",
    "model_id": "gemini-1.5-flash",
    "score": 0.472,
    "normalized_score": 0.472,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.436585+00:00",
    "updated_at": "2025-07-19T19:56:13.436585+00:00",
    "benchmark_name": "HiddenMath"
  },
  {
    "model_benchmark_id": 768,
    "benchmark_id": "humaneval",
    "model_id": "gemini-1.5-flash",
    "score": 0.743,
    "normalized_score": 0.743,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass Rate",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.617215+00:00",
    "updated_at": "2025-07-19T19:56:12.617215+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 383,
    "benchmark_id": "math",
    "model_id": "gemini-1.5-flash",
    "score": 0.779,
    "normalized_score": 0.779,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.826586+00:00",
    "updated_at": "2025-07-19T19:56:11.826586+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 518,
    "benchmark_id": "mathvista",
    "model_id": "gemini-1.5-flash",
    "score": 0.658,
    "normalized_score": 0.658,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.077492+00:00",
    "updated_at": "2025-07-19T19:56:12.077492+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1276,
    "benchmark_id": "mgsm",
    "model_id": "gemini-1.5-flash",
    "score": 0.826,
    "normalized_score": 0.826,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2305.08916",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy (8-shot)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.676395+00:00",
    "updated_at": "2025-07-19T19:56:13.676395+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 69,
    "benchmark_id": "mmlu",
    "model_id": "gemini-1.5-flash",
    "score": 0.789,
    "normalized_score": 0.789,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2403.05530",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.229674+00:00",
    "updated_at": "2025-07-19T19:56:11.229674+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 168,
    "benchmark_id": "mmlu-pro",
    "model_id": "gemini-1.5-flash",
    "score": 0.673,
    "normalized_score": 0.673,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.426986+00:00",
    "updated_at": "2025-07-19T19:56:11.426986+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 560,
    "benchmark_id": "mmmu",
    "model_id": "gemini-1.5-flash",
    "score": 0.623,
    "normalized_score": 0.623,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.153019+00:00",
    "updated_at": "2025-07-19T19:56:12.153019+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1376,
    "benchmark_id": "mrcr",
    "model_id": "gemini-1.5-flash",
    "score": 0.719,
    "normalized_score": 0.719,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.896456+00:00",
    "updated_at": "2025-07-19T19:56:13.896456+00:00",
    "benchmark_name": "MRCR"
  },
  {
    "model_benchmark_id": 1199,
    "benchmark_id": "natural2code",
    "model_id": "gemini-1.5-flash",
    "score": 0.798,
    "normalized_score": 0.798,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.525034+00:00",
    "updated_at": "2025-07-19T19:56:13.525034+00:00",
    "benchmark_name": "Natural2Code"
  },
  {
    "model_benchmark_id": 1413,
    "benchmark_id": "physicsfinals",
    "model_id": "gemini-1.5-flash",
    "score": 0.574,
    "normalized_score": 0.574,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2303.16416",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy (0-shot)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.986673+00:00",
    "updated_at": "2025-07-19T19:56:13.986673+00:00",
    "benchmark_name": "PhysicsFinals"
  },
  {
    "model_benchmark_id": 1369,
    "benchmark_id": "vibe-eval",
    "model_id": "gemini-1.5-flash",
    "score": 0.489,
    "normalized_score": 0.489,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.882991+00:00",
    "updated_at": "2025-07-19T19:56:13.882991+00:00",
    "benchmark_name": "Vibe-Eval"
  },
  {
    "model_benchmark_id": 1381,
    "benchmark_id": "video-mme",
    "model_id": "gemini-1.5-flash",
    "score": 0.761,
    "normalized_score": 0.761,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.908485+00:00",
    "updated_at": "2025-07-19T19:56:13.908485+00:00",
    "benchmark_name": "Video-MME"
  },
  {
    "model_benchmark_id": 1395,
    "benchmark_id": "wmt23",
    "model_id": "gemini-1.5-flash",
    "score": 0.741,
    "normalized_score": 0.741,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.940965+00:00",
    "updated_at": "2025-07-19T19:56:13.940965+00:00",
    "benchmark_name": "WMT23"
  },
  {
    "model_benchmark_id": 1419,
    "benchmark_id": "xstest",
    "model_id": "gemini-1.5-flash",
    "score": 0.97,
    "normalized_score": 0.97,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.004109+00:00",
    "updated_at": "2025-07-19T19:56:14.004109+00:00",
    "benchmark_name": "XSTest"
  }
]

================================================
FILE: data/organizations/google/models/gemini-1.5-flash/model.json
================================================
{
  "model_id": "gemini-1.5-flash",
  "name": "Gemini 1.5 Flash",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. It supports audio, images, video, and text input, and produces text output. The model is optimized for generating code, extracting data, editing text, and more, making it ideal for narrow, high-frequency tasks.",
  "release_date": "2024-05-01",
  "announcement_date": "2024-05-01",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2023-11-01",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-flash",
  "source_playground": "https://ai.google.dev/studio",
  "source_paper": "https://arxiv.org/pdf/2403.05530",
  "source_scorecard_blog_link": "https://deepmind.google/technologies/gemini/flash/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.514569+00:00",
  "updated_at": "2025-07-19T19:49:05.514569+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemini-1.5-flash-8b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1400,
    "benchmark_id": "fleurs",
    "model_id": "gemini-1.5-flash-8b",
    "score": 0.864,
    "normalized_score": 0.864,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Speech recognition accuracy (1 - WER)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.951665+00:00",
    "updated_at": "2025-07-19T19:56:13.951665+00:00",
    "benchmark_name": "FLEURS"
  },
  {
    "model_benchmark_id": 277,
    "benchmark_id": "gpqa",
    "model_id": "gemini-1.5-flash-8b",
    "score": 0.384,
    "normalized_score": 0.384,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy on expert-written science questions",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.635441+00:00",
    "updated_at": "2025-07-19T19:56:11.635441+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1163,
    "benchmark_id": "hiddenmath",
    "model_id": "gemini-1.5-flash-8b",
    "score": 0.328,
    "normalized_score": 0.328,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy on competition-level math problems",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.447290+00:00",
    "updated_at": "2025-07-19T19:56:13.447290+00:00",
    "benchmark_name": "HiddenMath"
  },
  {
    "model_benchmark_id": 387,
    "benchmark_id": "math",
    "model_id": "gemini-1.5-flash-8b",
    "score": 0.587,
    "normalized_score": 0.587,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy on mathematical problem-solving tasks",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.834192+00:00",
    "updated_at": "2025-07-19T19:56:11.834192+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 519,
    "benchmark_id": "mathvista",
    "model_id": "gemini-1.5-flash-8b",
    "score": 0.547,
    "normalized_score": 0.547,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Visual mathematical reasoning accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.078820+00:00",
    "updated_at": "2025-07-19T19:56:12.078820+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 173,
    "benchmark_id": "mmlu-pro",
    "model_id": "gemini-1.5-flash-8b",
    "score": 0.587,
    "normalized_score": 0.587,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Multiple choice accuracy across enhanced MMLU dataset with higher difficulty tasks",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.436045+00:00",
    "updated_at": "2025-07-19T19:56:11.436045+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 561,
    "benchmark_id": "mmmu",
    "model_id": "gemini-1.5-flash-8b",
    "score": 0.537,
    "normalized_score": 0.537,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Multimodal understanding accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.154594+00:00",
    "updated_at": "2025-07-19T19:56:12.154594+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1377,
    "benchmark_id": "mrcr",
    "model_id": "gemini-1.5-flash-8b",
    "score": 0.547,
    "normalized_score": 0.547,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Long-context comprehension accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.898262+00:00",
    "updated_at": "2025-07-19T19:56:13.898262+00:00",
    "benchmark_name": "MRCR"
  },
  {
    "model_benchmark_id": 1203,
    "benchmark_id": "natural2code",
    "model_id": "gemini-1.5-flash-8b",
    "score": 0.755,
    "normalized_score": 0.755,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass rate on code generation tasks across multiple programming languages",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.531432+00:00",
    "updated_at": "2025-07-19T19:56:13.531432+00:00",
    "benchmark_name": "Natural2Code"
  },
  {
    "model_benchmark_id": 1370,
    "benchmark_id": "vibe-eval",
    "model_id": "gemini-1.5-flash-8b",
    "score": 0.409,
    "normalized_score": 0.409,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Visual understanding evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.885058+00:00",
    "updated_at": "2025-07-19T19:56:13.885058+00:00",
    "benchmark_name": "Vibe-Eval"
  },
  {
    "model_benchmark_id": 1382,
    "benchmark_id": "video-mme",
    "model_id": "gemini-1.5-flash-8b",
    "score": 0.662,
    "normalized_score": 0.662,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Video analysis accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.910273+00:00",
    "updated_at": "2025-07-19T19:56:13.910273+00:00",
    "benchmark_name": "Video-MME"
  },
  {
    "model_benchmark_id": 1396,
    "benchmark_id": "wmt23",
    "model_id": "gemini-1.5-flash-8b",
    "score": 0.726,
    "normalized_score": 0.726,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Translation quality score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.942779+00:00",
    "updated_at": "2025-07-19T19:56:13.942779+00:00",
    "benchmark_name": "WMT23"
  },
  {
    "model_benchmark_id": 1420,
    "benchmark_id": "xstest",
    "model_id": "gemini-1.5-flash-8b",
    "score": 0.926,
    "normalized_score": 0.926,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Safe request fulfillment rate",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.005888+00:00",
    "updated_at": "2025-07-19T19:56:14.005888+00:00",
    "benchmark_name": "XSTest"
  }
]

================================================
FILE: data/organizations/google/models/gemini-1.5-flash-8b/model.json
================================================
{
  "model_id": "gemini-1.5-flash-8b",
  "name": "Gemini 1.5 Flash 8B",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "A multimodal model capable of processing audio, images, video, and text with high efficiency. Features JSON mode, function calling, code execution, and system instructions support. Optimized for fast inference with 8B parameters.",
  "release_date": "2024-03-15",
  "announcement_date": "2024-03-15",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-10-01",
  "param_count": 8000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://ai.google.dev/docs/gemini_1.5_flash",
  "source_playground": "https://ai.google.dev/studio",
  "source_paper": null,
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/google/generative-ai",
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.530672+00:00",
  "updated_at": "2025-07-19T19:49:05.530672+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemini-1.5-pro/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1416,
    "benchmark_id": "amc-2022-23",
    "model_id": "gemini-1.5-pro",
    "score": 0.464,
    "normalized_score": 0.464,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2403.05530",
    "verified_by_llmstats": false,
    "analysis_method": "4-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.995700+00:00",
    "updated_at": "2025-07-19T19:56:13.995700+00:00",
    "benchmark_name": "AMC_2022_23"
  },
  {
    "model_benchmark_id": 1070,
    "benchmark_id": "big-bench-hard",
    "model_id": "gemini-1.5-pro",
    "score": 0.892,
    "normalized_score": 0.892,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2403.05530",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.231702+00:00",
    "updated_at": "2025-07-19T19:56:13.231702+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 945,
    "benchmark_id": "drop",
    "model_id": "gemini-1.5-pro",
    "score": 0.749,
    "normalized_score": 0.749,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2403.05530",
    "verified_by_llmstats": false,
    "analysis_method": "Variable shots",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.994980+00:00",
    "updated_at": "2025-07-19T19:56:12.994980+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 1398,
    "benchmark_id": "fleurs",
    "model_id": "gemini-1.5-pro",
    "score": 0.067,
    "normalized_score": 0.067,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Word Error Rate",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.947638+00:00",
    "updated_at": "2025-07-19T19:56:13.947638+00:00",
    "benchmark_name": "FLEURS"
  },
  {
    "model_benchmark_id": 1414,
    "benchmark_id": "functionalmath",
    "model_id": "gemini-1.5-pro",
    "score": 0.646,
    "normalized_score": 0.646,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2403.05530",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.990248+00:00",
    "updated_at": "2025-07-19T19:56:13.990248+00:00",
    "benchmark_name": "FunctionalMATH"
  },
  {
    "model_benchmark_id": 268,
    "benchmark_id": "gpqa",
    "model_id": "gemini-1.5-pro",
    "score": 0.591,
    "normalized_score": 0.591,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.614440+00:00",
    "updated_at": "2025-07-19T19:56:11.614440+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 979,
    "benchmark_id": "gsm8k",
    "model_id": "gemini-1.5-pro",
    "score": 0.908,
    "normalized_score": 0.908,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2403.05530",
    "verified_by_llmstats": false,
    "analysis_method": "11-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.055992+00:00",
    "updated_at": "2025-07-19T19:56:13.055992+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 37,
    "benchmark_id": "hellaswag",
    "model_id": "gemini-1.5-pro",
    "score": 0.933,
    "normalized_score": 0.933,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2403.05530",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.158919+00:00",
    "updated_at": "2025-07-19T19:56:11.158919+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 1157,
    "benchmark_id": "hiddenmath",
    "model_id": "gemini-1.5-pro",
    "score": 0.52,
    "normalized_score": 0.52,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.434888+00:00",
    "updated_at": "2025-07-19T19:56:13.434888+00:00",
    "benchmark_name": "HiddenMath"
  },
  {
    "model_benchmark_id": 766,
    "benchmark_id": "humaneval",
    "model_id": "gemini-1.5-pro",
    "score": 0.841,
    "normalized_score": 0.841,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2403.05530",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.613548+00:00",
    "updated_at": "2025-07-19T19:56:12.613548+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 381,
    "benchmark_id": "math",
    "model_id": "gemini-1.5-pro",
    "score": 0.865,
    "normalized_score": 0.865,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.822515+00:00",
    "updated_at": "2025-07-19T19:56:11.822515+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 517,
    "benchmark_id": "mathvista",
    "model_id": "gemini-1.5-pro",
    "score": 0.681,
    "normalized_score": 0.681,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.075702+00:00",
    "updated_at": "2025-07-19T19:56:12.075702+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1275,
    "benchmark_id": "mgsm",
    "model_id": "gemini-1.5-pro",
    "score": 0.875,
    "normalized_score": 0.875,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2403.05530",
    "verified_by_llmstats": false,
    "analysis_method": "8-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.674684+00:00",
    "updated_at": "2025-07-19T19:56:13.674684+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 67,
    "benchmark_id": "mmlu",
    "model_id": "gemini-1.5-pro",
    "score": 0.859,
    "normalized_score": 0.859,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2403.05530",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.226593+00:00",
    "updated_at": "2025-07-19T19:56:11.226593+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 167,
    "benchmark_id": "mmlu-pro",
    "model_id": "gemini-1.5-pro",
    "score": 0.758,
    "normalized_score": 0.758,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.425109+00:00",
    "updated_at": "2025-07-19T19:56:11.425109+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 556,
    "benchmark_id": "mmmu",
    "model_id": "gemini-1.5-pro",
    "score": 0.659,
    "normalized_score": 0.659,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.145100+00:00",
    "updated_at": "2025-07-19T19:56:12.145100+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1373,
    "benchmark_id": "mrcr",
    "model_id": "gemini-1.5-pro",
    "score": 0.826,
    "normalized_score": 0.826,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.891629+00:00",
    "updated_at": "2025-07-19T19:56:13.891629+00:00",
    "benchmark_name": "MRCR"
  },
  {
    "model_benchmark_id": 1198,
    "benchmark_id": "natural2code",
    "model_id": "gemini-1.5-pro",
    "score": 0.854,
    "normalized_score": 0.854,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.523328+00:00",
    "updated_at": "2025-07-19T19:56:13.523328+00:00",
    "benchmark_name": "Natural2Code"
  },
  {
    "model_benchmark_id": 1412,
    "benchmark_id": "physicsfinals",
    "model_id": "gemini-1.5-pro",
    "score": 0.639,
    "normalized_score": 0.639,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2403.05530",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.984883+00:00",
    "updated_at": "2025-07-19T19:56:13.984883+00:00",
    "benchmark_name": "PhysicsFinals"
  },
  {
    "model_benchmark_id": 1366,
    "benchmark_id": "vibe-eval",
    "model_id": "gemini-1.5-pro",
    "score": 0.539,
    "normalized_score": 0.539,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.877591+00:00",
    "updated_at": "2025-07-19T19:56:13.877591+00:00",
    "benchmark_name": "Vibe-Eval"
  },
  {
    "model_benchmark_id": 1380,
    "benchmark_id": "video-mme",
    "model_id": "gemini-1.5-pro",
    "score": 0.786,
    "normalized_score": 0.786,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.906552+00:00",
    "updated_at": "2025-07-19T19:56:13.906552+00:00",
    "benchmark_name": "Video-MME"
  },
  {
    "model_benchmark_id": 1394,
    "benchmark_id": "wmt23",
    "model_id": "gemini-1.5-pro",
    "score": 0.751,
    "normalized_score": 0.751,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.939104+00:00",
    "updated_at": "2025-07-19T19:56:13.939104+00:00",
    "benchmark_name": "WMT23"
  },
  {
    "model_benchmark_id": 1418,
    "benchmark_id": "xstest",
    "model_id": "gemini-1.5-pro",
    "score": 0.988,
    "normalized_score": 0.988,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/technologies/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Safety Compliance",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.002222+00:00",
    "updated_at": "2025-07-19T19:56:14.002222+00:00",
    "benchmark_name": "XSTest"
  }
]

================================================
FILE: data/organizations/google/models/gemini-1.5-pro/model.json
================================================
{
  "model_id": "gemini-1.5-pro",
  "name": "Gemini 1.5 Pro",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemini 1.5 Pro is a mid-size multimodal model optimized for a wide range of reasoning tasks. It can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text.",
  "release_date": "2024-05-01",
  "announcement_date": "2024-05-01",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2023-11-01",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-pro",
  "source_playground": "https://ai.google.dev/studio",
  "source_paper": "https://arxiv.org/pdf/2403.05530",
  "source_scorecard_blog_link": "https://deepmind.google/technologies/gemini/pro/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.481673+00:00",
  "updated_at": "2025-07-19T19:49:05.481673+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemini-2.0-flash/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1152,
    "benchmark_id": "bird-sql-(dev)",
    "model_id": "gemini-2.0-flash",
    "score": 0.569,
    "normalized_score": 0.569,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
    "verified_by_llmstats": false,
    "analysis_method": "Natural language to SQL conversion evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.423568+00:00",
    "updated_at": "2025-07-19T19:56:13.423568+00:00",
    "benchmark_name": "Bird-SQL (dev)"
  },
  {
    "model_benchmark_id": 1404,
    "benchmark_id": "covost2",
    "model_id": "gemini-2.0-flash",
    "score": 0.392,
    "normalized_score": 0.392,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
    "verified_by_llmstats": false,
    "analysis_method": "Automatic speech translation (BLEU score) across 21 languages",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.962212+00:00",
    "updated_at": "2025-07-19T19:56:13.962212+00:00",
    "benchmark_name": "CoVoST2"
  },
  {
    "model_benchmark_id": 922,
    "benchmark_id": "egoschema",
    "model_id": "gemini-2.0-flash",
    "score": 0.715,
    "normalized_score": 0.715,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
    "verified_by_llmstats": false,
    "analysis_method": "Video analysis across multiple domains",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.926117+00:00",
    "updated_at": "2025-07-19T19:56:12.926117+00:00",
    "benchmark_name": "EgoSchema"
  },
  {
    "model_benchmark_id": 1095,
    "benchmark_id": "facts-grounding",
    "model_id": "gemini-2.0-flash",
    "score": 0.836,
    "normalized_score": 0.836,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
    "verified_by_llmstats": false,
    "analysis_method": "Ability to provide factually correct responses given documents and diverse user requests",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.278460+00:00",
    "updated_at": "2025-07-19T19:56:13.278460+00:00",
    "benchmark_name": "FACTS Grounding"
  },
  {
    "model_benchmark_id": 279,
    "benchmark_id": "gpqa",
    "model_id": "gemini-2.0-flash",
    "score": 0.621,
    "normalized_score": 0.621,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
    "verified_by_llmstats": false,
    "analysis_method": "Challenging dataset of questions written by domain experts in biology, physics, and chemistry",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.639283+00:00",
    "updated_at": "2025-07-19T19:56:11.639283+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1164,
    "benchmark_id": "hiddenmath",
    "model_id": "gemini-2.0-flash",
    "score": 0.63,
    "normalized_score": 0.63,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
    "verified_by_llmstats": false,
    "analysis_method": "Competition-level math problems, Held out dataset AIME/AMC-like",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.449979+00:00",
    "updated_at": "2025-07-19T19:56:13.449979+00:00",
    "benchmark_name": "HiddenMath"
  },
  {
    "model_benchmark_id": 1111,
    "benchmark_id": "livecodebench",
    "model_id": "gemini-2.0-flash",
    "score": 0.351,
    "normalized_score": 0.351,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
    "verified_by_llmstats": false,
    "analysis_method": "Code generation in Python. Code Generation subset covering more recent examples: 06/01/2024 - 10/05/2024",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.317443+00:00",
    "updated_at": "2025-07-19T19:56:13.317443+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 388,
    "benchmark_id": "math",
    "model_id": "gemini-2.0-flash",
    "score": 0.897,
    "normalized_score": 0.897,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
    "verified_by_llmstats": false,
    "analysis_method": "Challenging math problems including algebra, geometry, pre-calculus, and others",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.835842+00:00",
    "updated_at": "2025-07-19T19:56:11.835842+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 174,
    "benchmark_id": "mmlu-pro",
    "model_id": "gemini-2.0-flash",
    "score": 0.764,
    "normalized_score": 0.764,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
    "verified_by_llmstats": false,
    "analysis_method": "Enhanced version of MMLU dataset evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.437540+00:00",
    "updated_at": "2025-07-19T19:56:11.437540+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 562,
    "benchmark_id": "mmmu",
    "model_id": "gemini-2.0-flash",
    "score": 0.707,
    "normalized_score": 0.707,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
    "verified_by_llmstats": false,
    "analysis_method": "Multi-discipline college-level multimodal understanding and reasoning problems",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.156776+00:00",
    "updated_at": "2025-07-19T19:56:12.156776+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1378,
    "benchmark_id": "mrcr",
    "model_id": "gemini-2.0-flash",
    "score": 0.692,
    "normalized_score": 0.692,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
    "verified_by_llmstats": false,
    "analysis_method": "Novel, diagnostic long-context understanding evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.900780+00:00",
    "updated_at": "2025-07-19T19:56:13.900780+00:00",
    "benchmark_name": "MRCR"
  },
  {
    "model_benchmark_id": 1204,
    "benchmark_id": "natural2code",
    "model_id": "gemini-2.0-flash",
    "score": 0.929,
    "normalized_score": 0.929,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
    "verified_by_llmstats": false,
    "analysis_method": "Code generation evaluation across multiple languages",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.533525+00:00",
    "updated_at": "2025-07-19T19:56:13.533525+00:00",
    "benchmark_name": "Natural2Code"
  },
  {
    "model_benchmark_id": 1371,
    "benchmark_id": "vibe-eval",
    "model_id": "gemini-2.0-flash",
    "score": 0.563,
    "normalized_score": 0.563,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
    "verified_by_llmstats": false,
    "analysis_method": "Visual understanding in chat models with challenging everyday examples",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.886575+00:00",
    "updated_at": "2025-07-19T19:56:13.886575+00:00",
    "benchmark_name": "Vibe-Eval"
  }
]

================================================
FILE: data/organizations/google/models/gemini-2.0-flash/model.json
================================================
{
  "model_id": "gemini-2.0-flash",
  "name": "Gemini 2.0 Flash",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Next-generation model featuring superior speed, native tool use, multimodal generation, and a 1M token context window. Supports audio, images, video, and text input with capabilities for structured outputs, function calling, code execution, search, and multimodal operations.",
  "release_date": "2024-12-01",
  "announcement_date": "2024-12-01",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-08-01",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.0-flash",
  "source_playground": "https://ai.google.dev/studio",
  "source_paper": null,
  "source_scorecard_blog_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.538624+00:00",
  "updated_at": "2025-07-19T19:49:05.538624+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemini-2.0-flash-lite/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1148,
    "benchmark_id": "bird-sql-(dev)",
    "model_id": "gemini-2.0-flash-lite",
    "score": 0.574,
    "normalized_score": 0.574,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models",
    "verified_by_llmstats": false,
    "analysis_method": "- evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.415349+00:00",
    "updated_at": "2025-07-19T19:56:13.415349+00:00",
    "benchmark_name": "Bird-SQL (dev)"
  },
  {
    "model_benchmark_id": 1403,
    "benchmark_id": "covost2",
    "model_id": "gemini-2.0-flash-lite",
    "score": 0.384,
    "normalized_score": 0.384,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models",
    "verified_by_llmstats": false,
    "analysis_method": "Automatic speech translation (BLEU score) across 21 languages",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.960537+00:00",
    "updated_at": "2025-07-19T19:56:13.960537+00:00",
    "benchmark_name": "CoVoST2"
  },
  {
    "model_benchmark_id": 921,
    "benchmark_id": "egoschema",
    "model_id": "gemini-2.0-flash-lite",
    "score": 0.672,
    "normalized_score": 0.672,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models",
    "verified_by_llmstats": false,
    "analysis_method": "Video analysis across multiple domains",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.924659+00:00",
    "updated_at": "2025-07-19T19:56:12.924659+00:00",
    "benchmark_name": "EgoSchema"
  },
  {
    "model_benchmark_id": 1088,
    "benchmark_id": "facts-grounding",
    "model_id": "gemini-2.0-flash-lite",
    "score": 0.836,
    "normalized_score": 0.836,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models",
    "verified_by_llmstats": false,
    "analysis_method": "- evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.264333+00:00",
    "updated_at": "2025-07-19T19:56:13.264333+00:00",
    "benchmark_name": "FACTS Grounding"
  },
  {
    "model_benchmark_id": 1209,
    "benchmark_id": "global-mmlu-lite",
    "model_id": "gemini-2.0-flash-lite",
    "score": 0.782,
    "normalized_score": 0.782,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.543616+00:00",
    "updated_at": "2025-07-19T19:56:13.543616+00:00",
    "benchmark_name": "Global-MMLU-Lite"
  },
  {
    "model_benchmark_id": 266,
    "benchmark_id": "gpqa",
    "model_id": "gemini-2.0-flash-lite",
    "score": 0.515,
    "normalized_score": 0.515,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.611234+00:00",
    "updated_at": "2025-07-19T19:56:11.611234+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1156,
    "benchmark_id": "hiddenmath",
    "model_id": "gemini-2.0-flash-lite",
    "score": 0.553,
    "normalized_score": 0.553,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.433332+00:00",
    "updated_at": "2025-07-19T19:56:13.433332+00:00",
    "benchmark_name": "HiddenMath"
  },
  {
    "model_benchmark_id": 1320,
    "benchmark_id": "livecodebench-v5",
    "model_id": "gemini-2.0-flash-lite",
    "score": 0.289,
    "normalized_score": 0.289,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.771288+00:00",
    "updated_at": "2025-07-19T19:56:13.771288+00:00",
    "benchmark_name": "LiveCodeBench v5"
  },
  {
    "model_benchmark_id": 379,
    "benchmark_id": "math",
    "model_id": "gemini-2.0-flash-lite",
    "score": 0.868,
    "normalized_score": 0.868,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.819524+00:00",
    "updated_at": "2025-07-19T19:56:11.819524+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 166,
    "benchmark_id": "mmlu-pro",
    "model_id": "gemini-2.0-flash-lite",
    "score": 0.716,
    "normalized_score": 0.716,
    "is_self_reported": true,
    "self_reported_source_link": "https://developers.googleblog.com/en/gemini-2-family-expands/",
    "verified_by_llmstats": false,
    "analysis_method": "Chain-of-Thought accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.423223+00:00",
    "updated_at": "2025-07-19T19:56:11.423223+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 554,
    "benchmark_id": "mmmu",
    "model_id": "gemini-2.0-flash-lite",
    "score": 0.68,
    "normalized_score": 0.68,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models",
    "verified_by_llmstats": false,
    "analysis_method": "Multi-discipline college-level multimodal understanding and reasoning problems",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.141505+00:00",
    "updated_at": "2025-07-19T19:56:12.141505+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1402,
    "benchmark_id": "mrcr-1m",
    "model_id": "gemini-2.0-flash-lite",
    "score": 0.58,
    "normalized_score": 0.58,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models",
    "verified_by_llmstats": false,
    "analysis_method": "Long-context comprehension accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.956748+00:00",
    "updated_at": "2025-07-19T19:56:13.956748+00:00",
    "benchmark_name": "MRCR 1M"
  },
  {
    "model_benchmark_id": 226,
    "benchmark_id": "simpleqa",
    "model_id": "gemini-2.0-flash-lite",
    "score": 0.217,
    "normalized_score": 0.217,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models",
    "verified_by_llmstats": false,
    "analysis_method": "Factuality",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.535234+00:00",
    "updated_at": "2025-07-19T19:56:11.535234+00:00",
    "benchmark_name": "SimpleQA"
  }
]

================================================
FILE: data/organizations/google/models/gemini-2.0-flash-lite/model.json
================================================
{
  "model_id": "gemini-2.0-flash-lite",
  "name": "Gemini 2.0 Flash-Lite",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "A Gemini 2.0 Flash model optimized for cost efficiency and low latency",
  "release_date": "2025-02-05",
  "announcement_date": "2025-02-05",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-06-01",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash-lite",
  "source_playground": "https://aistudio.google.com/prompts/new_chat?model=gemini-2.0-flash-lite",
  "source_paper": null,
  "source_scorecard_blog_link": "https://developers.googleblog.com/en/gemini-2-family-expands",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.469548+00:00",
  "updated_at": "2025-07-19T19:49:05.469548+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemini-2.0-flash-thinking/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 448,
    "benchmark_id": "aime-2024",
    "model_id": "gemini-2.0-flash-thinking",
    "score": 0.733,
    "normalized_score": 0.733,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models/gemini#evaluation",
    "verified_by_llmstats": false,
    "analysis_method": "Enhanced reasoning on competition-level math prompts",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.952263+00:00",
    "updated_at": "2025-07-19T19:56:11.952263+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 271,
    "benchmark_id": "gpqa",
    "model_id": "gemini-2.0-flash-thinking",
    "score": 0.742,
    "normalized_score": 0.742,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models/gemini#evaluation",
    "verified_by_llmstats": false,
    "analysis_method": "Challenging science questions requiring chain-of-thought reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.620752+00:00",
    "updated_at": "2025-07-19T19:56:11.620752+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 559,
    "benchmark_id": "mmmu",
    "model_id": "gemini-2.0-flash-thinking",
    "score": 0.754,
    "normalized_score": 0.754,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemini-api/docs/models/gemini#evaluation",
    "verified_by_llmstats": false,
    "analysis_method": "Image-text QA across various domains",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.151038+00:00",
    "updated_at": "2025-07-19T19:56:12.151038+00:00",
    "benchmark_name": "MMMU"
  }
]

================================================
FILE: data/organizations/google/models/gemini-2.0-flash-thinking/model.json
================================================
{
  "model_id": "gemini-2.0-flash-thinking",
  "name": "Gemini 2.0 Flash Thinking",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemini 2.0 Flash Thinking is an enhanced reasoning model, capable of showing its thoughts to improve performance and explainability. Combining speed and performance, Gemini 2.0 Flash Thinking also excels in science and math, showing its thinking to solve complex problems.",
  "release_date": "2025-01-21",
  "announcement_date": "2025-01-21",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-08-01",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.0-flash-thinking-experimental",
  "source_playground": "https://ai.google.dev/studio",
  "source_paper": null,
  "source_scorecard_blog_link": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.504495+00:00",
  "updated_at": "2025-07-19T19:49:05.504495+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemini-2.5-flash/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 661,
    "benchmark_id": "aider-polyglot",
    "model_id": "gemini-2.5-flash",
    "score": 0.619,
    "normalized_score": 0.619,
    "is_self_reported": true,
    "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/",
    "verified_by_llmstats": false,
    "analysis_method": "whole",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.370513+00:00",
    "updated_at": "2025-07-19T19:56:12.370513+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 1329,
    "benchmark_id": "aider-polyglot-edit",
    "model_id": "gemini-2.5-flash",
    "score": 0.567,
    "normalized_score": 0.567,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-updates-io-2025",
    "verified_by_llmstats": false,
    "analysis_method": "Diff-Fenced",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.795058+00:00",
    "updated_at": "2025-07-19T19:56:13.795058+00:00",
    "benchmark_name": "Aider-Polyglot Edit"
  },
  {
    "model_benchmark_id": 447,
    "benchmark_id": "aime-2024",
    "model_id": "gemini-2.5-flash",
    "score": 0.88,
    "normalized_score": 0.88,
    "is_self_reported": true,
    "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.950448+00:00",
    "updated_at": "2025-07-19T19:56:11.950448+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 683,
    "benchmark_id": "aime-2025",
    "model_id": "gemini-2.5-flash",
    "score": 0.72,
    "normalized_score": 0.72,
    "is_self_reported": true,
    "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.428509+00:00",
    "updated_at": "2025-07-19T19:56:12.428509+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1091,
    "benchmark_id": "facts-grounding",
    "model_id": "gemini-2.5-flash",
    "score": 0.853,
    "normalized_score": 0.853,
    "is_self_reported": true,
    "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.271323+00:00",
    "updated_at": "2025-07-19T19:56:13.271323+00:00",
    "benchmark_name": "FACTS Grounding"
  },
  {
    "model_benchmark_id": 1212,
    "benchmark_id": "global-mmlu-lite",
    "model_id": "gemini-2.5-flash",
    "score": 0.884,
    "normalized_score": 0.884,
    "is_self_reported": true,
    "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.550549+00:00",
    "updated_at": "2025-07-19T19:56:13.550549+00:00",
    "benchmark_name": "Global-MMLU-Lite"
  },
  {
    "model_benchmark_id": 270,
    "benchmark_id": "gpqa",
    "model_id": "gemini-2.5-flash",
    "score": 0.828,
    "normalized_score": 0.828,
    "is_self_reported": true,
    "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.619078+00:00",
    "updated_at": "2025-07-19T19:56:11.619078+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 720,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "gemini-2.5-flash",
    "score": 0.11,
    "normalized_score": 0.11,
    "is_self_reported": true,
    "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.518055+00:00",
    "updated_at": "2025-07-19T19:56:12.518055+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 1321,
    "benchmark_id": "livecodebench-v5",
    "model_id": "gemini-2.5-flash",
    "score": 0.639,
    "normalized_score": 0.639,
    "is_self_reported": true,
    "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.773194+00:00",
    "updated_at": "2025-07-19T19:56:13.773194+00:00",
    "benchmark_name": "LiveCodeBench v5"
  },
  {
    "model_benchmark_id": 558,
    "benchmark_id": "mmmu",
    "model_id": "gemini-2.5-flash",
    "score": 0.797,
    "normalized_score": 0.797,
    "is_self_reported": true,
    "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.148985+00:00",
    "updated_at": "2025-07-19T19:56:12.148985+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1374,
    "benchmark_id": "mrcr",
    "model_id": "gemini-2.5-flash",
    "score": 0.32,
    "normalized_score": 0.32,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/technology/google-deepmind/google-gemini-updates-io-2025",
    "verified_by_llmstats": false,
    "analysis_method": "1M-pointwise",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.893404+00:00",
    "updated_at": "2025-07-19T19:56:13.895016+00:00",
    "benchmark_name": "MRCR"
  },
  {
    "model_benchmark_id": 229,
    "benchmark_id": "simpleqa",
    "model_id": "gemini-2.5-flash",
    "score": 0.269,
    "normalized_score": 0.269,
    "is_self_reported": true,
    "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.540281+00:00",
    "updated_at": "2025-07-19T19:56:11.540281+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 1341,
    "benchmark_id": "swe-bench-verified",
    "model_id": "gemini-2.5-flash",
    "score": 0.604,
    "normalized_score": 0.604,
    "is_self_reported": true,
    "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.822771+00:00",
    "updated_at": "2025-07-19T19:56:13.822771+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1368,
    "benchmark_id": "vibe-eval",
    "model_id": "gemini-2.5-flash",
    "score": 0.654,
    "normalized_score": 0.654,
    "is_self_reported": true,
    "self_reported_source_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.880772+00:00",
    "updated_at": "2025-07-19T19:56:13.880772+00:00",
    "benchmark_name": "Vibe-Eval"
  }
]

================================================
FILE: data/organizations/google/models/gemini-2.5-flash/model.json
================================================
{
  "model_id": "gemini-2.5-flash",
  "name": "Gemini 2.5 Flash",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "A thinking model designed for a balance between price and performance. It builds upon Gemini 2.0 Flash with upgraded reasoning, hybrid thinking control, multimodal capabilities (text, image, video, audio input), and a 1M token input context window.",
  "release_date": "2025-05-20",
  "announcement_date": "2025-05-20",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2025-01-31",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://ai.google.dev/gemini-api/docs/models?hl=en#gemini-2.5-flash-preview-04-17",
  "source_playground": "https://aistudio.google.com/?model=gemini-2.5-flash-preview-04-17",
  "source_paper": null,
  "source_scorecard_blog_link": "https://developers.googleblog.com/en/start-building-with-gemini-25-flash/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.500918+00:00",
  "updated_at": "2025-07-19T19:49:05.500918+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemini-2.5-flash-lite/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 659,
    "benchmark_id": "aider-polyglot",
    "model_id": "gemini-2.5-flash-lite",
    "score": 0.267,
    "normalized_score": 0.267,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/",
    "verified_by_llmstats": false,
    "analysis_method": "Code editing",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.366506+00:00",
    "updated_at": "2025-07-19T19:56:12.366506+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 681,
    "benchmark_id": "aime-2025",
    "model_id": "gemini-2.5-flash-lite",
    "score": 0.498,
    "normalized_score": 0.498,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/",
    "verified_by_llmstats": false,
    "analysis_method": "Mathematics",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.422347+00:00",
    "updated_at": "2025-07-19T19:56:12.422347+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1406,
    "benchmark_id": "arc",
    "model_id": "gemini-2.5-flash-lite",
    "score": 0.025,
    "normalized_score": 0.025,
    "is_self_reported": true,
    "self_reported_source_link": "https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash-lite",
    "verified_by_llmstats": false,
    "analysis_method": "Default",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.969921+00:00",
    "updated_at": "2025-07-19T19:56:13.969921+00:00",
    "benchmark_name": "Arc"
  },
  {
    "model_benchmark_id": 1089,
    "benchmark_id": "facts-grounding",
    "model_id": "gemini-2.5-flash-lite",
    "score": 0.841,
    "normalized_score": 0.841,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/",
    "verified_by_llmstats": false,
    "analysis_method": "Factuality",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.267251+00:00",
    "updated_at": "2025-07-19T19:56:13.267251+00:00",
    "benchmark_name": "FACTS Grounding"
  },
  {
    "model_benchmark_id": 1210,
    "benchmark_id": "global-mmlu-lite",
    "model_id": "gemini-2.5-flash-lite",
    "score": 0.811,
    "normalized_score": 0.811,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/",
    "verified_by_llmstats": false,
    "analysis_method": "Multilingual performance",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.546251+00:00",
    "updated_at": "2025-07-19T19:56:13.546251+00:00",
    "benchmark_name": "Global-MMLU-Lite"
  },
  {
    "model_benchmark_id": 267,
    "benchmark_id": "gpqa",
    "model_id": "gemini-2.5-flash-lite",
    "score": 0.646,
    "normalized_score": 0.646,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.612808+00:00",
    "updated_at": "2025-07-19T19:56:11.612808+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 718,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "gemini-2.5-flash-lite",
    "score": 0.051,
    "normalized_score": 0.051,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/",
    "verified_by_llmstats": false,
    "analysis_method": "No tools",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.514286+00:00",
    "updated_at": "2025-07-19T19:56:12.514286+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 1104,
    "benchmark_id": "livecodebench",
    "model_id": "gemini-2.5-flash-lite",
    "score": 0.337,
    "normalized_score": 0.337,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/",
    "verified_by_llmstats": false,
    "analysis_method": "Code generation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.300809+00:00",
    "updated_at": "2025-07-19T19:56:13.300809+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 555,
    "benchmark_id": "mmmu",
    "model_id": "gemini-2.5-flash-lite",
    "score": 0.729,
    "normalized_score": 0.729,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/",
    "verified_by_llmstats": false,
    "analysis_method": "Visual reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.143254+00:00",
    "updated_at": "2025-07-19T19:56:12.143254+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1405,
    "benchmark_id": "mrcr-v2",
    "model_id": "gemini-2.5-flash-lite",
    "score": 0.166,
    "normalized_score": 0.166,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/",
    "verified_by_llmstats": false,
    "analysis_method": "Long context 128k average. 8 needle.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.966057+00:00",
    "updated_at": "2025-07-19T19:56:13.966057+00:00",
    "benchmark_name": "MRCR v2"
  },
  {
    "model_benchmark_id": 227,
    "benchmark_id": "simpleqa",
    "model_id": "gemini-2.5-flash-lite",
    "score": 0.107,
    "normalized_score": 0.107,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/",
    "verified_by_llmstats": false,
    "analysis_method": "Factuality",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.536893+00:00",
    "updated_at": "2025-07-19T19:56:11.536893+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 1339,
    "benchmark_id": "swe-bench-verified",
    "model_id": "gemini-2.5-flash-lite",
    "score": 0.316,
    "normalized_score": 0.316,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/",
    "verified_by_llmstats": false,
    "analysis_method": "Agentic coding single attempt",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.819222+00:00",
    "updated_at": "2025-07-19T19:56:13.819222+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1365,
    "benchmark_id": "vibe-eval",
    "model_id": "gemini-2.5-flash-lite",
    "score": 0.513,
    "normalized_score": 0.513,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/flash-lite/",
    "verified_by_llmstats": false,
    "analysis_method": "Reka",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.875989+00:00",
    "updated_at": "2025-07-19T19:56:13.875989+00:00",
    "benchmark_name": "Vibe-Eval"
  }
]

================================================
FILE: data/organizations/google/models/gemini-2.5-flash-lite/model.json
================================================
{
  "model_id": "gemini-2.5-flash-lite",
  "name": "Gemini 2.5 Flash-Lite",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemini 2.5 Flash-Lite is a model developed by Google DeepMind, designed to handle various tasks including reasoning, science, mathematics, code generation, and more. It features advanced capabilities in multilingual performance and long-context understanding. It is optimized for low-latency use cases, supporting multimodal input with a 1-million-token context length.",
  "release_date": "2025-06-17",
  "announcement_date": "2025-06-17",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2025-01-01",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash-lite",
  "source_playground": "https://ai.google.com/studio",
  "source_paper": "https://arxiv.org/abs/2503.16534",
  "source_scorecard_blog_link": "https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash-lite",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.473471+00:00",
  "updated_at": "2025-07-19T19:49:05.473471+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemini-2.5-pro/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 658,
    "benchmark_id": "aider-polyglot",
    "model_id": "gemini-2.5-pro",
    "score": 0.765,
    "normalized_score": 0.765,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.364634+00:00",
    "updated_at": "2025-07-19T19:56:12.364634+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 1328,
    "benchmark_id": "aider-polyglot-edit",
    "model_id": "gemini-2.5-pro",
    "score": 0.727,
    "normalized_score": 0.727,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Diff",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.793176+00:00",
    "updated_at": "2025-07-19T19:56:13.793176+00:00",
    "benchmark_name": "Aider-Polyglot Edit"
  },
  {
    "model_benchmark_id": 446,
    "benchmark_id": "aime-2024",
    "model_id": "gemini-2.5-pro",
    "score": 0.92,
    "normalized_score": 0.92,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.948567+00:00",
    "updated_at": "2025-07-19T19:56:11.948567+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 679,
    "benchmark_id": "aime-2025",
    "model_id": "gemini-2.5-pro",
    "score": 0.83,
    "normalized_score": 0.83,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.417055+00:00",
    "updated_at": "2025-07-19T19:56:12.417055+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1385,
    "benchmark_id": "arc-agi-v2",
    "model_id": "gemini-2.5-pro",
    "score": 0.049,
    "normalized_score": 0.049,
    "is_self_reported": false,
    "self_reported_source_link": "https://x.com/xai/status/1943158495588815072",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.918991+00:00",
    "updated_at": "2025-07-19T19:56:13.918991+00:00",
    "benchmark_name": "ARC-AGI v2"
  },
  {
    "model_benchmark_id": 1207,
    "benchmark_id": "global-mmlu-lite",
    "model_id": "gemini-2.5-pro",
    "score": 0.886,
    "normalized_score": 0.886,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.540318+00:00",
    "updated_at": "2025-07-19T19:56:13.540318+00:00",
    "benchmark_name": "Global-MMLU-Lite"
  },
  {
    "model_benchmark_id": 263,
    "benchmark_id": "gpqa",
    "model_id": "gemini-2.5-pro",
    "score": 0.83,
    "normalized_score": 0.83,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.605360+00:00",
    "updated_at": "2025-07-19T19:56:11.605360+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 717,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "gemini-2.5-pro",
    "score": 0.178,
    "normalized_score": 0.178,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.511856+00:00",
    "updated_at": "2025-07-19T19:56:12.511856+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 1318,
    "benchmark_id": "livecodebench-v5",
    "model_id": "gemini-2.5-pro",
    "score": 0.756,
    "normalized_score": 0.756,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.763325+00:00",
    "updated_at": "2025-07-19T19:56:13.763325+00:00",
    "benchmark_name": "LiveCodeBench v5"
  },
  {
    "model_benchmark_id": 552,
    "benchmark_id": "mmmu",
    "model_id": "gemini-2.5-pro",
    "score": 0.796,
    "normalized_score": 0.796,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.137517+00:00",
    "updated_at": "2025-07-19T19:56:12.137517+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1372,
    "benchmark_id": "mrcr",
    "model_id": "gemini-2.5-pro",
    "score": 0.93,
    "normalized_score": 0.93,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "128k-average",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.889867+00:00",
    "updated_at": "2025-07-19T19:56:13.889867+00:00",
    "benchmark_name": "MRCR"
  },
  {
    "model_benchmark_id": 1384,
    "benchmark_id": "mrcr-1m-(pointwise)",
    "model_id": "gemini-2.5-pro",
    "score": 0.829,
    "normalized_score": 0.829,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Pointwise",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.915166+00:00",
    "updated_at": "2025-07-19T19:56:13.915166+00:00",
    "benchmark_name": "MRCR 1M (pointwise)"
  },
  {
    "model_benchmark_id": 225,
    "benchmark_id": "simpleqa",
    "model_id": "gemini-2.5-pro",
    "score": 0.508,
    "normalized_score": 0.508,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.532774+00:00",
    "updated_at": "2025-07-19T19:56:11.532774+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 1338,
    "benchmark_id": "swe-bench-verified",
    "model_id": "gemini-2.5-pro",
    "score": 0.632,
    "normalized_score": 0.632,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.816932+00:00",
    "updated_at": "2025-07-19T19:56:13.816932+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1364,
    "benchmark_id": "vibe-eval",
    "model_id": "gemini-2.5-pro",
    "score": 0.656,
    "normalized_score": 0.656,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.874453+00:00",
    "updated_at": "2025-07-19T19:56:13.874453+00:00",
    "benchmark_name": "Vibe-Eval"
  },
  {
    "model_benchmark_id": 1379,
    "benchmark_id": "video-mme",
    "model_id": "gemini-2.5-pro",
    "score": 0.848,
    "normalized_score": 0.848,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini/pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.904547+00:00",
    "updated_at": "2025-07-19T19:56:13.904547+00:00",
    "benchmark_name": "Video-MME"
  }
]

================================================
FILE: data/organizations/google/models/gemini-2.5-pro/model.json
================================================
{
  "model_id": "gemini-2.5-pro",
  "name": "Gemini 2.5 Pro",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Google's most intelligent AI model, built for the agentic era. Gemini 2.5 Pro leads on common benchmarks with enhanced reasoning, multimodal capabilities (text, image, video, audio input), and a 1M token context window.",
  "release_date": "2025-05-20",
  "announcement_date": "2025-05-20",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2025-01-31",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://ai.google.dev/gemini-api/docs/models?hl=en#gemini-2.5-pro-preview-03-25",
  "source_playground": "https://aistudio.google.com/?model=gemini-2.5-pro-preview-03-25",
  "source_paper": "https://storage.googleapis.com/model-cards/documents/gemini-2.5-pro-preview.pdf",
  "source_scorecard_blog_link": "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.458697+00:00",
  "updated_at": "2025-07-19T19:49:05.458697+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemini-2.5-pro-preview-06-05/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 660,
    "benchmark_id": "aider-polyglot",
    "model_id": "gemini-2.5-pro-preview-06-05",
    "score": 0.822,
    "normalized_score": 0.822,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "Diff-fenced",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.368655+00:00",
    "updated_at": "2025-07-19T19:56:12.368655+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 682,
    "benchmark_id": "aime-2025",
    "model_id": "gemini-2.5-pro-preview-06-05",
    "score": 0.88,
    "normalized_score": 0.88,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "Single attempt",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.425843+00:00",
    "updated_at": "2025-07-19T19:56:12.425843+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1090,
    "benchmark_id": "facts-grounding",
    "model_id": "gemini-2.5-pro-preview-06-05",
    "score": 0.878,
    "normalized_score": 0.878,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "Factuality",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.269434+00:00",
    "updated_at": "2025-07-19T19:56:13.269434+00:00",
    "benchmark_name": "FACTS Grounding"
  },
  {
    "model_benchmark_id": 1211,
    "benchmark_id": "global-mmlu-lite",
    "model_id": "gemini-2.5-pro-preview-06-05",
    "score": 0.892,
    "normalized_score": 0.892,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "Multilingual performance",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.548453+00:00",
    "updated_at": "2025-07-19T19:56:13.548453+00:00",
    "benchmark_name": "Global-MMLU-Lite"
  },
  {
    "model_benchmark_id": 269,
    "benchmark_id": "gpqa",
    "model_id": "gemini-2.5-pro-preview-06-05",
    "score": 0.864,
    "normalized_score": 0.864,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "Single attempt Diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.617404+00:00",
    "updated_at": "2025-07-19T19:56:11.617404+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 719,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "gemini-2.5-pro-preview-06-05",
    "score": 0.216,
    "normalized_score": 0.216,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "No tools",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.516239+00:00",
    "updated_at": "2025-07-19T19:56:12.516239+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 1105,
    "benchmark_id": "livecodebench",
    "model_id": "gemini-2.5-pro-preview-06-05",
    "score": 0.69,
    "normalized_score": 0.69,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "Single attempt (1/1/2025-5/1/2025)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.303010+00:00",
    "updated_at": "2025-07-19T19:56:13.303010+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 557,
    "benchmark_id": "mmmu",
    "model_id": "gemini-2.5-pro-preview-06-05",
    "score": 0.82,
    "normalized_score": 0.82,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "Single attempt",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.146880+00:00",
    "updated_at": "2025-07-19T19:56:12.146880+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1422,
    "benchmark_id": "mrcr-v2-(8-needle)",
    "model_id": "gemini-2.5-pro-preview-06-05",
    "score": 0.164,
    "normalized_score": 0.164,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "1M pointwise",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.013534+00:00",
    "updated_at": "2025-07-19T19:56:14.016258+00:00",
    "benchmark_name": "MRCR v2 (8-needle)"
  },
  {
    "model_benchmark_id": 228,
    "benchmark_id": "simpleqa",
    "model_id": "gemini-2.5-pro-preview-06-05",
    "score": 0.54,
    "normalized_score": 0.54,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "Factuality",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.538432+00:00",
    "updated_at": "2025-07-19T19:56:11.538432+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 1340,
    "benchmark_id": "swe-bench-verified",
    "model_id": "gemini-2.5-pro-preview-06-05",
    "score": 0.672,
    "normalized_score": 0.672,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "Multiple attempts",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.820885+00:00",
    "updated_at": "2025-07-19T19:56:13.820885+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1367,
    "benchmark_id": "vibe-eval",
    "model_id": "gemini-2.5-pro-preview-06-05",
    "score": 0.672,
    "normalized_score": 0.672,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "Image understanding",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.879257+00:00",
    "updated_at": "2025-07-19T19:56:13.879257+00:00",
    "benchmark_name": "Vibe-Eval"
  },
  {
    "model_benchmark_id": 1421,
    "benchmark_id": "videommmu",
    "model_id": "gemini-2.5-pro-preview-06-05",
    "score": 0.836,
    "normalized_score": 0.836,
    "is_self_reported": true,
    "self_reported_source_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "Video understanding",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.009959+00:00",
    "updated_at": "2025-07-19T19:56:14.009959+00:00",
    "benchmark_name": "VideoMMMU"
  }
]

================================================
FILE: data/organizations/google/models/gemini-2.5-pro-preview-06-05/model.json
================================================
{
  "model_id": "gemini-2.5-pro-preview-06-05",
  "name": "Gemini 2.5 Pro Preview 06-05",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "The latest preview version of Google's most advanced reasoning Gemini model, capable of solving complex problems. Built for the agentic era with enhanced reasoning capabilities, multimodal understanding (text, image, video, audio), and a 1M token context window. Features thinking preview, code execution, grounding with Google Search, system instructions, function calling, and controlled generation. Supports up to 3,000 images per prompt, 45-60 minutes of video, and 8.4 hours of audio.",
  "release_date": "2025-06-05",
  "announcement_date": "2025-06-05",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2025-01-31",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-pro",
  "source_playground": "https://aistudio.google.com",
  "source_paper": null,
  "source_scorecard_blog_link": "https://blog.google/products/gemini/gemini-2-5-pro-latest-preview/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.493595+00:00",
  "updated_at": "2025-07-19T19:49:05.493595+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemini-diffusion/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 685,
    "benchmark_id": "aime-2025",
    "model_id": "gemini-diffusion",
    "score": 0.233,
    "normalized_score": 0.233,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/",
    "verified_by_llmstats": false,
    "analysis_method": "pass @1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.434861+00:00",
    "updated_at": "2025-07-19T19:56:12.434861+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1100,
    "benchmark_id": "big-bench-extra-hard",
    "model_id": "gemini-diffusion",
    "score": 0.15,
    "normalized_score": 0.15,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/",
    "verified_by_llmstats": false,
    "analysis_method": "pass @1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.291288+00:00",
    "updated_at": "2025-07-19T19:56:13.291288+00:00",
    "benchmark_name": "BIG-Bench Extra Hard"
  },
  {
    "model_benchmark_id": 1433,
    "benchmark_id": "bigcodebench",
    "model_id": "gemini-diffusion",
    "score": 0.454,
    "normalized_score": 0.454,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/",
    "verified_by_llmstats": false,
    "analysis_method": "pass @1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.050987+00:00",
    "updated_at": "2025-07-19T19:56:14.050987+00:00",
    "benchmark_name": "BigCodeBench"
  },
  {
    "model_benchmark_id": 1217,
    "benchmark_id": "global-mmlu-lite",
    "model_id": "gemini-diffusion",
    "score": 0.691,
    "normalized_score": 0.691,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/",
    "verified_by_llmstats": false,
    "analysis_method": "pass @1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.559014+00:00",
    "updated_at": "2025-07-19T19:56:13.559014+00:00",
    "benchmark_name": "Global-MMLU-Lite"
  },
  {
    "model_benchmark_id": 278,
    "benchmark_id": "gpqa",
    "model_id": "gemini-diffusion",
    "score": 0.404,
    "normalized_score": 0.404,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/",
    "verified_by_llmstats": false,
    "analysis_method": "pass @1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.637311+00:00",
    "updated_at": "2025-07-19T19:56:11.637311+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 773,
    "benchmark_id": "humaneval",
    "model_id": "gemini-diffusion",
    "score": 0.896,
    "normalized_score": 0.896,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/",
    "verified_by_llmstats": false,
    "analysis_method": "pass @1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.625233+00:00",
    "updated_at": "2025-07-19T19:56:12.625233+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1435,
    "benchmark_id": "lbpp-(v2)",
    "model_id": "gemini-diffusion",
    "score": 0.568,
    "normalized_score": 0.568,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/",
    "verified_by_llmstats": false,
    "analysis_method": "pass @1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.056060+00:00",
    "updated_at": "2025-07-19T19:56:14.056060+00:00",
    "benchmark_name": "LBPP (v2)"
  },
  {
    "model_benchmark_id": 1110,
    "benchmark_id": "livecodebench",
    "model_id": "gemini-diffusion",
    "score": 0.309,
    "normalized_score": 0.309,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/",
    "verified_by_llmstats": false,
    "analysis_method": "pass @1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.314684+00:00",
    "updated_at": "2025-07-19T19:56:13.314684+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 1175,
    "benchmark_id": "mbpp",
    "model_id": "gemini-diffusion",
    "score": 0.76,
    "normalized_score": 0.76,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/",
    "verified_by_llmstats": false,
    "analysis_method": "pass @1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.475906+00:00",
    "updated_at": "2025-07-19T19:56:13.475906+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1342,
    "benchmark_id": "swe-bench-verified",
    "model_id": "gemini-diffusion",
    "score": 0.229,
    "normalized_score": 0.229,
    "is_self_reported": true,
    "self_reported_source_link": "https://deepmind.google/models/gemini-diffusion/",
    "verified_by_llmstats": false,
    "analysis_method": "pass @1, Non-agentic evaluation (single turn edit only), max prompt length of 32K",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.824708+00:00",
    "updated_at": "2025-07-19T19:56:13.824708+00:00",
    "benchmark_name": "SWE-Bench Verified"
  }
]

================================================
FILE: data/organizations/google/models/gemini-diffusion/model.json
================================================
{
  "model_id": "gemini-diffusion",
  "name": "Gemini Diffusion",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemini Diffusion is a state-of-the-art, experimental text diffusion model from Google DeepMind. It explores a new kind of language model designed to provide users with greater control, creativity, and speed in text generation. Instead of predicting text token-by-token, it learns to generate outputs by refining noise step-by-step, allowing for rapid iteration and error correction during generation. Key capabilities include rapid response times (reportedly 1479 tokens/sec excluding overhead), generation of more coherent text by outputting entire blocks of tokens at once, and iterative refinement for consistent outputs. It excels at tasks like editing, including in math and code contexts.",
  "release_date": "2025-05-20",
  "announcement_date": "2025-05-20",
  "license_id": "proprietary",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://deepmind.google/models/gemini-diffusion/",
  "source_repo_link": "https://github.com/google",
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.534835+00:00",
  "updated_at": "2025-07-19T19:49:05.534835+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemma-2-27b-it/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1408,
    "benchmark_id": "agieval",
    "model_id": "gemma-2-27b-it",
    "score": 0.551,
    "normalized_score": 0.551,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "3-5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.975397+00:00",
    "updated_at": "2025-07-19T19:56:13.975397+00:00",
    "benchmark_name": "AGIEval"
  },
  {
    "model_benchmark_id": 9,
    "benchmark_id": "arc-c",
    "model_id": "gemma-2-27b-it",
    "score": 0.714,
    "normalized_score": 0.714,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "25-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.099650+00:00",
    "updated_at": "2025-07-19T19:56:11.099650+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1055,
    "benchmark_id": "arc-e",
    "model_id": "gemma-2-27b-it",
    "score": 0.886,
    "normalized_score": 0.886,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.203403+00:00",
    "updated_at": "2025-07-19T19:56:13.203403+00:00",
    "benchmark_name": "ARC-E"
  },
  {
    "model_benchmark_id": 1392,
    "benchmark_id": "big-bench",
    "model_id": "gemma-2-27b-it",
    "score": 0.749,
    "normalized_score": 0.749,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot, CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.932992+00:00",
    "updated_at": "2025-07-19T19:56:13.932992+00:00",
    "benchmark_name": "BIG-Bench"
  },
  {
    "model_benchmark_id": 1021,
    "benchmark_id": "boolq",
    "model_id": "gemma-2-27b-it",
    "score": 0.848,
    "normalized_score": 0.848,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.126514+00:00",
    "updated_at": "2025-07-19T19:56:13.126514+00:00",
    "benchmark_name": "BoolQ"
  },
  {
    "model_benchmark_id": 980,
    "benchmark_id": "gsm8k",
    "model_id": "gemma-2-27b-it",
    "score": 0.74,
    "normalized_score": 0.74,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot, maj@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.058102+00:00",
    "updated_at": "2025-07-19T19:56:13.058102+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 38,
    "benchmark_id": "hellaswag",
    "model_id": "gemma-2-27b-it",
    "score": 0.864,
    "normalized_score": 0.864,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.164247+00:00",
    "updated_at": "2025-07-19T19:56:11.164247+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 767,
    "benchmark_id": "humaneval",
    "model_id": "gemma-2-27b-it",
    "score": 0.518,
    "normalized_score": 0.518,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.615384+00:00",
    "updated_at": "2025-07-19T19:56:12.615384+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 382,
    "benchmark_id": "math",
    "model_id": "gemma-2-27b-it",
    "score": 0.423,
    "normalized_score": 0.423,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "4-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.824501+00:00",
    "updated_at": "2025-07-19T19:56:11.824501+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1170,
    "benchmark_id": "mbpp",
    "model_id": "gemma-2-27b-it",
    "score": 0.626,
    "normalized_score": 0.626,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.464425+00:00",
    "updated_at": "2025-07-19T19:56:13.464425+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 68,
    "benchmark_id": "mmlu",
    "model_id": "gemma-2-27b-it",
    "score": 0.752,
    "normalized_score": 0.752,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot, top-1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.228104+00:00",
    "updated_at": "2025-07-19T19:56:11.228104+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1048,
    "benchmark_id": "natural-questions",
    "model_id": "gemma-2-27b-it",
    "score": 0.345,
    "normalized_score": 0.345,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.188220+00:00",
    "updated_at": "2025-07-19T19:56:13.188220+00:00",
    "benchmark_name": "Natural Questions"
  },
  {
    "model_benchmark_id": 1030,
    "benchmark_id": "piqa",
    "model_id": "gemma-2-27b-it",
    "score": 0.832,
    "normalized_score": 0.832,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.145819+00:00",
    "updated_at": "2025-07-19T19:56:13.145819+00:00",
    "benchmark_name": "PIQA"
  },
  {
    "model_benchmark_id": 1039,
    "benchmark_id": "social-iqa",
    "model_id": "gemma-2-27b-it",
    "score": 0.537,
    "normalized_score": 0.537,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.168648+00:00",
    "updated_at": "2025-07-19T19:56:13.168648+00:00",
    "benchmark_name": "Social IQa"
  },
  {
    "model_benchmark_id": 248,
    "benchmark_id": "triviaqa",
    "model_id": "gemma-2-27b-it",
    "score": 0.837,
    "normalized_score": 0.837,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.574247+00:00",
    "updated_at": "2025-07-19T19:56:11.574247+00:00",
    "benchmark_name": "TriviaQA"
  },
  {
    "model_benchmark_id": 1060,
    "benchmark_id": "winogrande",
    "model_id": "gemma-2-27b-it",
    "score": 0.837,
    "normalized_score": 0.837,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.212219+00:00",
    "updated_at": "2025-07-19T19:56:13.212219+00:00",
    "benchmark_name": "Winogrande"
  }
]

================================================
FILE: data/organizations/google/models/gemma-2-27b-it/model.json
================================================
{
  "model_id": "gemma-2-27b-it",
  "name": "Gemma 2 27B",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemma 2 27B IT is an instruction-tuned version of Google's state-of-the-art open language model. Built from the same research and technology as Gemini, it's optimized for dialogue applications through supervised fine-tuning, distillation from larger models, and RLHF. The model excels at text generation tasks including question answering, summarization, and reasoning.",
  "release_date": "2024-06-27",
  "announcement_date": "2024-06-27",
  "license_id": "gemma",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 27200000000,
  "training_tokens": 13000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/google/gemma-2-27b-it",
  "source_playground": "https://huggingface.co/chat/models/google/gemma-2-27b-it",
  "source_paper": "https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf",
  "source_scorecard_blog_link": "https://huggingface.co/blog/gemma2",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/google/gemma-2-27b-it",
  "created_at": "2025-07-19T19:49:05.485572+00:00",
  "updated_at": "2025-07-19T19:49:05.485572+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemma-2-9b-it/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1407,
    "benchmark_id": "agieval",
    "model_id": "gemma-2-9b-it",
    "score": 0.528,
    "normalized_score": 0.528,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "3-5-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.973652+00:00",
    "updated_at": "2025-07-19T19:56:13.973652+00:00",
    "benchmark_name": "AGIEval"
  },
  {
    "model_benchmark_id": 8,
    "benchmark_id": "arc-c",
    "model_id": "gemma-2-9b-it",
    "score": 0.684,
    "normalized_score": 0.684,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "25-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.097779+00:00",
    "updated_at": "2025-07-19T19:56:11.097779+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1054,
    "benchmark_id": "arc-e",
    "model_id": "gemma-2-9b-it",
    "score": 0.88,
    "normalized_score": 0.88,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.201834+00:00",
    "updated_at": "2025-07-19T19:56:13.201834+00:00",
    "benchmark_name": "ARC-E"
  },
  {
    "model_benchmark_id": 1391,
    "benchmark_id": "big-bench",
    "model_id": "gemma-2-9b-it",
    "score": 0.682,
    "normalized_score": 0.682,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.930966+00:00",
    "updated_at": "2025-07-19T19:56:13.930966+00:00",
    "benchmark_name": "BIG-Bench"
  },
  {
    "model_benchmark_id": 1020,
    "benchmark_id": "boolq",
    "model_id": "gemma-2-9b-it",
    "score": 0.842,
    "normalized_score": 0.842,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.124981+00:00",
    "updated_at": "2025-07-19T19:56:13.124981+00:00",
    "benchmark_name": "BoolQ"
  },
  {
    "model_benchmark_id": 978,
    "benchmark_id": "gsm8k",
    "model_id": "gemma-2-9b-it",
    "score": 0.686,
    "normalized_score": 0.686,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot majority@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.053844+00:00",
    "updated_at": "2025-07-19T19:56:13.053844+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 36,
    "benchmark_id": "hellaswag",
    "model_id": "gemma-2-9b-it",
    "score": 0.819,
    "normalized_score": 0.819,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.157090+00:00",
    "updated_at": "2025-07-19T19:56:11.157090+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 765,
    "benchmark_id": "humaneval",
    "model_id": "gemma-2-9b-it",
    "score": 0.402,
    "normalized_score": 0.402,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.611318+00:00",
    "updated_at": "2025-07-19T19:56:12.611318+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 380,
    "benchmark_id": "math",
    "model_id": "gemma-2-9b-it",
    "score": 0.366,
    "normalized_score": 0.366,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "4-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.821125+00:00",
    "updated_at": "2025-07-19T19:56:11.821125+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1169,
    "benchmark_id": "mbpp",
    "model_id": "gemma-2-9b-it",
    "score": 0.524,
    "normalized_score": 0.524,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.462564+00:00",
    "updated_at": "2025-07-19T19:56:13.462564+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 66,
    "benchmark_id": "mmlu",
    "model_id": "gemma-2-9b-it",
    "score": 0.713,
    "normalized_score": 0.713,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.224994+00:00",
    "updated_at": "2025-07-19T19:56:11.224994+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1047,
    "benchmark_id": "natural-questions",
    "model_id": "gemma-2-9b-it",
    "score": 0.292,
    "normalized_score": 0.292,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.186631+00:00",
    "updated_at": "2025-07-19T19:56:13.186631+00:00",
    "benchmark_name": "Natural Questions"
  },
  {
    "model_benchmark_id": 1029,
    "benchmark_id": "piqa",
    "model_id": "gemma-2-9b-it",
    "score": 0.817,
    "normalized_score": 0.817,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.144012+00:00",
    "updated_at": "2025-07-19T19:56:13.144012+00:00",
    "benchmark_name": "PIQA"
  },
  {
    "model_benchmark_id": 1038,
    "benchmark_id": "social-iqa",
    "model_id": "gemma-2-9b-it",
    "score": 0.534,
    "normalized_score": 0.534,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.166311+00:00",
    "updated_at": "2025-07-19T19:56:13.166311+00:00",
    "benchmark_name": "Social IQa"
  },
  {
    "model_benchmark_id": 247,
    "benchmark_id": "triviaqa",
    "model_id": "gemma-2-9b-it",
    "score": 0.766,
    "normalized_score": 0.766,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.572657+00:00",
    "updated_at": "2025-07-19T19:56:11.572657+00:00",
    "benchmark_name": "TriviaQA"
  },
  {
    "model_benchmark_id": 148,
    "benchmark_id": "winogrande",
    "model_id": "gemma-2-9b-it",
    "score": 0.806,
    "normalized_score": 0.806,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/blog/gemma2",
    "verified_by_llmstats": false,
    "analysis_method": "partial score evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.380497+00:00",
    "updated_at": "2025-07-19T19:56:11.380497+00:00",
    "benchmark_name": "Winogrande"
  }
]

================================================
FILE: data/organizations/google/models/gemma-2-9b-it/model.json
================================================
{
  "model_id": "gemma-2-9b-it",
  "name": "Gemma 2 9B",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemma 2 9B IT is an instruction-tuned version of Google's Gemma 2 9B base model. It was trained on 8 trillion tokens of web data, code, and math content. The model features sliding window attention, logit soft-capping, and knowledge distillation techniques. It's optimized for dialogue applications through supervised fine-tuning, distillation, RLHF, and model merging using WARP.",
  "release_date": "2024-06-27",
  "announcement_date": "2024-06-27",
  "license_id": "gemma",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 9240000000,
  "training_tokens": 8000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/google/gemma-2-9b-it",
  "source_playground": "https://huggingface.co/chat/models/google/gemma-2-9b-it",
  "source_paper": "https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf",
  "source_scorecard_blog_link": "https://huggingface.co/blog/gemma2",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/google/gemma-2-9b-it",
  "created_at": "2025-07-19T19:49:05.477806+00:00",
  "updated_at": "2025-07-19T19:49:05.477806+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemma-3-12b-it/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1247,
    "benchmark_id": "ai2d",
    "model_id": "gemma-3-12b-it",
    "score": 0.842,
    "normalized_score": 0.842,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.621225+00:00",
    "updated_at": "2025-07-19T19:56:13.621225+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 1096,
    "benchmark_id": "big-bench-extra-hard",
    "model_id": "gemma-3-12b-it",
    "score": 0.163,
    "normalized_score": 0.163,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.282747+00:00",
    "updated_at": "2025-07-19T19:56:13.282747+00:00",
    "benchmark_name": "BIG-Bench Extra Hard"
  },
  {
    "model_benchmark_id": 1067,
    "benchmark_id": "big-bench-hard",
    "model_id": "gemma-3-12b-it",
    "score": 0.857,
    "normalized_score": 0.857,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.226924+00:00",
    "updated_at": "2025-07-19T19:56:13.226924+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 1147,
    "benchmark_id": "bird-sql-(dev)",
    "model_id": "gemma-3-12b-it",
    "score": 0.479,
    "normalized_score": 0.479,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "- evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.413629+00:00",
    "updated_at": "2025-07-19T19:56:13.413629+00:00",
    "benchmark_name": "Bird-SQL (dev)"
  },
  {
    "model_benchmark_id": 855,
    "benchmark_id": "chartqa",
    "model_id": "gemma-3-12b-it",
    "score": 0.757,
    "normalized_score": 0.757,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.789962+00:00",
    "updated_at": "2025-07-19T19:56:12.789962+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 878,
    "benchmark_id": "docvqa",
    "model_id": "gemma-3-12b-it",
    "score": 0.871,
    "normalized_score": 0.871,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.830839+00:00",
    "updated_at": "2025-07-19T19:56:12.830839+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 1219,
    "benchmark_id": "eclektic",
    "model_id": "gemma-3-12b-it",
    "score": 0.103,
    "normalized_score": 0.103,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.563615+00:00",
    "updated_at": "2025-07-19T19:56:13.563615+00:00",
    "benchmark_name": "ECLeKTic"
  },
  {
    "model_benchmark_id": 1087,
    "benchmark_id": "facts-grounding",
    "model_id": "gemma-3-12b-it",
    "score": 0.758,
    "normalized_score": 0.758,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "- evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.262640+00:00",
    "updated_at": "2025-07-19T19:56:13.262640+00:00",
    "benchmark_name": "FACTS Grounding"
  },
  {
    "model_benchmark_id": 1205,
    "benchmark_id": "global-mmlu-lite",
    "model_id": "gemma-3-12b-it",
    "score": 0.695,
    "normalized_score": 0.695,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.537058+00:00",
    "updated_at": "2025-07-19T19:56:13.537058+00:00",
    "benchmark_name": "Global-MMLU-Lite"
  },
  {
    "model_benchmark_id": 261,
    "benchmark_id": "gpqa",
    "model_id": "gemma-3-12b-it",
    "score": 0.409,
    "normalized_score": 0.409,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.600334+00:00",
    "updated_at": "2025-07-19T19:56:11.600334+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 977,
    "benchmark_id": "gsm8k",
    "model_id": "gemma-3-12b-it",
    "score": 0.944,
    "normalized_score": 0.944,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.052379+00:00",
    "updated_at": "2025-07-19T19:56:13.052379+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 1153,
    "benchmark_id": "hiddenmath",
    "model_id": "gemma-3-12b-it",
    "score": 0.545,
    "normalized_score": 0.545,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.427708+00:00",
    "updated_at": "2025-07-19T19:56:13.427708+00:00",
    "benchmark_name": "HiddenMath"
  },
  {
    "model_benchmark_id": 762,
    "benchmark_id": "humaneval",
    "model_id": "gemma-3-12b-it",
    "score": 0.854,
    "normalized_score": 0.854,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.606840+00:00",
    "updated_at": "2025-07-19T19:56:12.606840+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 607,
    "benchmark_id": "ifeval",
    "model_id": "gemma-3-12b-it",
    "score": 0.889,
    "normalized_score": 0.889,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.254325+00:00",
    "updated_at": "2025-07-19T19:56:12.254325+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1238,
    "benchmark_id": "infovqa",
    "model_id": "gemma-3-12b-it",
    "score": 0.649,
    "normalized_score": 0.649,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.604072+00:00",
    "updated_at": "2025-07-19T19:56:13.604072+00:00",
    "benchmark_name": "InfoVQA"
  },
  {
    "model_benchmark_id": 1101,
    "benchmark_id": "livecodebench",
    "model_id": "gemma-3-12b-it",
    "score": 0.246,
    "normalized_score": 0.246,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.294686+00:00",
    "updated_at": "2025-07-19T19:56:13.294686+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 377,
    "benchmark_id": "math",
    "model_id": "gemma-3-12b-it",
    "score": 0.838,
    "normalized_score": 0.838,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.815597+00:00",
    "updated_at": "2025-07-19T19:56:11.815597+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1266,
    "benchmark_id": "mathvista-mini",
    "model_id": "gemma-3-12b-it",
    "score": 0.629,
    "normalized_score": 0.629,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.657019+00:00",
    "updated_at": "2025-07-19T19:56:13.657019+00:00",
    "benchmark_name": "MathVista-Mini"
  },
  {
    "model_benchmark_id": 1166,
    "benchmark_id": "mbpp",
    "model_id": "gemma-3-12b-it",
    "score": 0.73,
    "normalized_score": 0.73,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.456223+00:00",
    "updated_at": "2025-07-19T19:56:13.456223+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 163,
    "benchmark_id": "mmlu-pro",
    "model_id": "gemma-3-12b-it",
    "score": 0.606,
    "normalized_score": 0.606,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.415028+00:00",
    "updated_at": "2025-07-19T19:56:11.415028+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1235,
    "benchmark_id": "mmmu-(val)",
    "model_id": "gemma-3-12b-it",
    "score": 0.596,
    "normalized_score": 0.596,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.595790+00:00",
    "updated_at": "2025-07-19T19:56:13.595790+00:00",
    "benchmark_name": "MMMU (val)"
  },
  {
    "model_benchmark_id": 1197,
    "benchmark_id": "natural2code",
    "model_id": "gemma-3-12b-it",
    "score": 0.807,
    "normalized_score": 0.807,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.521277+00:00",
    "updated_at": "2025-07-19T19:56:13.521277+00:00",
    "benchmark_name": "Natural2Code"
  },
  {
    "model_benchmark_id": 224,
    "benchmark_id": "simpleqa",
    "model_id": "gemma-3-12b-it",
    "score": 0.063,
    "normalized_score": 0.063,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.528858+00:00",
    "updated_at": "2025-07-19T19:56:11.528858+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 903,
    "benchmark_id": "textvqa",
    "model_id": "gemma-3-12b-it",
    "score": 0.677,
    "normalized_score": 0.677,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.882990+00:00",
    "updated_at": "2025-07-19T19:56:12.882990+00:00",
    "benchmark_name": "TextVQA"
  },
  {
    "model_benchmark_id": 1263,
    "benchmark_id": "vqav2-(val)",
    "model_id": "gemma-3-12b-it",
    "score": 0.716,
    "normalized_score": 0.716,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.650557+00:00",
    "updated_at": "2025-07-19T19:56:13.650557+00:00",
    "benchmark_name": "VQAv2 (val)"
  },
  {
    "model_benchmark_id": 1227,
    "benchmark_id": "wmt24++",
    "model_id": "gemma-3-12b-it",
    "score": 0.516,
    "normalized_score": 0.516,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.578915+00:00",
    "updated_at": "2025-07-19T19:56:13.578915+00:00",
    "benchmark_name": "WMT24++"
  }
]

================================================
FILE: data/organizations/google/models/gemma-3-12b-it/model.json
================================================
{
  "model_id": "gemma-3-12b-it",
  "name": "Gemma 3 12B",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemma 3 12B is a 12-billion-parameter vision-language model from Google, handling text and image input and generating text output. It features a 128K context window, multilingual support, and open weights. Suitable for question answering, summarization, reasoning, and image understanding tasks.",
  "release_date": "2025-03-12",
  "announcement_date": "2025-03-12",
  "license_id": "gemma",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 12000000000,
  "training_tokens": 12000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": null,
  "source_paper": "https://storage.googleapis.com/deepmind-media/gemma/Gemma3Report.pdf",
  "source_scorecard_blog_link": "https://huggingface.co/blog/gemma3",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/google/gemma-3-12b-it",
  "created_at": "2025-07-19T19:49:05.444134+00:00",
  "updated_at": "2025-07-19T19:49:05.444134+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemma-3-1b-it/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1099,
    "benchmark_id": "big-bench-extra-hard",
    "model_id": "gemma-3-1b-it",
    "score": 0.072,
    "normalized_score": 0.072,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.289054+00:00",
    "updated_at": "2025-07-19T19:56:13.289054+00:00",
    "benchmark_name": "BIG-Bench Extra Hard"
  },
  {
    "model_benchmark_id": 1075,
    "benchmark_id": "big-bench-hard",
    "model_id": "gemma-3-1b-it",
    "score": 0.391,
    "normalized_score": 0.391,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.240587+00:00",
    "updated_at": "2025-07-19T19:56:13.240587+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 1151,
    "benchmark_id": "bird-sql-(dev)",
    "model_id": "gemma-3-1b-it",
    "score": 0.064,
    "normalized_score": 0.064,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "- evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.421336+00:00",
    "updated_at": "2025-07-19T19:56:13.421336+00:00",
    "benchmark_name": "Bird-SQL (dev)"
  },
  {
    "model_benchmark_id": 1225,
    "benchmark_id": "eclektic",
    "model_id": "gemma-3-1b-it",
    "score": 0.014,
    "normalized_score": 0.014,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.574307+00:00",
    "updated_at": "2025-07-19T19:56:13.574307+00:00",
    "benchmark_name": "ECLeKTic"
  },
  {
    "model_benchmark_id": 1094,
    "benchmark_id": "facts-grounding",
    "model_id": "gemma-3-1b-it",
    "score": 0.364,
    "normalized_score": 0.364,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "- evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.276605+00:00",
    "updated_at": "2025-07-19T19:56:13.276605+00:00",
    "benchmark_name": "FACTS Grounding"
  },
  {
    "model_benchmark_id": 1216,
    "benchmark_id": "global-mmlu-lite",
    "model_id": "gemma-3-1b-it",
    "score": 0.342,
    "normalized_score": 0.342,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.557306+00:00",
    "updated_at": "2025-07-19T19:56:13.557306+00:00",
    "benchmark_name": "Global-MMLU-Lite"
  },
  {
    "model_benchmark_id": 276,
    "benchmark_id": "gpqa",
    "model_id": "gemma-3-1b-it",
    "score": 0.192,
    "normalized_score": 0.192,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.633668+00:00",
    "updated_at": "2025-07-19T19:56:11.633668+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 984,
    "benchmark_id": "gsm8k",
    "model_id": "gemma-3-1b-it",
    "score": 0.628,
    "normalized_score": 0.628,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.064705+00:00",
    "updated_at": "2025-07-19T19:56:13.064705+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 1162,
    "benchmark_id": "hiddenmath",
    "model_id": "gemma-3-1b-it",
    "score": 0.158,
    "normalized_score": 0.158,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.445125+00:00",
    "updated_at": "2025-07-19T19:56:13.445125+00:00",
    "benchmark_name": "HiddenMath"
  },
  {
    "model_benchmark_id": 772,
    "benchmark_id": "humaneval",
    "model_id": "gemma-3-1b-it",
    "score": 0.415,
    "normalized_score": 0.415,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.623656+00:00",
    "updated_at": "2025-07-19T19:56:12.623656+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 610,
    "benchmark_id": "ifeval",
    "model_id": "gemma-3-1b-it",
    "score": 0.802,
    "normalized_score": 0.802,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.260062+00:00",
    "updated_at": "2025-07-19T19:56:12.260062+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1109,
    "benchmark_id": "livecodebench",
    "model_id": "gemma-3-1b-it",
    "score": 0.019,
    "normalized_score": 0.019,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.311408+00:00",
    "updated_at": "2025-07-19T19:56:13.311408+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 386,
    "benchmark_id": "math",
    "model_id": "gemma-3-1b-it",
    "score": 0.48,
    "normalized_score": 0.48,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.832121+00:00",
    "updated_at": "2025-07-19T19:56:11.832121+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1174,
    "benchmark_id": "mbpp",
    "model_id": "gemma-3-1b-it",
    "score": 0.352,
    "normalized_score": 0.352,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.474036+00:00",
    "updated_at": "2025-07-19T19:56:13.474036+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 172,
    "benchmark_id": "mmlu-pro",
    "model_id": "gemma-3-1b-it",
    "score": 0.147,
    "normalized_score": 0.147,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.434242+00:00",
    "updated_at": "2025-07-19T19:56:11.434242+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1202,
    "benchmark_id": "natural2code",
    "model_id": "gemma-3-1b-it",
    "score": 0.56,
    "normalized_score": 0.56,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.529701+00:00",
    "updated_at": "2025-07-19T19:56:13.529701+00:00",
    "benchmark_name": "Natural2Code"
  },
  {
    "model_benchmark_id": 232,
    "benchmark_id": "simpleqa",
    "model_id": "gemma-3-1b-it",
    "score": 0.022,
    "normalized_score": 0.022,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.544931+00:00",
    "updated_at": "2025-07-19T19:56:11.544931+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 1233,
    "benchmark_id": "wmt24++",
    "model_id": "gemma-3-1b-it",
    "score": 0.359,
    "normalized_score": 0.359,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.590063+00:00",
    "updated_at": "2025-07-19T19:56:13.590063+00:00",
    "benchmark_name": "WMT24++"
  }
]

================================================
FILE: data/organizations/google/models/gemma-3-1b-it/model.json
================================================
{
  "model_id": "gemma-3-1b-it",
  "name": "Gemma 3 1B",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "The Gemma 3 1B model is a lightweight, 1-billion-parameter language model by Google, optimized for efficiency on resource-limited devices. At 529MB, it processes text at 2,585 tokens/second with a context window of 128,000 tokens. It supports 35+ languages but handles text-only input, unlike larger multimodal Gemma models. This balance of speed and efficiency makes it ideal for fast text processing on mobile and low-power devices.",
  "release_date": "2025-03-12",
  "announcement_date": "2025-03-12",
  "license_id": "gemma",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 1000000000,
  "training_tokens": 2000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/google/gemma-3-1b-it",
  "source_playground": "https://huggingface.co/chat/models/google/gemma-3-1b-it",
  "source_paper": "https://storage.googleapis.com/deepmind-media/gemma/Gemma3Report.pdf",
  "source_scorecard_blog_link": "https://huggingface.co/blog/gemma3",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/google/gemma-3-1b-it",
  "created_at": "2025-07-19T19:49:05.527185+00:00",
  "updated_at": "2025-07-19T19:49:05.527185+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemma-3-27b-it/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1249,
    "benchmark_id": "ai2d",
    "model_id": "gemma-3-27b-it",
    "score": 0.845,
    "normalized_score": 0.845,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.624921+00:00",
    "updated_at": "2025-07-19T19:56:13.624921+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 1098,
    "benchmark_id": "big-bench-extra-hard",
    "model_id": "gemma-3-27b-it",
    "score": 0.193,
    "normalized_score": 0.193,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.286991+00:00",
    "updated_at": "2025-07-19T19:56:13.286991+00:00",
    "benchmark_name": "BIG-Bench Extra Hard"
  },
  {
    "model_benchmark_id": 1074,
    "benchmark_id": "big-bench-hard",
    "model_id": "gemma-3-27b-it",
    "score": 0.876,
    "normalized_score": 0.876,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.238868+00:00",
    "updated_at": "2025-07-19T19:56:13.238868+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 1150,
    "benchmark_id": "bird-sql-(dev)",
    "model_id": "gemma-3-27b-it",
    "score": 0.544,
    "normalized_score": 0.544,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "- evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.418526+00:00",
    "updated_at": "2025-07-19T19:56:13.418526+00:00",
    "benchmark_name": "Bird-SQL (dev)"
  },
  {
    "model_benchmark_id": 857,
    "benchmark_id": "chartqa",
    "model_id": "gemma-3-27b-it",
    "score": 0.78,
    "normalized_score": 0.78,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.793657+00:00",
    "updated_at": "2025-07-19T19:56:12.793657+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 880,
    "benchmark_id": "docvqa",
    "model_id": "gemma-3-27b-it",
    "score": 0.866,
    "normalized_score": 0.866,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.834284+00:00",
    "updated_at": "2025-07-19T19:56:12.834284+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 1224,
    "benchmark_id": "eclektic",
    "model_id": "gemma-3-27b-it",
    "score": 0.167,
    "normalized_score": 0.167,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.572334+00:00",
    "updated_at": "2025-07-19T19:56:13.572334+00:00",
    "benchmark_name": "ECLeKTic"
  },
  {
    "model_benchmark_id": 1093,
    "benchmark_id": "facts-grounding",
    "model_id": "gemma-3-27b-it",
    "score": 0.749,
    "normalized_score": 0.749,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "- evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.275050+00:00",
    "updated_at": "2025-07-19T19:56:13.275050+00:00",
    "benchmark_name": "FACTS Grounding"
  },
  {
    "model_benchmark_id": 1215,
    "benchmark_id": "global-mmlu-lite",
    "model_id": "gemma-3-27b-it",
    "score": 0.751,
    "normalized_score": 0.751,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.555532+00:00",
    "updated_at": "2025-07-19T19:56:13.555532+00:00",
    "benchmark_name": "Global-MMLU-Lite"
  },
  {
    "model_benchmark_id": 275,
    "benchmark_id": "gpqa",
    "model_id": "gemma-3-27b-it",
    "score": 0.424,
    "normalized_score": 0.424,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.628803+00:00",
    "updated_at": "2025-07-19T19:56:11.628803+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 983,
    "benchmark_id": "gsm8k",
    "model_id": "gemma-3-27b-it",
    "score": 0.959,
    "normalized_score": 0.959,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.063038+00:00",
    "updated_at": "2025-07-19T19:56:13.063038+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 1161,
    "benchmark_id": "hiddenmath",
    "model_id": "gemma-3-27b-it",
    "score": 0.603,
    "normalized_score": 0.603,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.443231+00:00",
    "updated_at": "2025-07-19T19:56:13.443231+00:00",
    "benchmark_name": "HiddenMath"
  },
  {
    "model_benchmark_id": 771,
    "benchmark_id": "humaneval",
    "model_id": "gemma-3-27b-it",
    "score": 0.878,
    "normalized_score": 0.878,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.621954+00:00",
    "updated_at": "2025-07-19T19:56:12.621954+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 609,
    "benchmark_id": "ifeval",
    "model_id": "gemma-3-27b-it",
    "score": 0.904,
    "normalized_score": 0.904,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.258406+00:00",
    "updated_at": "2025-07-19T19:56:12.258406+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1240,
    "benchmark_id": "infovqa",
    "model_id": "gemma-3-27b-it",
    "score": 0.706,
    "normalized_score": 0.706,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.607541+00:00",
    "updated_at": "2025-07-19T19:56:13.607541+00:00",
    "benchmark_name": "InfoVQA"
  },
  {
    "model_benchmark_id": 1108,
    "benchmark_id": "livecodebench",
    "model_id": "gemma-3-27b-it",
    "score": 0.297,
    "normalized_score": 0.297,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.308517+00:00",
    "updated_at": "2025-07-19T19:56:13.308517+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 385,
    "benchmark_id": "math",
    "model_id": "gemma-3-27b-it",
    "score": 0.89,
    "normalized_score": 0.89,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.830123+00:00",
    "updated_at": "2025-07-19T19:56:11.830123+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1268,
    "benchmark_id": "mathvista-mini",
    "model_id": "gemma-3-27b-it",
    "score": 0.676,
    "normalized_score": 0.676,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.660624+00:00",
    "updated_at": "2025-07-19T19:56:13.660624+00:00",
    "benchmark_name": "MathVista-Mini"
  },
  {
    "model_benchmark_id": 1173,
    "benchmark_id": "mbpp",
    "model_id": "gemma-3-27b-it",
    "score": 0.744,
    "normalized_score": 0.744,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.472259+00:00",
    "updated_at": "2025-07-19T19:56:13.472259+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 171,
    "benchmark_id": "mmlu-pro",
    "model_id": "gemma-3-27b-it",
    "score": 0.675,
    "normalized_score": 0.675,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.432013+00:00",
    "updated_at": "2025-07-19T19:56:11.432013+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1237,
    "benchmark_id": "mmmu-(val)",
    "model_id": "gemma-3-27b-it",
    "score": 0.649,
    "normalized_score": 0.649,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.599826+00:00",
    "updated_at": "2025-07-19T19:56:13.599826+00:00",
    "benchmark_name": "MMMU (val)"
  },
  {
    "model_benchmark_id": 1201,
    "benchmark_id": "natural2code",
    "model_id": "gemma-3-27b-it",
    "score": 0.845,
    "normalized_score": 0.845,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.528235+00:00",
    "updated_at": "2025-07-19T19:56:13.528235+00:00",
    "benchmark_name": "Natural2Code"
  },
  {
    "model_benchmark_id": 231,
    "benchmark_id": "simpleqa",
    "model_id": "gemma-3-27b-it",
    "score": 0.1,
    "normalized_score": 0.1,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.543428+00:00",
    "updated_at": "2025-07-19T19:56:11.543428+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 905,
    "benchmark_id": "textvqa",
    "model_id": "gemma-3-27b-it",
    "score": 0.651,
    "normalized_score": 0.651,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.886992+00:00",
    "updated_at": "2025-07-19T19:56:12.886992+00:00",
    "benchmark_name": "TextVQA"
  },
  {
    "model_benchmark_id": 1265,
    "benchmark_id": "vqav2-(val)",
    "model_id": "gemma-3-27b-it",
    "score": 0.71,
    "normalized_score": 0.71,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.653584+00:00",
    "updated_at": "2025-07-19T19:56:13.653584+00:00",
    "benchmark_name": "VQAv2 (val)"
  },
  {
    "model_benchmark_id": 1232,
    "benchmark_id": "wmt24++",
    "model_id": "gemma-3-27b-it",
    "score": 0.534,
    "normalized_score": 0.534,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.587542+00:00",
    "updated_at": "2025-07-19T19:56:13.587542+00:00",
    "benchmark_name": "WMT24++"
  }
]

================================================
FILE: data/organizations/google/models/gemma-3-27b-it/model.json
================================================
{
  "model_id": "gemma-3-27b-it",
  "name": "Gemma 3 27B",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemma 3 27B is a 27-billion-parameter vision-language model from Google, handling text and image input and generating text output. It features a 128K context window, multilingual support, and open weights. Suitable for complex question answering, summarization, reasoning, and image understanding tasks.",
  "release_date": "2025-03-12",
  "announcement_date": "2025-03-12",
  "license_id": "gemma",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 27000000000,
  "training_tokens": 14000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": null,
  "source_paper": "https://storage.googleapis.com/deepmind-media/gemma/Gemma3Report.pdf",
  "source_scorecard_blog_link": "https://huggingface.co/blog/gemma3",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/google/gemma-3-27b-it",
  "created_at": "2025-07-19T19:49:05.523800+00:00",
  "updated_at": "2025-07-19T19:49:05.523800+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemma-3-4b-it/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1248,
    "benchmark_id": "ai2d",
    "model_id": "gemma-3-4b-it",
    "score": 0.748,
    "normalized_score": 0.748,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.622871+00:00",
    "updated_at": "2025-07-19T19:56:13.622871+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 1097,
    "benchmark_id": "big-bench-extra-hard",
    "model_id": "gemma-3-4b-it",
    "score": 0.11,
    "normalized_score": 0.11,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.285056+00:00",
    "updated_at": "2025-07-19T19:56:13.285056+00:00",
    "benchmark_name": "BIG-Bench Extra Hard"
  },
  {
    "model_benchmark_id": 1073,
    "benchmark_id": "big-bench-hard",
    "model_id": "gemma-3-4b-it",
    "score": 0.722,
    "normalized_score": 0.722,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.237255+00:00",
    "updated_at": "2025-07-19T19:56:13.237255+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 1149,
    "benchmark_id": "bird-sql-(dev)",
    "model_id": "gemma-3-4b-it",
    "score": 0.363,
    "normalized_score": 0.363,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "- evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.417046+00:00",
    "updated_at": "2025-07-19T19:56:13.417046+00:00",
    "benchmark_name": "Bird-SQL (dev)"
  },
  {
    "model_benchmark_id": 856,
    "benchmark_id": "chartqa",
    "model_id": "gemma-3-4b-it",
    "score": 0.688,
    "normalized_score": 0.688,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.791952+00:00",
    "updated_at": "2025-07-19T19:56:12.791952+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 879,
    "benchmark_id": "docvqa",
    "model_id": "gemma-3-4b-it",
    "score": 0.758,
    "normalized_score": 0.758,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.832468+00:00",
    "updated_at": "2025-07-19T19:56:12.832468+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 1223,
    "benchmark_id": "eclektic",
    "model_id": "gemma-3-4b-it",
    "score": 0.046,
    "normalized_score": 0.046,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.570776+00:00",
    "updated_at": "2025-07-19T19:56:13.570776+00:00",
    "benchmark_name": "ECLeKTic"
  },
  {
    "model_benchmark_id": 1092,
    "benchmark_id": "facts-grounding",
    "model_id": "gemma-3-4b-it",
    "score": 0.701,
    "normalized_score": 0.701,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "- evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.273464+00:00",
    "updated_at": "2025-07-19T19:56:13.273464+00:00",
    "benchmark_name": "FACTS Grounding"
  },
  {
    "model_benchmark_id": 1214,
    "benchmark_id": "global-mmlu-lite",
    "model_id": "gemma-3-4b-it",
    "score": 0.545,
    "normalized_score": 0.545,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.553690+00:00",
    "updated_at": "2025-07-19T19:56:13.553690+00:00",
    "benchmark_name": "Global-MMLU-Lite"
  },
  {
    "model_benchmark_id": 274,
    "benchmark_id": "gpqa",
    "model_id": "gemma-3-4b-it",
    "score": 0.308,
    "normalized_score": 0.308,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.625675+00:00",
    "updated_at": "2025-07-19T19:56:11.625675+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 982,
    "benchmark_id": "gsm8k",
    "model_id": "gemma-3-4b-it",
    "score": 0.892,
    "normalized_score": 0.892,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.061601+00:00",
    "updated_at": "2025-07-19T19:56:13.061601+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 1160,
    "benchmark_id": "hiddenmath",
    "model_id": "gemma-3-4b-it",
    "score": 0.43,
    "normalized_score": 0.43,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.440350+00:00",
    "updated_at": "2025-07-19T19:56:13.440350+00:00",
    "benchmark_name": "HiddenMath"
  },
  {
    "model_benchmark_id": 770,
    "benchmark_id": "humaneval",
    "model_id": "gemma-3-4b-it",
    "score": 0.713,
    "normalized_score": 0.713,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.620468+00:00",
    "updated_at": "2025-07-19T19:56:12.620468+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 608,
    "benchmark_id": "ifeval",
    "model_id": "gemma-3-4b-it",
    "score": 0.902,
    "normalized_score": 0.902,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.256346+00:00",
    "updated_at": "2025-07-19T19:56:12.256346+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1239,
    "benchmark_id": "infovqa",
    "model_id": "gemma-3-4b-it",
    "score": 0.5,
    "normalized_score": 0.5,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.605648+00:00",
    "updated_at": "2025-07-19T19:56:13.605648+00:00",
    "benchmark_name": "InfoVQA"
  },
  {
    "model_benchmark_id": 1107,
    "benchmark_id": "livecodebench",
    "model_id": "gemma-3-4b-it",
    "score": 0.126,
    "normalized_score": 0.126,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.306674+00:00",
    "updated_at": "2025-07-19T19:56:13.306674+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 384,
    "benchmark_id": "math",
    "model_id": "gemma-3-4b-it",
    "score": 0.756,
    "normalized_score": 0.756,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.828322+00:00",
    "updated_at": "2025-07-19T19:56:11.828322+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1267,
    "benchmark_id": "mathvista-mini",
    "model_id": "gemma-3-4b-it",
    "score": 0.5,
    "normalized_score": 0.5,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.659077+00:00",
    "updated_at": "2025-07-19T19:56:13.659077+00:00",
    "benchmark_name": "MathVista-Mini"
  },
  {
    "model_benchmark_id": 1172,
    "benchmark_id": "mbpp",
    "model_id": "gemma-3-4b-it",
    "score": 0.632,
    "normalized_score": 0.632,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.469983+00:00",
    "updated_at": "2025-07-19T19:56:13.469983+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 170,
    "benchmark_id": "mmlu-pro",
    "model_id": "gemma-3-4b-it",
    "score": 0.436,
    "normalized_score": 0.436,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.430343+00:00",
    "updated_at": "2025-07-19T19:56:11.430343+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1236,
    "benchmark_id": "mmmu-(val)",
    "model_id": "gemma-3-4b-it",
    "score": 0.488,
    "normalized_score": 0.488,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.597769+00:00",
    "updated_at": "2025-07-19T19:56:13.597769+00:00",
    "benchmark_name": "MMMU (val)"
  },
  {
    "model_benchmark_id": 1200,
    "benchmark_id": "natural2code",
    "model_id": "gemma-3-4b-it",
    "score": 0.703,
    "normalized_score": 0.703,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.526663+00:00",
    "updated_at": "2025-07-19T19:56:13.526663+00:00",
    "benchmark_name": "Natural2Code"
  },
  {
    "model_benchmark_id": 230,
    "benchmark_id": "simpleqa",
    "model_id": "gemma-3-4b-it",
    "score": 0.04,
    "normalized_score": 0.04,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.542000+00:00",
    "updated_at": "2025-07-19T19:56:11.542000+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 904,
    "benchmark_id": "textvqa",
    "model_id": "gemma-3-4b-it",
    "score": 0.578,
    "normalized_score": 0.578,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.885190+00:00",
    "updated_at": "2025-07-19T19:56:12.885190+00:00",
    "benchmark_name": "TextVQA"
  },
  {
    "model_benchmark_id": 1264,
    "benchmark_id": "vqav2-(val)",
    "model_id": "gemma-3-4b-it",
    "score": 0.624,
    "normalized_score": 0.624,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "multimodal evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.652122+00:00",
    "updated_at": "2025-07-19T19:56:13.652122+00:00",
    "benchmark_name": "VQAv2 (val)"
  },
  {
    "model_benchmark_id": 1231,
    "benchmark_id": "wmt24++",
    "model_id": "gemma-3-4b-it",
    "score": 0.468,
    "normalized_score": 0.468,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.google.dev/gemma/docs/core/model_card_3",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.586157+00:00",
    "updated_at": "2025-07-19T19:56:13.586157+00:00",
    "benchmark_name": "WMT24++"
  }
]

================================================
FILE: data/organizations/google/models/gemma-3-4b-it/model.json
================================================
{
  "model_id": "gemma-3-4b-it",
  "name": "Gemma 3 4B",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemma 3 4B is a 4-billion-parameter vision-language model from Google, handling text and image input and generating text output. It features a 128K context window, multilingual support, and open weights. Suitable for question answering, summarization, reasoning, and image understanding tasks.",
  "release_date": "2025-03-12",
  "announcement_date": "2025-03-12",
  "license_id": "gemma",
  "multimodal": true,
  "knowledge_cutoff": "2024-08-01",
  "param_count": 4000000000,
  "training_tokens": 4000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": null,
  "source_paper": "https://storage.googleapis.com/deepmind-media/gemma/Gemma3Report.pdf",
  "source_scorecard_blog_link": "https://huggingface.co/blog/gemma3",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/google/gemma-3-4b-it",
  "created_at": "2025-07-19T19:49:05.520515+00:00",
  "updated_at": "2025-07-19T19:49:05.520515+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemma-3n-e2b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 10,
    "benchmark_id": "arc-c",
    "model_id": "gemma-3n-e2b",
    "score": 0.517,
    "normalized_score": 0.517,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B",
    "verified_by_llmstats": false,
    "analysis_method": "25-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.102376+00:00",
    "updated_at": "2025-07-19T19:56:11.102376+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1056,
    "benchmark_id": "arc-e",
    "model_id": "gemma-3n-e2b",
    "score": 0.758,
    "normalized_score": 0.758,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.204955+00:00",
    "updated_at": "2025-07-19T19:56:13.204955+00:00",
    "benchmark_name": "ARC-E"
  },
  {
    "model_benchmark_id": 1071,
    "benchmark_id": "big-bench-hard",
    "model_id": "gemma-3n-e2b",
    "score": 0.443,
    "normalized_score": 0.443,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B",
    "verified_by_llmstats": false,
    "analysis_method": "few-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.233872+00:00",
    "updated_at": "2025-07-19T19:56:13.233872+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 1022,
    "benchmark_id": "boolq",
    "model_id": "gemma-3n-e2b",
    "score": 0.764,
    "normalized_score": 0.764,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.127882+00:00",
    "updated_at": "2025-07-19T19:56:13.127882+00:00",
    "benchmark_name": "BoolQ"
  },
  {
    "model_benchmark_id": 946,
    "benchmark_id": "drop",
    "model_id": "gemma-3n-e2b",
    "score": 0.539,
    "normalized_score": 0.539,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B",
    "verified_by_llmstats": false,
    "analysis_method": "Token F1 score. 1-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.996776+00:00",
    "updated_at": "2025-07-19T19:56:12.996776+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 39,
    "benchmark_id": "hellaswag",
    "model_id": "gemma-3n-e2b",
    "score": 0.722,
    "normalized_score": 0.722,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.166470+00:00",
    "updated_at": "2025-07-19T19:56:11.166470+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 1049,
    "benchmark_id": "natural-questions",
    "model_id": "gemma-3n-e2b",
    "score": 0.155,
    "normalized_score": 0.155,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.190039+00:00",
    "updated_at": "2025-07-19T19:56:13.190039+00:00",
    "benchmark_name": "Natural Questions"
  },
  {
    "model_benchmark_id": 1031,
    "benchmark_id": "piqa",
    "model_id": "gemma-3n-e2b",
    "score": 0.789,
    "normalized_score": 0.789,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.147878+00:00",
    "updated_at": "2025-07-19T19:56:13.147878+00:00",
    "benchmark_name": "PIQA"
  },
  {
    "model_benchmark_id": 1040,
    "benchmark_id": "social-iqa",
    "model_id": "gemma-3n-e2b",
    "score": 0.488,
    "normalized_score": 0.488,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.170669+00:00",
    "updated_at": "2025-07-19T19:56:13.170669+00:00",
    "benchmark_name": "Social IQa"
  },
  {
    "model_benchmark_id": 249,
    "benchmark_id": "triviaqa",
    "model_id": "gemma-3n-e2b",
    "score": 0.608,
    "normalized_score": 0.608,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.576196+00:00",
    "updated_at": "2025-07-19T19:56:11.576196+00:00",
    "benchmark_name": "TriviaQA"
  },
  {
    "model_benchmark_id": 1061,
    "benchmark_id": "winogrande",
    "model_id": "gemma-3n-e2b",
    "score": 0.668,
    "normalized_score": 0.668,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.213740+00:00",
    "updated_at": "2025-07-19T19:56:13.213740+00:00",
    "benchmark_name": "Winogrande"
  }
]

================================================
FILE: data/organizations/google/models/gemma-3n-e2b/model.json
================================================
{
  "model_id": "gemma-3n-e2b",
  "name": "Gemma 3n E2B",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemma 3n is a multimodal model designed to run locally on hardware, supporting image, text, audio, and video inputs. It features a language decoder, audio encoder, and vision encoder, and is available in two sizes: E2B and E4B. The model is optimized for memory efficiency, allowing it to run on devices with limited GPU RAM. Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. Gemma models are well-suited for a variety of content understanding tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as laptops, desktops or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone. Gemma 3n models are designed for efficient execution on low-resource devices. They are capable of multimodal input, handling text, image, video, and audio input, and generating text outputs, with open weights for instruction-tuned variants. These models were trained with data in over 140 spoken languages.",
  "release_date": "2025-06-26",
  "announcement_date": "2025-06-26",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-06-01",
  "param_count": 8000000000,
  "training_tokens": 11000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/blog/gemma3n",
  "source_playground": "https://aistudio.google.com/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://ai.google.dev/gemma/docs/gemma-3n",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/google/gemma-3n-E2B",
  "created_at": "2025-07-19T19:49:05.508070+00:00",
  "updated_at": "2025-07-19T19:49:05.508070+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemma-3n-e2b-it/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 686,
    "benchmark_id": "aime-2025",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.067,
    "normalized_score": 0.067,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.437675+00:00",
    "updated_at": "2025-07-19T19:56:12.437675+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1327,
    "benchmark_id": "codegolf-v2.2",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.11,
    "normalized_score": 0.11,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.787794+00:00",
    "updated_at": "2025-07-19T19:56:13.787794+00:00",
    "benchmark_name": "Codegolf v2.2"
  },
  {
    "model_benchmark_id": 1226,
    "benchmark_id": "eclektic",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.025,
    "normalized_score": 0.025,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.575847+00:00",
    "updated_at": "2025-07-19T19:56:13.575847+00:00",
    "benchmark_name": "ECLeKTic"
  },
  {
    "model_benchmark_id": 1316,
    "benchmark_id": "global-mmlu",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.551,
    "normalized_score": 0.551,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.758455+00:00",
    "updated_at": "2025-07-19T19:56:13.758455+00:00",
    "benchmark_name": "Global-MMLU"
  },
  {
    "model_benchmark_id": 1218,
    "benchmark_id": "global-mmlu-lite",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.59,
    "normalized_score": 0.59,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.560513+00:00",
    "updated_at": "2025-07-19T19:56:13.560513+00:00",
    "benchmark_name": "Global-MMLU-Lite"
  },
  {
    "model_benchmark_id": 280,
    "benchmark_id": "gpqa",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.248,
    "normalized_score": 0.248,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond. 0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.641018+00:00",
    "updated_at": "2025-07-19T19:56:11.641018+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1165,
    "benchmark_id": "hiddenmath",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.277,
    "normalized_score": 0.277,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.451948+00:00",
    "updated_at": "2025-07-19T19:56:13.451948+00:00",
    "benchmark_name": "HiddenMath"
  },
  {
    "model_benchmark_id": 774,
    "benchmark_id": "humaneval",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.665,
    "normalized_score": 0.665,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.626596+00:00",
    "updated_at": "2025-07-19T19:56:12.626596+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1307,
    "benchmark_id": "include",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.386,
    "normalized_score": 0.386,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.735634+00:00",
    "updated_at": "2025-07-19T19:56:13.735634+00:00",
    "benchmark_name": "Include"
  },
  {
    "model_benchmark_id": 1112,
    "benchmark_id": "livecodebench",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.132,
    "normalized_score": 0.132,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.320311+00:00",
    "updated_at": "2025-07-19T19:56:13.320311+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 1323,
    "benchmark_id": "livecodebench-v5",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.186,
    "normalized_score": 0.186,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.777049+00:00",
    "updated_at": "2025-07-19T19:56:13.777049+00:00",
    "benchmark_name": "LiveCodeBench v5"
  },
  {
    "model_benchmark_id": 1176,
    "benchmark_id": "mbpp",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.566,
    "normalized_score": 0.566,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1. 3-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.477545+00:00",
    "updated_at": "2025-07-19T19:56:13.477545+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1278,
    "benchmark_id": "mgsm",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.531,
    "normalized_score": 0.531,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.679623+00:00",
    "updated_at": "2025-07-19T19:56:13.679623+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 71,
    "benchmark_id": "mmlu",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.601,
    "normalized_score": 0.601,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.234595+00:00",
    "updated_at": "2025-07-19T19:56:11.234595+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 175,
    "benchmark_id": "mmlu-pro",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.405,
    "normalized_score": 0.405,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.439365+00:00",
    "updated_at": "2025-07-19T19:56:11.439365+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1312,
    "benchmark_id": "mmlu-prox",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.081,
    "normalized_score": 0.081,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.746554+00:00",
    "updated_at": "2025-07-19T19:56:13.746554+00:00",
    "benchmark_name": "MMLU-ProX"
  },
  {
    "model_benchmark_id": 1432,
    "benchmark_id": "openai-mmlu",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.223,
    "normalized_score": 0.223,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.047435+00:00",
    "updated_at": "2025-07-19T19:56:14.047435+00:00",
    "benchmark_name": "OpenAI MMLU"
  },
  {
    "model_benchmark_id": 1234,
    "benchmark_id": "wmt24++",
    "model_id": "gemma-3n-e2b-it",
    "score": 0.427,
    "normalized_score": 0.427,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it",
    "verified_by_llmstats": false,
    "analysis_method": "Character-level F-score. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.592107+00:00",
    "updated_at": "2025-07-19T19:56:13.592107+00:00",
    "benchmark_name": "WMT24++"
  }
]

================================================
FILE: data/organizations/google/models/gemma-3n-e2b-it/model.json
================================================
{
  "model_id": "gemma-3n-e2b-it",
  "name": "Gemma 3n E2B Instructed",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemma 3n is a multimodal model designed to run locally on hardware, supporting image, text, audio, and video inputs. It features a language decoder, audio encoder, and vision encoder, and is available in two sizes: E2B and E4B. The model is optimized for memory efficiency, allowing it to run on devices with limited GPU RAM. Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. Gemma models are well-suited for a variety of content understanding tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as laptops, desktops or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone. Gemma 3n models are designed for efficient execution on low-resource devices. They are capable of multimodal input, handling text, image, video, and audio input, and generating text outputs, with open weights for instruction-tuned variants. These models were trained with data in over 140 spoken languages.",
  "release_date": "2025-06-26",
  "announcement_date": "2025-06-26",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-06-01",
  "param_count": 8000000000,
  "training_tokens": 11000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/blog/gemma3n",
  "source_playground": "https://aistudio.google.com/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://ai.google.dev/gemma/docs/gemma-3n",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/google/gemma-3n-E2B-it",
  "created_at": "2025-07-19T19:49:05.541972+00:00",
  "updated_at": "2025-07-19T19:49:05.541972+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemma-3n-e2b-it-litert-preview/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 680,
    "benchmark_id": "aime-2025",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.067,
    "normalized_score": 0.067,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.419451+00:00",
    "updated_at": "2025-07-19T19:56:12.419451+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 7,
    "benchmark_id": "arc-c",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.517,
    "normalized_score": 0.517,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "25-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.095909+00:00",
    "updated_at": "2025-07-19T19:56:11.095909+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1053,
    "benchmark_id": "arc-e",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.758,
    "normalized_score": 0.758,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.199540+00:00",
    "updated_at": "2025-07-19T19:56:13.199540+00:00",
    "benchmark_name": "ARC-E"
  },
  {
    "model_benchmark_id": 1069,
    "benchmark_id": "big-bench-hard",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.443,
    "normalized_score": 0.443,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "few-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.229977+00:00",
    "updated_at": "2025-07-19T19:56:13.229977+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 1019,
    "benchmark_id": "boolq",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.764,
    "normalized_score": 0.764,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.123278+00:00",
    "updated_at": "2025-07-19T19:56:13.123278+00:00",
    "benchmark_name": "BoolQ"
  },
  {
    "model_benchmark_id": 1325,
    "benchmark_id": "codegolf-v2.2",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.11,
    "normalized_score": 0.11,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.783685+00:00",
    "updated_at": "2025-07-19T19:56:13.783685+00:00",
    "benchmark_name": "Codegolf v2.2"
  },
  {
    "model_benchmark_id": 944,
    "benchmark_id": "drop",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.539,
    "normalized_score": 0.539,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "1-shot Token F1 score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.993202+00:00",
    "updated_at": "2025-07-19T19:56:12.993202+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 1221,
    "benchmark_id": "eclektic",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.025,
    "normalized_score": 0.025,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot ECLeKTic score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.567241+00:00",
    "updated_at": "2025-07-19T19:56:13.567241+00:00",
    "benchmark_name": "ECLeKTic"
  },
  {
    "model_benchmark_id": 1314,
    "benchmark_id": "global-mmlu",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.551,
    "normalized_score": 0.551,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.754602+00:00",
    "updated_at": "2025-07-19T19:56:13.754602+00:00",
    "benchmark_name": "Global-MMLU"
  },
  {
    "model_benchmark_id": 1208,
    "benchmark_id": "global-mmlu-lite",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.59,
    "normalized_score": 0.59,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.542151+00:00",
    "updated_at": "2025-07-19T19:56:13.542151+00:00",
    "benchmark_name": "Global-MMLU-Lite"
  },
  {
    "model_benchmark_id": 265,
    "benchmark_id": "gpqa",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.248,
    "normalized_score": 0.248,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond, 0-shot RelaxedAccuracy/accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.609514+00:00",
    "updated_at": "2025-07-19T19:56:11.609514+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 35,
    "benchmark_id": "hellaswag",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.722,
    "normalized_score": 0.722,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.154889+00:00",
    "updated_at": "2025-07-19T19:56:11.154889+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 1155,
    "benchmark_id": "hiddenmath",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.277,
    "normalized_score": 0.277,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.431354+00:00",
    "updated_at": "2025-07-19T19:56:13.431354+00:00",
    "benchmark_name": "HiddenMath"
  },
  {
    "model_benchmark_id": 764,
    "benchmark_id": "humaneval",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.665,
    "normalized_score": 0.665,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.609959+00:00",
    "updated_at": "2025-07-19T19:56:12.609959+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1305,
    "benchmark_id": "include",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.386,
    "normalized_score": 0.386,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.731041+00:00",
    "updated_at": "2025-07-19T19:56:13.731041+00:00",
    "benchmark_name": "Include"
  },
  {
    "model_benchmark_id": 1103,
    "benchmark_id": "livecodebench",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.132,
    "normalized_score": 0.132,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.298197+00:00",
    "updated_at": "2025-07-19T19:56:13.298197+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 1319,
    "benchmark_id": "livecodebench-v5",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.186,
    "normalized_score": 0.186,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.768006+00:00",
    "updated_at": "2025-07-19T19:56:13.768006+00:00",
    "benchmark_name": "LiveCodeBench v5"
  },
  {
    "model_benchmark_id": 1168,
    "benchmark_id": "mbpp",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.566,
    "normalized_score": 0.566,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.460487+00:00",
    "updated_at": "2025-07-19T19:56:13.460487+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1274,
    "benchmark_id": "mgsm",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.531,
    "normalized_score": 0.531,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.672774+00:00",
    "updated_at": "2025-07-19T19:56:13.672774+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 65,
    "benchmark_id": "mmlu",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.601,
    "normalized_score": 0.601,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.222830+00:00",
    "updated_at": "2025-07-19T19:56:11.222830+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 165,
    "benchmark_id": "mmlu-pro",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.405,
    "normalized_score": 0.405,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.421645+00:00",
    "updated_at": "2025-07-19T19:56:11.421645+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1310,
    "benchmark_id": "mmlu-prox",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.081,
    "normalized_score": 0.081,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.743201+00:00",
    "updated_at": "2025-07-19T19:56:13.743201+00:00",
    "benchmark_name": "MMLU-ProX"
  },
  {
    "model_benchmark_id": 1046,
    "benchmark_id": "natural-questions",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.155,
    "normalized_score": 0.155,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.184897+00:00",
    "updated_at": "2025-07-19T19:56:13.184897+00:00",
    "benchmark_name": "Natural Questions"
  },
  {
    "model_benchmark_id": 1028,
    "benchmark_id": "piqa",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.789,
    "normalized_score": 0.789,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.142086+00:00",
    "updated_at": "2025-07-19T19:56:13.142086+00:00",
    "benchmark_name": "PIQA"
  },
  {
    "model_benchmark_id": 1037,
    "benchmark_id": "social-iqa",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.488,
    "normalized_score": 0.488,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.164056+00:00",
    "updated_at": "2025-07-19T19:56:13.164056+00:00",
    "benchmark_name": "Social IQa"
  },
  {
    "model_benchmark_id": 246,
    "benchmark_id": "triviaqa",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.608,
    "normalized_score": 0.608,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.571204+00:00",
    "updated_at": "2025-07-19T19:56:11.571204+00:00",
    "benchmark_name": "TriviaQA"
  },
  {
    "model_benchmark_id": 1059,
    "benchmark_id": "winogrande",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.668,
    "normalized_score": 0.668,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.210650+00:00",
    "updated_at": "2025-07-19T19:56:13.210650+00:00",
    "benchmark_name": "Winogrande"
  },
  {
    "model_benchmark_id": 1229,
    "benchmark_id": "wmt24++",
    "model_id": "gemma-3n-e2b-it-litert-preview",
    "score": 0.427,
    "normalized_score": 0.427,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "ChrF, 0-shot Character-level F-score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.582347+00:00",
    "updated_at": "2025-07-19T19:56:13.582347+00:00",
    "benchmark_name": "WMT24++"
  }
]

================================================
FILE: data/organizations/google/models/gemma-3n-e2b-it-litert-preview/model.json
================================================
{
  "model_id": "gemma-3n-e2b-it-litert-preview",
  "name": "Gemma 3n E2B Instructed LiteRT (Preview)",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemma 3n is a generative AI model optimized for use in everyday devices, such as phones, laptops, and tablets. It features innovations like Per-Layer Embedding (PLE) parameter caching and a MatFormer model architecture for reduced compute and memory. These models handle audio, text, and visual data, though this E2B preview currently supports text and vision input. Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models, and is licensed for responsible commercial use.",
  "release_date": "2025-05-20",
  "announcement_date": "2025-05-20",
  "license_id": "gemma",
  "multimodal": true,
  "knowledge_cutoff": "2024-06-01",
  "param_count": 1910000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://aistudio.google.com/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://ai.google.dev/gemma/docs/gemma-3n",
  "source_repo_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
  "source_weights_link": "https://huggingface.co/google/gemma-3n-E2B-it-litert-preview",
  "created_at": "2025-07-19T19:49:05.466473+00:00",
  "updated_at": "2025-07-19T19:49:05.466473+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemma-3n-e4b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 5,
    "benchmark_id": "arc-c",
    "model_id": "gemma-3n-e4b",
    "score": 0.616,
    "normalized_score": 0.616,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B",
    "verified_by_llmstats": false,
    "analysis_method": "25-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.091862+00:00",
    "updated_at": "2025-07-19T19:56:11.091862+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1051,
    "benchmark_id": "arc-e",
    "model_id": "gemma-3n-e4b",
    "score": 0.816,
    "normalized_score": 0.816,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.195091+00:00",
    "updated_at": "2025-07-19T19:56:13.195091+00:00",
    "benchmark_name": "ARC-E"
  },
  {
    "model_benchmark_id": 1066,
    "benchmark_id": "big-bench-hard",
    "model_id": "gemma-3n-e4b",
    "score": 0.529,
    "normalized_score": 0.529,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B",
    "verified_by_llmstats": false,
    "analysis_method": "few-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.225269+00:00",
    "updated_at": "2025-07-19T19:56:13.225269+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 1017,
    "benchmark_id": "boolq",
    "model_id": "gemma-3n-e4b",
    "score": 0.816,
    "normalized_score": 0.816,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.120054+00:00",
    "updated_at": "2025-07-19T19:56:13.120054+00:00",
    "benchmark_name": "BoolQ"
  },
  {
    "model_benchmark_id": 942,
    "benchmark_id": "drop",
    "model_id": "gemma-3n-e4b",
    "score": 0.608,
    "normalized_score": 0.608,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B",
    "verified_by_llmstats": false,
    "analysis_method": "Token F1 score. 1-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.989555+00:00",
    "updated_at": "2025-07-19T19:56:12.989555+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 33,
    "benchmark_id": "hellaswag",
    "model_id": "gemma-3n-e4b",
    "score": 0.786,
    "normalized_score": 0.786,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.150880+00:00",
    "updated_at": "2025-07-19T19:56:11.150880+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 1044,
    "benchmark_id": "natural-questions",
    "model_id": "gemma-3n-e4b",
    "score": 0.209,
    "normalized_score": 0.209,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.181324+00:00",
    "updated_at": "2025-07-19T19:56:13.181324+00:00",
    "benchmark_name": "Natural Questions"
  },
  {
    "model_benchmark_id": 1026,
    "benchmark_id": "piqa",
    "model_id": "gemma-3n-e4b",
    "score": 0.81,
    "normalized_score": 0.81,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.136080+00:00",
    "updated_at": "2025-07-19T19:56:13.136080+00:00",
    "benchmark_name": "PIQA"
  },
  {
    "model_benchmark_id": 1035,
    "benchmark_id": "social-iqa",
    "model_id": "gemma-3n-e4b",
    "score": 0.5,
    "normalized_score": 0.5,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.159816+00:00",
    "updated_at": "2025-07-19T19:56:13.159816+00:00",
    "benchmark_name": "Social IQa"
  },
  {
    "model_benchmark_id": 244,
    "benchmark_id": "triviaqa",
    "model_id": "gemma-3n-e4b",
    "score": 0.702,
    "normalized_score": 0.702,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.567693+00:00",
    "updated_at": "2025-07-19T19:56:11.567693+00:00",
    "benchmark_name": "TriviaQA"
  },
  {
    "model_benchmark_id": 1057,
    "benchmark_id": "winogrande",
    "model_id": "gemma-3n-e4b",
    "score": 0.717,
    "normalized_score": 0.717,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.207598+00:00",
    "updated_at": "2025-07-19T19:56:13.207598+00:00",
    "benchmark_name": "Winogrande"
  }
]

================================================
FILE: data/organizations/google/models/gemma-3n-e4b/model.json
================================================
{
  "model_id": "gemma-3n-e4b",
  "name": "Gemma 3n E4B",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemma 3n is a multimodal model designed to run locally on hardware, supporting image, text, audio, and video inputs. It features a language decoder, audio encoder, and vision encoder, and is available in two sizes: E2B and E4B. The model is optimized for memory efficiency, allowing it to run on devices with limited GPU RAM. Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. Gemma models are well-suited for a variety of content understanding tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as laptops, desktops or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone. Gemma 3n models are designed for efficient execution on low-resource devices. They are capable of multimodal input, handling text, image, video, and audio input, and generating text outputs, with open weights for instruction-tuned variants. These models were trained with data in over 140 spoken languages.",
  "release_date": "2025-06-26",
  "announcement_date": "2025-06-26",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-06-01",
  "param_count": 8000000000,
  "training_tokens": 11000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/blog/gemma3n",
  "source_playground": "https://aistudio.google.com/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://ai.google.dev/gemma/docs/gemma-3n",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/google/gemma-3n-E4B",
  "created_at": "2025-07-19T19:49:05.440084+00:00",
  "updated_at": "2025-07-19T19:49:05.440084+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemma-3n-e4b-it/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 684,
    "benchmark_id": "aime-2025",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.116,
    "normalized_score": 0.116,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.431148+00:00",
    "updated_at": "2025-07-19T19:56:12.431148+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1326,
    "benchmark_id": "codegolf-v2.2",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.168,
    "normalized_score": 0.168,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.785856+00:00",
    "updated_at": "2025-07-19T19:56:13.785856+00:00",
    "benchmark_name": "Codegolf v2.2"
  },
  {
    "model_benchmark_id": 1222,
    "benchmark_id": "eclektic",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.19,
    "normalized_score": 0.19,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.569227+00:00",
    "updated_at": "2025-07-19T19:56:13.569227+00:00",
    "benchmark_name": "ECLeKTic"
  },
  {
    "model_benchmark_id": 1315,
    "benchmark_id": "global-mmlu",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.603,
    "normalized_score": 0.603,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.756363+00:00",
    "updated_at": "2025-07-19T19:56:13.756363+00:00",
    "benchmark_name": "Global-MMLU"
  },
  {
    "model_benchmark_id": 1213,
    "benchmark_id": "global-mmlu-lite",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.645,
    "normalized_score": 0.645,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.552233+00:00",
    "updated_at": "2025-07-19T19:56:13.552233+00:00",
    "benchmark_name": "Global-MMLU-Lite"
  },
  {
    "model_benchmark_id": 273,
    "benchmark_id": "gpqa",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.237,
    "normalized_score": 0.237,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond. 0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.624084+00:00",
    "updated_at": "2025-07-19T19:56:11.624084+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1159,
    "benchmark_id": "hiddenmath",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.377,
    "normalized_score": 0.377,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.438271+00:00",
    "updated_at": "2025-07-19T19:56:13.438271+00:00",
    "benchmark_name": "HiddenMath"
  },
  {
    "model_benchmark_id": 769,
    "benchmark_id": "humaneval",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.75,
    "normalized_score": 0.75,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.618954+00:00",
    "updated_at": "2025-07-19T19:56:12.618954+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1306,
    "benchmark_id": "include",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.572,
    "normalized_score": 0.572,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.733461+00:00",
    "updated_at": "2025-07-19T19:56:13.733461+00:00",
    "benchmark_name": "Include"
  },
  {
    "model_benchmark_id": 1106,
    "benchmark_id": "livecodebench",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.132,
    "normalized_score": 0.132,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.304919+00:00",
    "updated_at": "2025-07-19T19:56:13.304919+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 1322,
    "benchmark_id": "livecodebench-v5",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.257,
    "normalized_score": 0.257,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.775429+00:00",
    "updated_at": "2025-07-19T19:56:13.775429+00:00",
    "benchmark_name": "LiveCodeBench v5"
  },
  {
    "model_benchmark_id": 1171,
    "benchmark_id": "mbpp",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.636,
    "normalized_score": 0.636,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1. 3-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.466832+00:00",
    "updated_at": "2025-07-19T19:56:13.466832+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1277,
    "benchmark_id": "mgsm",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.67,
    "normalized_score": 0.67,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.678210+00:00",
    "updated_at": "2025-07-19T19:56:13.678210+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 70,
    "benchmark_id": "mmlu",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.649,
    "normalized_score": 0.649,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.232243+00:00",
    "updated_at": "2025-07-19T19:56:11.232243+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 169,
    "benchmark_id": "mmlu-pro",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.506,
    "normalized_score": 0.506,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.428457+00:00",
    "updated_at": "2025-07-19T19:56:11.428457+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1311,
    "benchmark_id": "mmlu-prox",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.199,
    "normalized_score": 0.199,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.744918+00:00",
    "updated_at": "2025-07-19T19:56:13.744918+00:00",
    "benchmark_name": "MMLU-ProX"
  },
  {
    "model_benchmark_id": 1431,
    "benchmark_id": "openai-mmlu",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.356,
    "normalized_score": 0.356,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.045887+00:00",
    "updated_at": "2025-07-19T19:56:14.045887+00:00",
    "benchmark_name": "OpenAI MMLU"
  },
  {
    "model_benchmark_id": 1230,
    "benchmark_id": "wmt24++",
    "model_id": "gemma-3n-e4b-it",
    "score": 0.501,
    "normalized_score": 0.501,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it",
    "verified_by_llmstats": false,
    "analysis_method": "Character-level F-score. 0-shot.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.584588+00:00",
    "updated_at": "2025-07-19T19:56:13.584588+00:00",
    "benchmark_name": "WMT24++"
  }
]

================================================
FILE: data/organizations/google/models/gemma-3n-e4b-it/model.json
================================================
{
  "model_id": "gemma-3n-e4b-it",
  "name": "Gemma 3n E4B Instructed",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemma 3n is a multimodal model designed to run locally on hardware, supporting image, text, audio, and video inputs. It features a language decoder, audio encoder, and vision encoder, and is available in two sizes: E2B and E4B. The model is optimized for memory efficiency, allowing it to run on devices with limited GPU RAM. Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. Gemma models are well-suited for a variety of content understanding tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as laptops, desktops or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone. Gemma 3n models are designed for efficient execution on low-resource devices. They are capable of multimodal input, handling text, image, video, and audio input, and generating text outputs, with open weights for instruction-tuned variants. These models were trained with data in over 140 spoken languages.",
  "release_date": "2025-06-26",
  "announcement_date": "2025-06-26",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-06-01",
  "param_count": 8000000000,
  "training_tokens": 11000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/blog/gemma3n",
  "source_playground": "https://aistudio.google.com/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://ai.google.dev/gemma/docs/gemma-3n",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/google/gemma-3n-E4B-it",
  "created_at": "2025-07-19T19:49:05.517334+00:00",
  "updated_at": "2025-07-19T19:49:05.517334+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/gemma-3n-e4b-it-litert-preview/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 678,
    "benchmark_id": "aime-2025",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.116,
    "normalized_score": 0.116,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.414248+00:00",
    "updated_at": "2025-07-19T19:56:12.414248+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 6,
    "benchmark_id": "arc-c",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.616,
    "normalized_score": 0.616,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "25-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.093723+00:00",
    "updated_at": "2025-07-19T19:56:11.093723+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1052,
    "benchmark_id": "arc-e",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.816,
    "normalized_score": 0.816,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.196728+00:00",
    "updated_at": "2025-07-19T19:56:13.196728+00:00",
    "benchmark_name": "ARC-E"
  },
  {
    "model_benchmark_id": 1068,
    "benchmark_id": "big-bench-hard",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.529,
    "normalized_score": 0.529,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "few-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.228349+00:00",
    "updated_at": "2025-07-19T19:56:13.228349+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 1018,
    "benchmark_id": "boolq",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.816,
    "normalized_score": 0.816,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.121696+00:00",
    "updated_at": "2025-07-19T19:56:13.121696+00:00",
    "benchmark_name": "BoolQ"
  },
  {
    "model_benchmark_id": 1324,
    "benchmark_id": "codegolf-v2.2",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.168,
    "normalized_score": 0.168,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.781222+00:00",
    "updated_at": "2025-07-19T19:56:13.781222+00:00",
    "benchmark_name": "Codegolf v2.2"
  },
  {
    "model_benchmark_id": 943,
    "benchmark_id": "drop",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.608,
    "normalized_score": 0.608,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "1-shot Token F1 score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.991359+00:00",
    "updated_at": "2025-07-19T19:56:12.991359+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 1220,
    "benchmark_id": "eclektic",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.019,
    "normalized_score": 0.019,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot ECLeKTic score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.565422+00:00",
    "updated_at": "2025-07-19T19:56:13.565422+00:00",
    "benchmark_name": "ECLeKTic"
  },
  {
    "model_benchmark_id": 1313,
    "benchmark_id": "global-mmlu",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.603,
    "normalized_score": 0.603,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.752749+00:00",
    "updated_at": "2025-07-19T19:56:13.752749+00:00",
    "benchmark_name": "Global-MMLU"
  },
  {
    "model_benchmark_id": 1206,
    "benchmark_id": "global-mmlu-lite",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.645,
    "normalized_score": 0.645,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.538643+00:00",
    "updated_at": "2025-07-19T19:56:13.538643+00:00",
    "benchmark_name": "Global-MMLU-Lite"
  },
  {
    "model_benchmark_id": 262,
    "benchmark_id": "gpqa",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.237,
    "normalized_score": 0.237,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond, 0-shot RelaxedAccuracy/accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.602493+00:00",
    "updated_at": "2025-07-19T19:56:11.602493+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 34,
    "benchmark_id": "hellaswag",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.786,
    "normalized_score": 0.786,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.152761+00:00",
    "updated_at": "2025-07-19T19:56:11.152761+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 1154,
    "benchmark_id": "hiddenmath",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.377,
    "normalized_score": 0.377,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.429415+00:00",
    "updated_at": "2025-07-19T19:56:13.429415+00:00",
    "benchmark_name": "HiddenMath"
  },
  {
    "model_benchmark_id": 763,
    "benchmark_id": "humaneval",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.75,
    "normalized_score": 0.75,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.608423+00:00",
    "updated_at": "2025-07-19T19:56:12.608423+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1304,
    "benchmark_id": "include",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.572,
    "normalized_score": 0.572,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.729199+00:00",
    "updated_at": "2025-07-19T19:56:13.729199+00:00",
    "benchmark_name": "Include"
  },
  {
    "model_benchmark_id": 1102,
    "benchmark_id": "livecodebench",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.132,
    "normalized_score": 0.132,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.296281+00:00",
    "updated_at": "2025-07-19T19:56:13.296281+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 1317,
    "benchmark_id": "livecodebench-v5",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.257,
    "normalized_score": 0.257,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.761673+00:00",
    "updated_at": "2025-07-19T19:56:13.761673+00:00",
    "benchmark_name": "LiveCodeBench v5"
  },
  {
    "model_benchmark_id": 1167,
    "benchmark_id": "mbpp",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.636,
    "normalized_score": 0.636,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.458570+00:00",
    "updated_at": "2025-07-19T19:56:13.458570+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1273,
    "benchmark_id": "mgsm",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.607,
    "normalized_score": 0.607,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.671283+00:00",
    "updated_at": "2025-07-19T19:56:13.671283+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 63,
    "benchmark_id": "mmlu",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.649,
    "normalized_score": 0.649,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.219372+00:00",
    "updated_at": "2025-07-19T19:56:11.219372+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 164,
    "benchmark_id": "mmlu-pro",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.506,
    "normalized_score": 0.506,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.420000+00:00",
    "updated_at": "2025-07-19T19:56:11.420000+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1309,
    "benchmark_id": "mmlu-prox",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.199,
    "normalized_score": 0.199,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.741460+00:00",
    "updated_at": "2025-07-19T19:56:13.741460+00:00",
    "benchmark_name": "MMLU-ProX"
  },
  {
    "model_benchmark_id": 1045,
    "benchmark_id": "natural-questions",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.209,
    "normalized_score": 0.209,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.183031+00:00",
    "updated_at": "2025-07-19T19:56:13.183031+00:00",
    "benchmark_name": "Natural Questions"
  },
  {
    "model_benchmark_id": 1027,
    "benchmark_id": "piqa",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.81,
    "normalized_score": 0.81,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.137952+00:00",
    "updated_at": "2025-07-19T19:56:13.137952+00:00",
    "benchmark_name": "PIQA"
  },
  {
    "model_benchmark_id": 1036,
    "benchmark_id": "social-iqa",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.5,
    "normalized_score": 0.5,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.161822+00:00",
    "updated_at": "2025-07-19T19:56:13.161822+00:00",
    "benchmark_name": "Social IQa"
  },
  {
    "model_benchmark_id": 245,
    "benchmark_id": "triviaqa",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.702,
    "normalized_score": 0.702,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.569334+00:00",
    "updated_at": "2025-07-19T19:56:11.569334+00:00",
    "benchmark_name": "TriviaQA"
  },
  {
    "model_benchmark_id": 1058,
    "benchmark_id": "winogrande",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.717,
    "normalized_score": 0.717,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.209229+00:00",
    "updated_at": "2025-07-19T19:56:13.209229+00:00",
    "benchmark_name": "Winogrande"
  },
  {
    "model_benchmark_id": 1228,
    "benchmark_id": "wmt24++",
    "model_id": "gemma-3n-e4b-it-litert-preview",
    "score": 0.501,
    "normalized_score": 0.501,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
    "verified_by_llmstats": false,
    "analysis_method": "ChrF, 0-shot Character-level F-score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.580409+00:00",
    "updated_at": "2025-07-19T19:56:13.580409+00:00",
    "benchmark_name": "WMT24++"
  }
]

================================================
FILE: data/organizations/google/models/gemma-3n-e4b-it-litert-preview/model.json
================================================
{
  "model_id": "gemma-3n-e4b-it-litert-preview",
  "name": "Gemma 3n E4B Instructed LiteRT Preview",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "Gemma 3n is a generative AI model optimized for use in everyday devices, such as phones, laptops, and tablets. It features innovations like Per-Layer Embedding (PLE) parameter caching and a MatFormer model architecture for reduced compute and memory. These models handle audio, text, and visual data, though this E4B preview currently supports text and vision input. Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models, and is licensed for responsible commercial use.",
  "release_date": "2025-05-20",
  "announcement_date": "2025-05-20",
  "license_id": "gemma",
  "multimodal": true,
  "knowledge_cutoff": "2024-06-01",
  "param_count": 1910000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://aistudio.google.com/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://ai.google.dev/gemma/docs/gemma-3n",
  "source_repo_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
  "source_weights_link": "https://huggingface.co/google/gemma-3n-E4B-it-litert-preview",
  "created_at": "2025-07-19T19:49:05.451978+00:00",
  "updated_at": "2025-07-19T19:49:05.451978+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/models/medgemma-4b-it/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1425,
    "benchmark_id": "chexpert-cxr",
    "model_id": "medgemma-4b-it",
    "score": 0.481,
    "normalized_score": 0.481,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/medgemma-4b-it",
    "verified_by_llmstats": false,
    "analysis_method": "Average F1 for top 5 conditions",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.023334+00:00",
    "updated_at": "2025-07-19T19:56:14.023334+00:00",
    "benchmark_name": "CheXpert CXR"
  },
  {
    "model_benchmark_id": 1426,
    "benchmark_id": "dermmcqa",
    "model_id": "medgemma-4b-it",
    "score": 0.718,
    "normalized_score": 0.718,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/medgemma-4b-it",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.026812+00:00",
    "updated_at": "2025-07-19T19:56:14.026812+00:00",
    "benchmark_name": "DermMCQA"
  },
  {
    "model_benchmark_id": 1430,
    "benchmark_id": "medxpertqa",
    "model_id": "medgemma-4b-it",
    "score": 0.188,
    "normalized_score": 0.188,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/medgemma-4b-it",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.042823+00:00",
    "updated_at": "2025-07-19T19:56:14.042823+00:00",
    "benchmark_name": "MedXpertQA"
  },
  {
    "model_benchmark_id": 1424,
    "benchmark_id": "mimic-cxr",
    "model_id": "medgemma-4b-it",
    "score": 0.889,
    "normalized_score": 0.889,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/medgemma-4b-it",
    "verified_by_llmstats": false,
    "analysis_method": "Average F1 for top 5 conditions",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.019964+00:00",
    "updated_at": "2025-07-19T19:56:14.019964+00:00",
    "benchmark_name": "MIMIC CXR"
  },
  {
    "model_benchmark_id": 1429,
    "benchmark_id": "pathmcqa",
    "model_id": "medgemma-4b-it",
    "score": 0.698,
    "normalized_score": 0.698,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/medgemma-4b-it",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.039089+00:00",
    "updated_at": "2025-07-19T19:56:14.039089+00:00",
    "benchmark_name": "PathMCQA"
  },
  {
    "model_benchmark_id": 1427,
    "benchmark_id": "slakevqa",
    "model_id": "medgemma-4b-it",
    "score": 0.623,
    "normalized_score": 0.623,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/medgemma-4b-it",
    "verified_by_llmstats": false,
    "analysis_method": "Tokenized F1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.029835+00:00",
    "updated_at": "2025-07-19T19:56:14.029835+00:00",
    "benchmark_name": "SlakeVQA"
  },
  {
    "model_benchmark_id": 1428,
    "benchmark_id": "vqa-rad",
    "model_id": "medgemma-4b-it",
    "score": 0.499,
    "normalized_score": 0.499,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/google/medgemma-4b-it",
    "verified_by_llmstats": false,
    "analysis_method": "Tokenized F1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.035504+00:00",
    "updated_at": "2025-07-19T19:56:14.035504+00:00",
    "benchmark_name": "VQA-Rad"
  }
]

================================================
FILE: data/organizations/google/models/medgemma-4b-it/model.json
================================================
{
  "model_id": "medgemma-4b-it",
  "name": "MedGemma 4B IT",
  "organization_id": "google",
  "fine_tuned_from_model_id": null,
  "description": "MedGemma is a collection of Gemma 3 variants that are trained for performance on medical text and image comprehension. MedGemma 4B utilizes a SigLIP image encoder that has been specifically pre-trained on a variety of de-identified medical data, including chest X-rays, dermatology images, ophthalmology images, and histopathology slides. Its LLM component is trained on a diverse set of medical data, including radiology images, histopathology patches, ophthalmology images, and dermatology images. MedGemma is a multimodal model primarily evaluated on single-image tasks. It has not been evaluated for multi-turn applications and may be more sensitive to specific prompts than its predecessor, Gemma 3. Developers should consider bias in validation data and data contamination concerns when using MedGemma.",
  "release_date": "2025-05-20",
  "announcement_date": "2025-05-20",
  "license_id": "health_ai_developer_foundations_terms_of_use",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 4300000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://developers.google.com/health-ai-developer-foundations/medgemma/get-started",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://developers.google.com/health-ai-developer-foundations/medgemma/model-card",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/google/medgemma-4b-it",
  "created_at": "2025-07-19T19:49:05.511963+00:00",
  "updated_at": "2025-07-19T19:49:05.511963+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/google/organization.json
================================================
{
  "organization_id": "google",
  "name": "Google",
  "website": "https://google.com",
  "description": "Technology giant with AI research",
  "country": "US",
  "created_at": "2025-07-19T19:49:05.437977+00:00",
  "updated_at": "2025-07-19T19:49:05.437977+00:00"
}


================================================
FILE: data/organizations/ibm/models/granite-3.3-8b-base/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1409,
    "benchmark_id": "agieval",
    "model_id": "granite-3.3-8b-base",
    "score": 0.493,
    "normalized_score": 0.493,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.976963+00:00",
    "updated_at": "2025-07-19T19:56:13.976963+00:00",
    "benchmark_name": "AGIEval"
  },
  {
    "model_benchmark_id": 477,
    "benchmark_id": "aime-2024",
    "model_id": "granite-3.3-8b-base",
    "score": 0.812,
    "normalized_score": 0.812,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "Not specified",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.006332+00:00",
    "updated_at": "2025-07-19T19:56:12.006332+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 1794,
    "benchmark_id": "alpacaeval-2.0",
    "model_id": "granite-3.3-8b-base",
    "score": 0.6268,
    "normalized_score": 0.6268,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.048676+00:00",
    "updated_at": "2025-07-19T19:56:15.048676+00:00",
    "benchmark_name": "AlpacaEval 2.0"
  },
  {
    "model_benchmark_id": 23,
    "benchmark_id": "arc-c",
    "model_id": "granite-3.3-8b-base",
    "score": 0.5084,
    "normalized_score": 0.5084,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.131347+00:00",
    "updated_at": "2025-07-19T19:56:11.131347+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1460,
    "benchmark_id": "arena-hard",
    "model_id": "granite-3.3-8b-base",
    "score": 0.5756,
    "normalized_score": 0.5756,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "Arena Hard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.111734+00:00",
    "updated_at": "2025-07-19T19:56:14.111734+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 1807,
    "benchmark_id": "attaq",
    "model_id": "granite-3.3-8b-base",
    "score": 0.885,
    "normalized_score": 0.885,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "Not specified (OLMES)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.087212+00:00",
    "updated_at": "2025-07-19T19:56:15.087212+00:00",
    "benchmark_name": "AttaQ"
  },
  {
    "model_benchmark_id": 1081,
    "benchmark_id": "big-bench-hard",
    "model_id": "granite-3.3-8b-base",
    "score": 0.6913,
    "normalized_score": 0.6913,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "OLMES (Added regex for more efficient answer extraction)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.251020+00:00",
    "updated_at": "2025-07-19T19:56:13.251020+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 955,
    "benchmark_id": "drop",
    "model_id": "granite-3.3-8b-base",
    "score": 0.3614,
    "normalized_score": 0.3614,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.012196+00:00",
    "updated_at": "2025-07-19T19:56:13.012196+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 1004,
    "benchmark_id": "gsm8k",
    "model_id": "granite-3.3-8b-base",
    "score": 0.59,
    "normalized_score": 0.59,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.098078+00:00",
    "updated_at": "2025-07-19T19:56:13.098078+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 49,
    "benchmark_id": "hellaswag",
    "model_id": "granite-3.3-8b-base",
    "score": 0.801,
    "normalized_score": 0.801,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.186799+00:00",
    "updated_at": "2025-07-19T19:56:11.186799+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 798,
    "benchmark_id": "humaneval",
    "model_id": "granite-3.3-8b-base",
    "score": 0.8973,
    "normalized_score": 0.8973,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "OLMES",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.666882+00:00",
    "updated_at": "2025-07-19T19:56:12.666882+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1444,
    "benchmark_id": "humaneval+",
    "model_id": "granite-3.3-8b-base",
    "score": 0.8609,
    "normalized_score": 0.8609,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "OLMES",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.078662+00:00",
    "updated_at": "2025-07-19T19:56:14.078662+00:00",
    "benchmark_name": "HumanEval+"
  },
  {
    "model_benchmark_id": 626,
    "benchmark_id": "ifeval",
    "model_id": "granite-3.3-8b-base",
    "score": 0.7482,
    "normalized_score": 0.7482,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "OLMES",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.288064+00:00",
    "updated_at": "2025-07-19T19:56:12.288064+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 508,
    "benchmark_id": "math-500",
    "model_id": "granite-3.3-8b-base",
    "score": 0.6902,
    "normalized_score": 0.6902,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "Not specified",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.056690+00:00",
    "updated_at": "2025-07-19T19:56:12.056690+00:00",
    "benchmark_name": "MATH-500"
  },
  {
    "model_benchmark_id": 101,
    "benchmark_id": "mmlu",
    "model_id": "granite-3.3-8b-base",
    "score": 0.6389,
    "normalized_score": 0.6389,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.290899+00:00",
    "updated_at": "2025-07-19T19:56:11.290899+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1808,
    "benchmark_id": "nq",
    "model_id": "granite-3.3-8b-base",
    "score": 0.365,
    "normalized_score": 0.365,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.090844+00:00",
    "updated_at": "2025-07-19T19:56:15.090844+00:00",
    "benchmark_name": "NQ"
  },
  {
    "model_benchmark_id": 1804,
    "benchmark_id": "popqa",
    "model_id": "granite-3.3-8b-base",
    "score": 0.2617,
    "normalized_score": 0.2617,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.078883+00:00",
    "updated_at": "2025-07-19T19:56:15.078883+00:00",
    "benchmark_name": "PopQA"
  },
  {
    "model_benchmark_id": 250,
    "benchmark_id": "triviaqa",
    "model_id": "granite-3.3-8b-base",
    "score": 0.7818,
    "normalized_score": 0.7818,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.577753+00:00",
    "updated_at": "2025-07-19T19:56:11.577753+00:00",
    "benchmark_name": "TriviaQA"
  },
  {
    "model_benchmark_id": 142,
    "benchmark_id": "truthfulqa",
    "model_id": "granite-3.3-8b-base",
    "score": 0.5215,
    "normalized_score": 0.5215,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.362380+00:00",
    "updated_at": "2025-07-19T19:56:11.362380+00:00",
    "benchmark_name": "TruthfulQA"
  },
  {
    "model_benchmark_id": 152,
    "benchmark_id": "winogrande",
    "model_id": "granite-3.3-8b-base",
    "score": 0.744,
    "normalized_score": 0.744,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.387990+00:00",
    "updated_at": "2025-07-19T19:56:11.387990+00:00",
    "benchmark_name": "Winogrande"
  }
]

================================================
FILE: data/organizations/ibm/models/granite-3.3-8b-base/model.json
================================================
{
  "model_id": "granite-3.3-8b-base",
  "name": "Granite 3.3 8B Base",
  "organization_id": "ibm",
  "fine_tuned_from_model_id": null,
  "description": "Granite-3.3-8B-Base is a decoder-only language model with a 128K token context window. It improves upon Granite-3.1-8B-Base by adding support for Fill-in-the-Middle (FIM) using specialized tokens, enabling the model to generate content conditioned on both prefix and suffix. This makes it well-suited for code completion tasks",
  "release_date": "2025-04-16",
  "announcement_date": "2025-04-16",
  "license_id": "apache_2_0",
  "multimodal": true,
  "knowledge_cutoff": "2024-04-01",
  "param_count": 8170000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.ibm.com/granite/docs/",
  "source_playground": "https://www.ibm.com/granite/playground/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
  "source_repo_link": "https://github.com/ibm-granite/granite-3.3-language-models",
  "source_weights_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-base",
  "created_at": "2025-07-19T19:49:05.727013+00:00",
  "updated_at": "2025-07-19T19:49:05.727013+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/ibm/models/granite-3.3-8b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 476,
    "benchmark_id": "aime-2024",
    "model_id": "granite-3.3-8b-instruct",
    "score": 0.812,
    "normalized_score": 0.812,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Not specified",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.004852+00:00",
    "updated_at": "2025-07-19T19:56:12.004852+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 1793,
    "benchmark_id": "alpacaeval-2.0",
    "model_id": "granite-3.3-8b-instruct",
    "score": 0.6268,
    "normalized_score": 0.6268,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.046908+00:00",
    "updated_at": "2025-07-19T19:56:15.046908+00:00",
    "benchmark_name": "AlpacaEval 2.0"
  },
  {
    "model_benchmark_id": 1459,
    "benchmark_id": "arena-hard",
    "model_id": "granite-3.3-8b-instruct",
    "score": 0.5756,
    "normalized_score": 0.5756,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Arena Hard benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.110277+00:00",
    "updated_at": "2025-07-19T19:56:14.110277+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 1806,
    "benchmark_id": "attaq",
    "model_id": "granite-3.3-8b-instruct",
    "score": 0.885,
    "normalized_score": 0.885,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Not specified (OLMES)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.085492+00:00",
    "updated_at": "2025-07-19T19:56:15.085492+00:00",
    "benchmark_name": "AttaQ"
  },
  {
    "model_benchmark_id": 1080,
    "benchmark_id": "big-bench-hard",
    "model_id": "granite-3.3-8b-instruct",
    "score": 0.6913,
    "normalized_score": 0.6913,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "OLMES (Added regex for more efficient answer extraction)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.249459+00:00",
    "updated_at": "2025-07-19T19:56:13.249459+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 954,
    "benchmark_id": "drop",
    "model_id": "granite-3.3-8b-instruct",
    "score": 0.5936,
    "normalized_score": 0.5936,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "OLMES (Modified implementation)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.010691+00:00",
    "updated_at": "2025-07-19T19:56:13.010691+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 1003,
    "benchmark_id": "gsm8k",
    "model_id": "granite-3.3-8b-instruct",
    "score": 0.8089,
    "normalized_score": 0.8089,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "OLMES",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.095998+00:00",
    "updated_at": "2025-07-19T19:56:13.095998+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 797,
    "benchmark_id": "humaneval",
    "model_id": "granite-3.3-8b-instruct",
    "score": 0.8973,
    "normalized_score": 0.8973,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "OLMES",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.665403+00:00",
    "updated_at": "2025-07-19T19:56:12.665403+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1443,
    "benchmark_id": "humaneval+",
    "model_id": "granite-3.3-8b-instruct",
    "score": 0.8609,
    "normalized_score": 0.8609,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "OLMES",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.076877+00:00",
    "updated_at": "2025-07-19T19:56:14.076877+00:00",
    "benchmark_name": "HumanEval+"
  },
  {
    "model_benchmark_id": 625,
    "benchmark_id": "ifeval",
    "model_id": "granite-3.3-8b-instruct",
    "score": 0.7482,
    "normalized_score": 0.7482,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "OLMES",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.286600+00:00",
    "updated_at": "2025-07-19T19:56:12.286600+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 507,
    "benchmark_id": "math-500",
    "model_id": "granite-3.3-8b-instruct",
    "score": 0.6902,
    "normalized_score": 0.6902,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Not specified",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.054762+00:00",
    "updated_at": "2025-07-19T19:56:12.054762+00:00",
    "benchmark_name": "MATH-500"
  },
  {
    "model_benchmark_id": 100,
    "benchmark_id": "mmlu",
    "model_id": "granite-3.3-8b-instruct",
    "score": 0.6554,
    "normalized_score": 0.6554,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.288937+00:00",
    "updated_at": "2025-07-19T19:56:11.288937+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1803,
    "benchmark_id": "popqa",
    "model_id": "granite-3.3-8b-instruct",
    "score": 0.2617,
    "normalized_score": 0.2617,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.077308+00:00",
    "updated_at": "2025-07-19T19:56:15.077308+00:00",
    "benchmark_name": "PopQA"
  },
  {
    "model_benchmark_id": 141,
    "benchmark_id": "truthfulqa",
    "model_id": "granite-3.3-8b-instruct",
    "score": 0.6686,
    "normalized_score": 0.6686,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.360858+00:00",
    "updated_at": "2025-07-19T19:56:11.360858+00:00",
    "benchmark_name": "TruthfulQA"
  }
]

================================================
FILE: data/organizations/ibm/models/granite-3.3-8b-instruct/model.json
================================================
{
  "model_id": "granite-3.3-8b-instruct",
  "name": "Granite 3.3 8B Instruct",
  "organization_id": "ibm",
  "fine_tuned_from_model_id": null,
  "description": "Granite 3.3 models feature enhanced reasoning capabilities and support for Fill-in-the-Middle (FIM) code completion. They are built on a foundation of open-source instruction datasets with permissive licenses, alongside internally curated synthetic datasets tailored for long-context problem-solving. These models preserve the key strengths of previous Granite versions, including support for a 128K context length, strong performance in retrieval-augmented generation (RAG) and function calling, and controls for response length and originality. Granite 3.3 also delivers competitive results across general, enterprise, and safety benchmarks. Released as open source, the models are available under the Apache 2.0 license.",
  "release_date": "2025-04-16",
  "announcement_date": "2025-04-16",
  "license_id": "apache_2_0",
  "multimodal": true,
  "knowledge_cutoff": "2024-04-01",
  "param_count": 8000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.ibm.com/granite/docs/",
  "source_playground": "https://www.ibm.com/granite/playground/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct",
  "source_repo_link": "https://github.com/ibm-granite/granite-3.3-language-models",
  "source_weights_link": "https://huggingface.co/ibm-granite/granite-3.3-8b-instruct",
  "created_at": "2025-07-19T19:49:05.723958+00:00",
  "updated_at": "2025-07-19T19:49:05.723958+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/ibm/models/granite-4.0-tiny-preview/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1792,
    "benchmark_id": "alpacaeval-2.0",
    "model_id": "granite-4.0-tiny-preview",
    "score": 0.3516,
    "normalized_score": 0.3516,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-4.0-tiny-preview",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.045290+00:00",
    "updated_at": "2025-07-19T19:56:15.045290+00:00",
    "benchmark_name": "AlpacaEval 2.0"
  },
  {
    "model_benchmark_id": 1458,
    "benchmark_id": "arena-hard",
    "model_id": "granite-4.0-tiny-preview",
    "score": 0.267,
    "normalized_score": 0.267,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/ibm-granite/granite-4.0-tiny-preview",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.108397+00:00",
    "updated_at": "2025-07-19T19:56:14.108397+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 1805,
    "benchmark_id": "attaq",
    "model_id": "granite-4.0-tiny-preview",
    "score": 0.861,
    "normalized_score": 0.861,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.083480+00:00",
    "updated_at": "2025-07-19T19:56:15.083480+00:00",
    "benchmark_name": "AttaQ"
  },
  {
    "model_benchmark_id": 1079,
    "benchmark_id": "big-bench-hard",
    "model_id": "granite-4.0-tiny-preview",
    "score": 0.557,
    "normalized_score": 0.557,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.247228+00:00",
    "updated_at": "2025-07-19T19:56:13.247228+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 953,
    "benchmark_id": "drop",
    "model_id": "granite-4.0-tiny-preview",
    "score": 0.462,
    "normalized_score": 0.462,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.009229+00:00",
    "updated_at": "2025-07-19T19:56:13.009229+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 1002,
    "benchmark_id": "gsm8k",
    "model_id": "granite-4.0-tiny-preview",
    "score": 0.701,
    "normalized_score": 0.701,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.094422+00:00",
    "updated_at": "2025-07-19T19:56:13.094422+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 796,
    "benchmark_id": "humaneval",
    "model_id": "granite-4.0-tiny-preview",
    "score": 0.824,
    "normalized_score": 0.824,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.663900+00:00",
    "updated_at": "2025-07-19T19:56:12.663900+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1442,
    "benchmark_id": "humaneval+",
    "model_id": "granite-4.0-tiny-preview",
    "score": 0.783,
    "normalized_score": 0.783,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.074105+00:00",
    "updated_at": "2025-07-19T19:56:14.074105+00:00",
    "benchmark_name": "HumanEval+"
  },
  {
    "model_benchmark_id": 624,
    "benchmark_id": "ifeval",
    "model_id": "granite-4.0-tiny-preview",
    "score": 0.63,
    "normalized_score": 0.63,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.285068+00:00",
    "updated_at": "2025-07-19T19:56:12.285068+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 99,
    "benchmark_id": "mmlu",
    "model_id": "granite-4.0-tiny-preview",
    "score": 0.604,
    "normalized_score": 0.604,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.287184+00:00",
    "updated_at": "2025-07-19T19:56:11.287184+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1802,
    "benchmark_id": "popqa",
    "model_id": "granite-4.0-tiny-preview",
    "score": 0.229,
    "normalized_score": 0.229,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.075622+00:00",
    "updated_at": "2025-07-19T19:56:15.075622+00:00",
    "benchmark_name": "PopQA"
  },
  {
    "model_benchmark_id": 140,
    "benchmark_id": "truthfulqa",
    "model_id": "granite-4.0-tiny-preview",
    "score": 0.581,
    "normalized_score": 0.581,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.358910+00:00",
    "updated_at": "2025-07-19T19:56:11.358910+00:00",
    "benchmark_name": "TruthfulQA"
  }
]

================================================
FILE: data/organizations/ibm/models/granite-4.0-tiny-preview/model.json
================================================
{
  "model_id": "granite-4.0-tiny-preview",
  "name": "IBM Granite 4.0 Tiny Preview",
  "organization_id": "ibm",
  "fine_tuned_from_model_id": null,
  "description": "A preliminary version of the smallest model in the upcoming Granite 4.0 family, released May 2025. It utilizes a novel hybrid Mamba-2/Transformer, fine-grained mixture of experts (MoE) architecture (7B total parameters, 1B active at inference). This preview version is partially trained (2.5T tokens) but demonstrates significant memory efficiency and performance potential, validated for at least 128K context length without positional encoding.",
  "release_date": "2025-05-02",
  "announcement_date": "2025-05-02",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 7000000000,
  "training_tokens": 2500000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.ibm.com/granite/docs/",
  "source_playground": "https://www.ibm.com/granite/playground/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://www.ibm.com/new/announcements/ibm-granite-4-0-tiny-preview-sneak-peek",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/ibm-granite/granite-4.0-tiny-preview",
  "created_at": "2025-07-19T19:49:05.720766+00:00",
  "updated_at": "2025-07-19T19:49:05.720766+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/ibm/organization.json
================================================
{
  "organization_id": "ibm",
  "name": "IBM",
  "website": "https://ibm.com",
  "description": "Technology and consulting company",
  "country": null,
  "created_at": "2025-07-19T19:49:05.719047+00:00",
  "updated_at": "2025-07-19T19:49:05.719047+00:00"
}

================================================
FILE: data/organizations/meta/models/llama-3.1-405b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1562,
    "benchmark_id": "api-bank",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.92,
    "normalized_score": 0.92,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.382379+00:00",
    "updated_at": "2025-07-19T19:56:14.382379+00:00",
    "benchmark_name": "API-Bank"
  },
  {
    "model_benchmark_id": 16,
    "benchmark_id": "arc-c",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.969,
    "normalized_score": 0.969,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.118562+00:00",
    "updated_at": "2025-07-19T19:56:11.118562+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 848,
    "benchmark_id": "bfcl",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.885,
    "normalized_score": 0.885,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.775431+00:00",
    "updated_at": "2025-07-19T19:56:12.775431+00:00",
    "benchmark_name": "BFCL"
  },
  {
    "model_benchmark_id": 950,
    "benchmark_id": "drop",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.848,
    "normalized_score": 0.848,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2407.21783",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.004517+00:00",
    "updated_at": "2025-07-19T19:56:13.004517+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 1565,
    "benchmark_id": "gorilla-benchmark-api-bench",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.353,
    "normalized_score": 0.353,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.390263+00:00",
    "updated_at": "2025-07-19T19:56:14.390263+00:00",
    "benchmark_name": "Gorilla Benchmark API Bench"
  },
  {
    "model_benchmark_id": 291,
    "benchmark_id": "gpqa",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.507,
    "normalized_score": 0.507,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.662460+00:00",
    "updated_at": "2025-07-19T19:56:11.662460+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 988,
    "benchmark_id": "gsm8k",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.968,
    "normalized_score": 0.968,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "8-shot, CoT, em_maj1@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.071677+00:00",
    "updated_at": "2025-07-19T19:56:13.071677+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 780,
    "benchmark_id": "humaneval",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.89,
    "normalized_score": 0.89,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.636480+00:00",
    "updated_at": "2025-07-19T19:56:12.636480+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 616,
    "benchmark_id": "ifeval",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.886,
    "normalized_score": 0.886,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.270752+00:00",
    "updated_at": "2025-07-19T19:56:12.270752+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 394,
    "benchmark_id": "math",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.738,
    "normalized_score": 0.738,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, CoT, final_em",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.846056+00:00",
    "updated_at": "2025-07-19T19:56:11.846056+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1578,
    "benchmark_id": "mbpp-evalplus",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.886,
    "normalized_score": 0.886,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, base, pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.428183+00:00",
    "updated_at": "2025-07-19T19:56:14.428183+00:00",
    "benchmark_name": "MBPP EvalPlus"
  },
  {
    "model_benchmark_id": 79,
    "benchmark_id": "mmlu",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.873,
    "normalized_score": 0.873,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot, macro_avg/acc",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.249582+00:00",
    "updated_at": "2025-07-19T19:56:11.249582+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1548,
    "benchmark_id": "mmlu-(cot)",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.886,
    "normalized_score": 0.886,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, macro_avg/acc",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.339647+00:00",
    "updated_at": "2025-07-19T19:56:14.339647+00:00",
    "benchmark_name": "MMLU (CoT)"
  },
  {
    "model_benchmark_id": 186,
    "benchmark_id": "mmlu-pro",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.733,
    "normalized_score": 0.733,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot, CoT, micro_avg/acc_char",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.458814+00:00",
    "updated_at": "2025-07-19T19:56:11.458814+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1572,
    "benchmark_id": "multilingual-mgsm-(cot)",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.916,
    "normalized_score": 0.916,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, CoT, em",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.409472+00:00",
    "updated_at": "2025-07-19T19:56:14.409472+00:00",
    "benchmark_name": "Multilingual MGSM (CoT)"
  },
  {
    "model_benchmark_id": 1552,
    "benchmark_id": "multipl-e-humaneval",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.752,
    "normalized_score": 0.752,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.352505+00:00",
    "updated_at": "2025-07-19T19:56:14.352505+00:00",
    "benchmark_name": "Multipl-E HumanEval"
  },
  {
    "model_benchmark_id": 1555,
    "benchmark_id": "multipl-e-mbpp",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.657,
    "normalized_score": 0.657,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.359473+00:00",
    "updated_at": "2025-07-19T19:56:14.359473+00:00",
    "benchmark_name": "Multipl-E MBPP"
  },
  {
    "model_benchmark_id": 1568,
    "benchmark_id": "nexus",
    "model_id": "llama-3.1-405b-instruct",
    "score": 0.587,
    "normalized_score": 0.587,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, macro_avg/acc",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.398966+00:00",
    "updated_at": "2025-07-19T19:56:14.398966+00:00",
    "benchmark_name": "Nexus"
  }
]

================================================
FILE: data/organizations/meta/models/llama-3.1-405b-instruct/model.json
================================================
{
  "model_id": "llama-3.1-405b-instruct",
  "name": "Llama 3.1 405B Instruct",
  "organization_id": "meta",
  "fine_tuned_from_model_id": null,
  "description": "Llama 3.1 405B Instruct is a large language model optimized for multilingual dialogue use cases. It outperforms many available open source and closed chat models on common industry benchmarks. The model supports 8 languages and has a 128K token context length.",
  "release_date": "2024-07-23",
  "announcement_date": "2024-07-23",
  "license_id": "llama_3_1_community_license",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 405000000000,
  "training_tokens": 15000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://github.com/meta-llama/llama-models",
  "source_playground": "https://llama.meta.com/llama-downloads",
  "source_paper": null,
  "source_scorecard_blog_link": "https://ai.meta.com/blog/meta-llama-3-1/",
  "source_repo_link": "https://github.com/meta-llama/llama-models",
  "source_weights_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
  "created_at": "2025-07-19T19:49:05.585389+00:00",
  "updated_at": "2025-07-19T19:49:05.585389+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/meta/models/llama-3.1-70b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1560,
    "benchmark_id": "api-bank",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.9,
    "normalized_score": 0.9,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.378301+00:00",
    "updated_at": "2025-07-19T19:56:14.378301+00:00",
    "benchmark_name": "API-Bank"
  },
  {
    "model_benchmark_id": 14,
    "benchmark_id": "arc-c",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.948,
    "normalized_score": 0.948,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.113697+00:00",
    "updated_at": "2025-07-19T19:56:11.113697+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 846,
    "benchmark_id": "bfcl",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.848,
    "normalized_score": 0.848,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.771784+00:00",
    "updated_at": "2025-07-19T19:56:12.771784+00:00",
    "benchmark_name": "BFCL"
  },
  {
    "model_benchmark_id": 948,
    "benchmark_id": "drop",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.796,
    "normalized_score": 0.796,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2407.21783",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.001514+00:00",
    "updated_at": "2025-07-19T19:56:13.001514+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 1563,
    "benchmark_id": "gorilla-benchmark-api-bench",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.297,
    "normalized_score": 0.297,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.386457+00:00",
    "updated_at": "2025-07-19T19:56:14.386457+00:00",
    "benchmark_name": "Gorilla Benchmark API Bench"
  },
  {
    "model_benchmark_id": 288,
    "benchmark_id": "gpqa",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.417,
    "normalized_score": 0.417,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.657221+00:00",
    "updated_at": "2025-07-19T19:56:11.657221+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1556,
    "benchmark_id": "gsm-8k-(cot)",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.951,
    "normalized_score": 0.951,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "8-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.362878+00:00",
    "updated_at": "2025-07-19T19:56:14.362878+00:00",
    "benchmark_name": "GSM-8K (CoT)"
  },
  {
    "model_benchmark_id": 778,
    "benchmark_id": "humaneval",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.805,
    "normalized_score": 0.805,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.632931+00:00",
    "updated_at": "2025-07-19T19:56:12.632931+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 614,
    "benchmark_id": "ifeval",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.875,
    "normalized_score": 0.875,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.266791+00:00",
    "updated_at": "2025-07-19T19:56:12.266791+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1558,
    "benchmark_id": "math-(cot)",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.68,
    "normalized_score": 0.68,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.371489+00:00",
    "updated_at": "2025-07-19T19:56:14.371489+00:00",
    "benchmark_name": "MATH (CoT)"
  },
  {
    "model_benchmark_id": 1549,
    "benchmark_id": "mbpp-++-base-version",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.86,
    "normalized_score": 0.86,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.344061+00:00",
    "updated_at": "2025-07-19T19:56:14.344061+00:00",
    "benchmark_name": "MBPP ++ base version"
  },
  {
    "model_benchmark_id": 76,
    "benchmark_id": "mmlu",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.836,
    "normalized_score": 0.836,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.243294+00:00",
    "updated_at": "2025-07-19T19:56:11.243294+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1546,
    "benchmark_id": "mmlu-(cot)",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.86,
    "normalized_score": 0.86,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.334507+00:00",
    "updated_at": "2025-07-19T19:56:14.334507+00:00",
    "benchmark_name": "MMLU (CoT)"
  },
  {
    "model_benchmark_id": 184,
    "benchmark_id": "mmlu-pro",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.664,
    "normalized_score": 0.664,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.455089+00:00",
    "updated_at": "2025-07-19T19:56:11.455089+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1570,
    "benchmark_id": "multilingual-mgsm-(cot)",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.869,
    "normalized_score": 0.869,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Chain-of-Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.405488+00:00",
    "updated_at": "2025-07-19T19:56:14.405488+00:00",
    "benchmark_name": "Multilingual MGSM (CoT)"
  },
  {
    "model_benchmark_id": 1550,
    "benchmark_id": "multipl-e-humaneval",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.655,
    "normalized_score": 0.655,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.347431+00:00",
    "updated_at": "2025-07-19T19:56:14.347431+00:00",
    "benchmark_name": "Multipl-E HumanEval"
  },
  {
    "model_benchmark_id": 1553,
    "benchmark_id": "multipl-e-mbpp",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.62,
    "normalized_score": 0.62,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.356043+00:00",
    "updated_at": "2025-07-19T19:56:14.356043+00:00",
    "benchmark_name": "Multipl-E MBPP"
  },
  {
    "model_benchmark_id": 1566,
    "benchmark_id": "nexus",
    "model_id": "llama-3.1-70b-instruct",
    "score": 0.567,
    "normalized_score": 0.567,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.394299+00:00",
    "updated_at": "2025-07-19T19:56:14.394299+00:00",
    "benchmark_name": "Nexus"
  }
]

================================================
FILE: data/organizations/meta/models/llama-3.1-70b-instruct/model.json
================================================
{
  "model_id": "llama-3.1-70b-instruct",
  "name": "Llama 3.1 70B Instruct",
  "organization_id": "meta",
  "fine_tuned_from_model_id": null,
  "description": "Llama 3.1 70B Instruct is a large language model optimized for multilingual dialogue use cases. It outperforms many available open source and closed chat models on common industry benchmarks.",
  "release_date": "2024-07-23",
  "announcement_date": "2024-07-23",
  "license_id": "llama_3_1_community_license",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 70000000000,
  "training_tokens": 15000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://ai.meta.com/llama/",
  "source_playground": null,
  "source_paper": "https://ai.meta.com/research/publications/llama-3-open-foundation-and-fine-tuned-chat-models/",
  "source_scorecard_blog_link": "https://ai.meta.com/blog/meta-llama-3-1/",
  "source_repo_link": "https://github.com/meta-llama/llama-models",
  "source_weights_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct",
  "created_at": "2025-07-19T19:49:05.575761+00:00",
  "updated_at": "2025-07-19T19:49:05.575761+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/meta/models/llama-3.1-8b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1561,
    "benchmark_id": "api-bank",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.826,
    "normalized_score": 0.826,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.380088+00:00",
    "updated_at": "2025-07-19T19:56:14.380088+00:00",
    "benchmark_name": "API-Bank"
  },
  {
    "model_benchmark_id": 15,
    "benchmark_id": "arc-c",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.834,
    "normalized_score": 0.834,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.115810+00:00",
    "updated_at": "2025-07-19T19:56:11.115810+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 847,
    "benchmark_id": "bfcl",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.761,
    "normalized_score": 0.761,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.773659+00:00",
    "updated_at": "2025-07-19T19:56:12.773659+00:00",
    "benchmark_name": "BFCL"
  },
  {
    "model_benchmark_id": 949,
    "benchmark_id": "drop",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.595,
    "normalized_score": 0.595,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2407.21783",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.003032+00:00",
    "updated_at": "2025-07-19T19:56:13.003032+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 1564,
    "benchmark_id": "gorilla-benchmark-api-bench",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.082,
    "normalized_score": 0.082,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.388429+00:00",
    "updated_at": "2025-07-19T19:56:14.388429+00:00",
    "benchmark_name": "Gorilla Benchmark API Bench"
  },
  {
    "model_benchmark_id": 290,
    "benchmark_id": "gpqa",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.304,
    "normalized_score": 0.304,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.660952+00:00",
    "updated_at": "2025-07-19T19:56:11.660952+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1557,
    "benchmark_id": "gsm-8k-(cot)",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.845,
    "normalized_score": 0.845,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "8-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.364382+00:00",
    "updated_at": "2025-07-19T19:56:14.364382+00:00",
    "benchmark_name": "GSM-8K (CoT)"
  },
  {
    "model_benchmark_id": 779,
    "benchmark_id": "humaneval",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.726,
    "normalized_score": 0.726,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.634981+00:00",
    "updated_at": "2025-07-19T19:56:12.634981+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 615,
    "benchmark_id": "ifeval",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.804,
    "normalized_score": 0.804,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "unspecified",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.268709+00:00",
    "updated_at": "2025-07-19T19:56:12.268709+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1559,
    "benchmark_id": "math-(cot)",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.519,
    "normalized_score": 0.519,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.373274+00:00",
    "updated_at": "2025-07-19T19:56:14.373274+00:00",
    "benchmark_name": "MATH (CoT)"
  },
  {
    "model_benchmark_id": 1577,
    "benchmark_id": "mbpp-evalplus-(base)",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.728,
    "normalized_score": 0.728,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.424442+00:00",
    "updated_at": "2025-07-19T19:56:14.424442+00:00",
    "benchmark_name": "MBPP EvalPlus (base)"
  },
  {
    "model_benchmark_id": 78,
    "benchmark_id": "mmlu",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.694,
    "normalized_score": 0.694,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.247675+00:00",
    "updated_at": "2025-07-19T19:56:11.247675+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1547,
    "benchmark_id": "mmlu-(cot)",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.73,
    "normalized_score": 0.73,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.337443+00:00",
    "updated_at": "2025-07-19T19:56:14.337443+00:00",
    "benchmark_name": "MMLU (CoT)"
  },
  {
    "model_benchmark_id": 185,
    "benchmark_id": "mmlu-pro",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.483,
    "normalized_score": 0.483,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.457212+00:00",
    "updated_at": "2025-07-19T19:56:11.457212+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1571,
    "benchmark_id": "multilingual-mgsm-(cot)",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.689,
    "normalized_score": 0.689,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.407707+00:00",
    "updated_at": "2025-07-19T19:56:14.407707+00:00",
    "benchmark_name": "Multilingual MGSM (CoT)"
  },
  {
    "model_benchmark_id": 1551,
    "benchmark_id": "multipl-e-humaneval",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.508,
    "normalized_score": 0.508,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.350301+00:00",
    "updated_at": "2025-07-19T19:56:14.350301+00:00",
    "benchmark_name": "Multipl-E HumanEval"
  },
  {
    "model_benchmark_id": 1554,
    "benchmark_id": "multipl-e-mbpp",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.524,
    "normalized_score": 0.524,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.357886+00:00",
    "updated_at": "2025-07-19T19:56:14.357886+00:00",
    "benchmark_name": "Multipl-E MBPP"
  },
  {
    "model_benchmark_id": 1567,
    "benchmark_id": "nexus",
    "model_id": "llama-3.1-8b-instruct",
    "score": 0.385,
    "normalized_score": 0.385,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.396611+00:00",
    "updated_at": "2025-07-19T19:56:14.396611+00:00",
    "benchmark_name": "Nexus"
  }
]

================================================
FILE: data/organizations/meta/models/llama-3.1-8b-instruct/model.json
================================================
{
  "model_id": "llama-3.1-8b-instruct",
  "name": "Llama 3.1 8B Instruct",
  "organization_id": "meta",
  "fine_tuned_from_model_id": null,
  "description": "Llama 3.1 8B Instruct is a multilingual large language model optimized for dialogue use cases. It features a 128K context length, state-of-the-art tool use, and strong reasoning capabilities.",
  "release_date": "2024-07-23",
  "announcement_date": "2024-07-23",
  "license_id": "llama_3_1_community_license",
  "multimodal": false,
  "knowledge_cutoff": "2023-12-31",
  "param_count": 8000000000,
  "training_tokens": 15000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.llama.com/",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://ai.meta.com/blog/meta-llama-3-1/",
  "source_repo_link": "https://github.com/meta-llama/llama-models",
  "source_weights_link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct",
  "created_at": "2025-07-19T19:49:05.582878+00:00",
  "updated_at": "2025-07-19T19:49:05.582878+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/meta/models/llama-3.2-11b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1253,
    "benchmark_id": "ai2d",
    "model_id": "llama-3.2-11b-instruct",
    "score": 0.911,
    "normalized_score": 0.911,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Test accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.631448+00:00",
    "updated_at": "2025-07-19T19:56:13.631448+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 861,
    "benchmark_id": "chartqa",
    "model_id": "llama-3.2-11b-instruct",
    "score": 0.834,
    "normalized_score": 0.834,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Test, 0-shot CoT relaxed accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.801741+00:00",
    "updated_at": "2025-07-19T19:56:12.801741+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 883,
    "benchmark_id": "docvqa",
    "model_id": "llama-3.2-11b-instruct",
    "score": 0.884,
    "normalized_score": 0.884,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Test ANLS",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.839416+00:00",
    "updated_at": "2025-07-19T19:56:12.839416+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 292,
    "benchmark_id": "gpqa",
    "model_id": "llama-3.2-11b-instruct",
    "score": 0.328,
    "normalized_score": 0.328,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.663962+00:00",
    "updated_at": "2025-07-19T19:56:11.663962+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 395,
    "benchmark_id": "math",
    "model_id": "llama-3.2-11b-instruct",
    "score": 0.519,
    "normalized_score": 0.519,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.847598+00:00",
    "updated_at": "2025-07-19T19:56:11.847598+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 523,
    "benchmark_id": "mathvista",
    "model_id": "llama-3.2-11b-instruct",
    "score": 0.515,
    "normalized_score": 0.515,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Test accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.086640+00:00",
    "updated_at": "2025-07-19T19:56:12.086640+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1284,
    "benchmark_id": "mgsm",
    "model_id": "llama-3.2-11b-instruct",
    "score": 0.689,
    "normalized_score": 0.689,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.690958+00:00",
    "updated_at": "2025-07-19T19:56:13.690958+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 80,
    "benchmark_id": "mmlu",
    "model_id": "llama-3.2-11b-instruct",
    "score": 0.73,
    "normalized_score": 0.73,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Macro average accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.251362+00:00",
    "updated_at": "2025-07-19T19:56:11.251362+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 566,
    "benchmark_id": "mmmu",
    "model_id": "llama-3.2-11b-instruct",
    "score": 0.507,
    "normalized_score": 0.507,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Val, 0-shot CoT, micro avg accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.164872+00:00",
    "updated_at": "2025-07-19T19:56:12.164872+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1530,
    "benchmark_id": "mmmu-pro",
    "model_id": "llama-3.2-11b-instruct",
    "score": 0.33,
    "normalized_score": 0.33,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Test accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.288730+00:00",
    "updated_at": "2025-07-19T19:56:14.288730+00:00",
    "benchmark_name": "MMMU-Pro"
  },
  {
    "model_benchmark_id": 1580,
    "benchmark_id": "vqav2-(test)",
    "model_id": "llama-3.2-11b-instruct",
    "score": 0.752,
    "normalized_score": 0.752,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.434081+00:00",
    "updated_at": "2025-07-19T19:56:14.434081+00:00",
    "benchmark_name": "VQAv2 (test)"
  }
]

================================================
FILE: data/organizations/meta/models/llama-3.2-11b-instruct/model.json
================================================
{
  "model_id": "llama-3.2-11b-instruct",
  "name": "Llama 3.2 11B Instruct",
  "organization_id": "meta",
  "fine_tuned_from_model_id": null,
  "description": "Llama 3.2 11B Vision Instruct is an instruction-tuned multimodal large language model optimized for visual recognition, image reasoning, captioning, and answering general questions about an image. It accepts text and images as input and generates text as output.",
  "release_date": "2024-09-25",
  "announcement_date": "2024-09-25",
  "license_id": "llama_3_2_community_license",
  "multimodal": true,
  "knowledge_cutoff": "2023-12-31",
  "param_count": 10600000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
  "source_repo_link": "https://github.com/facebookresearch/llama",
  "source_weights_link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct",
  "created_at": "2025-07-19T19:49:05.588479+00:00",
  "updated_at": "2025-07-19T19:49:05.588479+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/meta/models/llama-3.2-3b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 17,
    "benchmark_id": "arc-c",
    "model_id": "llama-3.2-3b-instruct",
    "score": 0.786,
    "normalized_score": 0.786,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, acc",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.120164+00:00",
    "updated_at": "2025-07-19T19:56:11.120164+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1583,
    "benchmark_id": "bfcl-v2",
    "model_id": "llama-3.2-3b-instruct",
    "score": 0.67,
    "normalized_score": 0.67,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, acc",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.446368+00:00",
    "updated_at": "2025-07-19T19:56:14.446368+00:00",
    "benchmark_name": "BFCL v2"
  },
  {
    "model_benchmark_id": 293,
    "benchmark_id": "gpqa",
    "model_id": "llama-3.2-3b-instruct",
    "score": 0.328,
    "normalized_score": 0.328,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, acc",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.665423+00:00",
    "updated_at": "2025-07-19T19:56:11.665423+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 989,
    "benchmark_id": "gsm8k",
    "model_id": "llama-3.2-3b-instruct",
    "score": 0.777,
    "normalized_score": 0.777,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "8-shot, em_maj1@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.073210+00:00",
    "updated_at": "2025-07-19T19:56:13.073210+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 44,
    "benchmark_id": "hellaswag",
    "model_id": "llama-3.2-3b-instruct",
    "score": 0.698,
    "normalized_score": 0.698,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, acc",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.175473+00:00",
    "updated_at": "2025-07-19T19:56:11.175473+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 617,
    "benchmark_id": "ifeval",
    "model_id": "llama-3.2-3b-instruct",
    "score": 0.774,
    "normalized_score": 0.774,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Avg(Prompt/Instruction acc Loose/Strict)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.272319+00:00",
    "updated_at": "2025-07-19T19:56:12.272319+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1589,
    "benchmark_id": "infinitebench-en.mc",
    "model_id": "llama-3.2-3b-instruct",
    "score": 0.633,
    "normalized_score": 0.633,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, longbook_choice/acc",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.464298+00:00",
    "updated_at": "2025-07-19T19:56:14.464298+00:00",
    "benchmark_name": "InfiniteBench/En.MC"
  },
  {
    "model_benchmark_id": 1588,
    "benchmark_id": "infinitebench-en.qa",
    "model_id": "llama-3.2-3b-instruct",
    "score": 0.198,
    "normalized_score": 0.198,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, longbook_qa/f1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.460560+00:00",
    "updated_at": "2025-07-19T19:56:14.460560+00:00",
    "benchmark_name": "InfiniteBench/En.QA"
  },
  {
    "model_benchmark_id": 396,
    "benchmark_id": "math",
    "model_id": "llama-3.2-3b-instruct",
    "score": 0.48,
    "normalized_score": 0.48,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, final_em",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.849582+00:00",
    "updated_at": "2025-07-19T19:56:11.849582+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1285,
    "benchmark_id": "mgsm",
    "model_id": "llama-3.2-3b-instruct",
    "score": 0.582,
    "normalized_score": 0.582,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "CoT, em",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.692573+00:00",
    "updated_at": "2025-07-19T19:56:13.692573+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 81,
    "benchmark_id": "mmlu",
    "model_id": "llama-3.2-3b-instruct",
    "score": 0.634,
    "normalized_score": 0.634,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot, macro_avg/acc",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.252797+00:00",
    "updated_at": "2025-07-19T19:56:11.252797+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1569,
    "benchmark_id": "nexus",
    "model_id": "llama-3.2-3b-instruct",
    "score": 0.343,
    "normalized_score": 0.343,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, macro_avg/acc",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.401027+00:00",
    "updated_at": "2025-07-19T19:56:14.401027+00:00",
    "benchmark_name": "Nexus"
  },
  {
    "model_benchmark_id": 1590,
    "benchmark_id": "nih-multi-needle",
    "model_id": "llama-3.2-3b-instruct",
    "score": 0.847,
    "normalized_score": 0.847,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, recall",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.469424+00:00",
    "updated_at": "2025-07-19T19:56:14.469424+00:00",
    "benchmark_name": "NIH/Multi-needle"
  },
  {
    "model_benchmark_id": 1581,
    "benchmark_id": "open-rewrite",
    "model_id": "llama-3.2-3b-instruct",
    "score": 0.401,
    "normalized_score": 0.401,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, micro_avg/rougeL",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.438526+00:00",
    "updated_at": "2025-07-19T19:56:14.438526+00:00",
    "benchmark_name": "Open-rewrite"
  },
  {
    "model_benchmark_id": 1582,
    "benchmark_id": "tldr9+-(test)",
    "model_id": "llama-3.2-3b-instruct",
    "score": 0.19,
    "normalized_score": 0.19,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "1-shot, rougeL",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.443142+00:00",
    "updated_at": "2025-07-19T19:56:14.443142+00:00",
    "benchmark_name": "TLDR9+ (test)"
  }
]

================================================
FILE: data/organizations/meta/models/llama-3.2-3b-instruct/model.json
================================================
{
  "model_id": "llama-3.2-3b-instruct",
  "name": "Llama 3.2 3B Instruct",
  "organization_id": "meta",
  "fine_tuned_from_model_id": null,
  "description": "Llama 3.2 3B Instruct is a large language model that supports a context length of 128K tokens and are state-of-the-art in their class for on-device use cases like summarization, instruction following, and rewriting tasks running locally at the edge.",
  "release_date": "2024-09-25",
  "announcement_date": "2024-09-25",
  "license_id": "llama_3_2_community_license",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 3210000000,
  "training_tokens": 9000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://github.com/meta-llama/llama-models",
  "source_playground": "https://llama.meta.com/llama-downloads",
  "source_paper": null,
  "source_scorecard_blog_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
  "source_repo_link": "https://github.com/meta-llama/llama-models",
  "source_weights_link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
  "created_at": "2025-07-19T19:49:05.591372+00:00",
  "updated_at": "2025-07-19T19:49:05.591372+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/meta/models/llama-3.2-90b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1252,
    "benchmark_id": "ai2d",
    "model_id": "llama-3.2-90b-instruct",
    "score": 0.923,
    "normalized_score": 0.923,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.629735+00:00",
    "updated_at": "2025-07-19T19:56:13.629735+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 860,
    "benchmark_id": "chartqa",
    "model_id": "llama-3.2-90b-instruct",
    "score": 0.855,
    "normalized_score": 0.855,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.799861+00:00",
    "updated_at": "2025-07-19T19:56:12.799861+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 882,
    "benchmark_id": "docvqa",
    "model_id": "llama-3.2-90b-instruct",
    "score": 0.901,
    "normalized_score": 0.901,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.837654+00:00",
    "updated_at": "2025-07-19T19:56:12.837654+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 289,
    "benchmark_id": "gpqa",
    "model_id": "llama-3.2-90b-instruct",
    "score": 0.467,
    "normalized_score": 0.467,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.659193+00:00",
    "updated_at": "2025-07-19T19:56:11.659193+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1576,
    "benchmark_id": "infographicsqa",
    "model_id": "llama-3.2-90b-instruct",
    "score": 0.568,
    "normalized_score": 0.568,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.420214+00:00",
    "updated_at": "2025-07-19T19:56:14.420214+00:00",
    "benchmark_name": "InfographicsQA"
  },
  {
    "model_benchmark_id": 393,
    "benchmark_id": "math",
    "model_id": "llama-3.2-90b-instruct",
    "score": 0.68,
    "normalized_score": 0.68,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.844378+00:00",
    "updated_at": "2025-07-19T19:56:11.844378+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 522,
    "benchmark_id": "mathvista",
    "model_id": "llama-3.2-90b-instruct",
    "score": 0.573,
    "normalized_score": 0.573,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.084321+00:00",
    "updated_at": "2025-07-19T19:56:12.084321+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1283,
    "benchmark_id": "mgsm",
    "model_id": "llama-3.2-90b-instruct",
    "score": 0.869,
    "normalized_score": 0.869,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.688987+00:00",
    "updated_at": "2025-07-19T19:56:13.688987+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 77,
    "benchmark_id": "mmlu",
    "model_id": "llama-3.2-90b-instruct",
    "score": 0.86,
    "normalized_score": 0.86,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.245688+00:00",
    "updated_at": "2025-07-19T19:56:11.245688+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 565,
    "benchmark_id": "mmmu",
    "model_id": "llama-3.2-90b-instruct",
    "score": 0.603,
    "normalized_score": 0.603,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.162828+00:00",
    "updated_at": "2025-07-19T19:56:12.162828+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1529,
    "benchmark_id": "mmmu-pro",
    "model_id": "llama-3.2-90b-instruct",
    "score": 0.452,
    "normalized_score": 0.452,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.287214+00:00",
    "updated_at": "2025-07-19T19:56:14.287214+00:00",
    "benchmark_name": "MMMU-Pro"
  },
  {
    "model_benchmark_id": 908,
    "benchmark_id": "textvqa",
    "model_id": "llama-3.2-90b-instruct",
    "score": 0.735,
    "normalized_score": 0.735,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.892927+00:00",
    "updated_at": "2025-07-19T19:56:12.892927+00:00",
    "benchmark_name": "TextVQA"
  },
  {
    "model_benchmark_id": 1573,
    "benchmark_id": "vqav2",
    "model_id": "llama-3.2-90b-instruct",
    "score": 0.781,
    "normalized_score": 0.781,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.412800+00:00",
    "updated_at": "2025-07-19T19:56:14.412800+00:00",
    "benchmark_name": "VQAv2"
  }
]


================================================
FILE: data/organizations/meta/models/llama-3.2-90b-instruct/model.json
================================================
{
  "model_id": "llama-3.2-90b-instruct",
  "name": "Llama 3.2 90B Instruct",
  "organization_id": "meta",
  "fine_tuned_from_model_id": null,
  "description": "Llama 3.2 90B is a large multimodal language model optimized for visual recognition, image reasoning, and captioning tasks. It supports a context length of 128,000 tokens and is the largest model in the Llama 3.2 family (whose lightweight 1B and 3B variants are the ones designed for deployment on edge and mobile devices), offering state-of-the-art performance in image understanding and generative tasks.",
  "release_date": "2024-09-25",
  "announcement_date": "2024-09-25",
  "license_id": "llama3_2",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 90000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
  "source_repo_link": "https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct",
  "source_weights_link": "https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct",
  "created_at": "2025-07-19T19:49:05.579590+00:00",
  "updated_at": "2025-07-19T19:49:05.579590+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/meta/models/llama-3.3-70b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1584,
    "benchmark_id": "bfcl-v2",
    "model_id": "llama-3.3-70b-instruct",
    "score": 0.773,
    "normalized_score": 0.773,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.448863+00:00",
    "updated_at": "2025-07-19T19:56:14.448863+00:00",
    "benchmark_name": "BFCL v2"
  },
  {
    "model_benchmark_id": 296,
    "benchmark_id": "gpqa",
    "model_id": "llama-3.3-70b-instruct",
    "score": 0.505,
    "normalized_score": 0.505,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.669923+00:00",
    "updated_at": "2025-07-19T19:56:11.669923+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 781,
    "benchmark_id": "humaneval",
    "model_id": "llama-3.3-70b-instruct",
    "score": 0.884,
    "normalized_score": 0.884,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.637990+00:00",
    "updated_at": "2025-07-19T19:56:12.637990+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 618,
    "benchmark_id": "ifeval",
    "model_id": "llama-3.3-70b-instruct",
    "score": 0.921,
    "normalized_score": 0.921,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.274109+00:00",
    "updated_at": "2025-07-19T19:56:12.274109+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 399,
    "benchmark_id": "math",
    "model_id": "llama-3.3-70b-instruct",
    "score": 0.77,
    "normalized_score": 0.77,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.854268+00:00",
    "updated_at": "2025-07-19T19:56:11.854268+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1579,
    "benchmark_id": "mbpp-evalplus",
    "model_id": "llama-3.3-70b-instruct",
    "score": 0.876,
    "normalized_score": 0.876,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.429699+00:00",
    "updated_at": "2025-07-19T19:56:14.429699+00:00",
    "benchmark_name": "MBPP EvalPlus"
  },
  {
    "model_benchmark_id": 1288,
    "benchmark_id": "mgsm",
    "model_id": "llama-3.3-70b-instruct",
    "score": 0.911,
    "normalized_score": 0.911,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.697414+00:00",
    "updated_at": "2025-07-19T19:56:13.697414+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 84,
    "benchmark_id": "mmlu",
    "model_id": "llama-3.3-70b-instruct",
    "score": 0.86,
    "normalized_score": 0.86,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.259963+00:00",
    "updated_at": "2025-07-19T19:56:11.259963+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 189,
    "benchmark_id": "mmlu-pro",
    "model_id": "llama-3.3-70b-instruct",
    "score": 0.689,
    "normalized_score": 0.689,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.463251+00:00",
    "updated_at": "2025-07-19T19:56:11.463251+00:00",
    "benchmark_name": "MMLU-Pro"
  }
]


================================================
FILE: data/organizations/meta/models/llama-3.3-70b-instruct/model.json
================================================
{
  "model_id": "llama-3.3-70b-instruct",
  "name": "Llama 3.3 70B Instruct",
  "organization_id": "meta",
  "fine_tuned_from_model_id": null,
  "description": "Llama 3.3 is a multilingual large language model optimized for dialogue use cases across multiple languages. It is a pretrained and instruction-tuned generative model with 70 billion parameters, outperforming many open-source and closed chat models on common industry benchmarks. Llama 3.3 supports a context length of 128,000 tokens and is designed for commercial and research use in multiple languages.",
  "release_date": "2024-12-06",
  "announcement_date": "2024-12-06",
  "license_id": "llama_3_3_community_license_agreement",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 70000000000,
  "training_tokens": 15000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct",
  "source_playground": "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct",
  "source_paper": null,
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/meta-llama/llama-models/blob/main/models/llama3_3/MODEL_CARD.md",
  "source_weights_link": "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct",
  "created_at": "2025-07-19T19:49:05.603412+00:00",
  "updated_at": "2025-07-19T19:49:05.603412+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/meta/models/llama-4-maverick/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 862,
    "benchmark_id": "chartqa",
    "model_id": "llama-4-maverick",
    "score": 0.9,
    "normalized_score": 0.9,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.803334+00:00",
    "updated_at": "2025-07-19T19:56:12.803334+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 884,
    "benchmark_id": "docvqa",
    "model_id": "llama-4-maverick",
    "score": 0.944,
    "normalized_score": 0.944,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.841331+00:00",
    "updated_at": "2025-07-19T19:56:12.841331+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 294,
    "benchmark_id": "gpqa",
    "model_id": "llama-4-maverick",
    "score": 0.698,
    "normalized_score": 0.698,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.666983+00:00",
    "updated_at": "2025-07-19T19:56:11.666983+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1115,
    "benchmark_id": "livecodebench",
    "model_id": "llama-4-maverick",
    "score": 0.434,
    "normalized_score": 0.434,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.326624+00:00",
    "updated_at": "2025-07-19T19:56:13.326624+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 397,
    "benchmark_id": "math",
    "model_id": "llama-4-maverick",
    "score": 0.612,
    "normalized_score": 0.612,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "4-shot em_maj1@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.851038+00:00",
    "updated_at": "2025-07-19T19:56:11.851038+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 524,
    "benchmark_id": "mathvista",
    "model_id": "llama-4-maverick",
    "score": 0.737,
    "normalized_score": 0.737,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.088308+00:00",
    "updated_at": "2025-07-19T19:56:12.088308+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1179,
    "benchmark_id": "mbpp",
    "model_id": "llama-4-maverick",
    "score": 0.776,
    "normalized_score": 0.776,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.485323+00:00",
    "updated_at": "2025-07-19T19:56:13.485323+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1286,
    "benchmark_id": "mgsm",
    "model_id": "llama-4-maverick",
    "score": 0.923,
    "normalized_score": 0.923,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.694238+00:00",
    "updated_at": "2025-07-19T19:56:13.694238+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 82,
    "benchmark_id": "mmlu",
    "model_id": "llama-4-maverick",
    "score": 0.855,
    "normalized_score": 0.855,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot macro_avg/acc_char",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.254352+00:00",
    "updated_at": "2025-07-19T19:56:11.254352+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 187,
    "benchmark_id": "mmlu-pro",
    "model_id": "llama-4-maverick",
    "score": 0.805,
    "normalized_score": 0.805,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.460210+00:00",
    "updated_at": "2025-07-19T19:56:11.460210+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 567,
    "benchmark_id": "mmmu",
    "model_id": "llama-4-maverick",
    "score": 0.734,
    "normalized_score": 0.734,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.167124+00:00",
    "updated_at": "2025-07-19T19:56:12.167124+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1531,
    "benchmark_id": "mmmu-pro",
    "model_id": "llama-4-maverick",
    "score": 0.596,
    "normalized_score": 0.596,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.290598+00:00",
    "updated_at": "2025-07-19T19:56:14.290598+00:00",
    "benchmark_name": "MMMU-Pro"
  },
  {
    "model_benchmark_id": 1591,
    "benchmark_id": "tydiqa",
    "model_id": "llama-4-maverick",
    "score": 0.317,
    "normalized_score": 0.317,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "1-shot average/f1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.475429+00:00",
    "updated_at": "2025-07-19T19:56:14.475429+00:00",
    "benchmark_name": "TydiQA"
  }
]

================================================
FILE: data/organizations/meta/models/llama-4-maverick/model.json
================================================
{
  "model_id": "llama-4-maverick",
  "name": "Llama 4 Maverick",
  "organization_id": "meta",
  "fine_tuned_from_model_id": null,
  "description": "Llama 4 Maverick is a natively multimodal model capable of processing both text and images. It features a 17 billion active parameter mixture-of-experts (MoE) architecture with 128 experts, supporting a wide range of multimodal tasks such as conversational interaction, image analysis, and code generation. The model includes a 1 million token context window.",
  "release_date": "2025-04-05",
  "announcement_date": "2025-04-05",
  "license_id": "llama_4_community_license_agreement",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 400000000000,
  "training_tokens": 22000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
  "source_playground": "https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct",
  "source_paper": null,
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/meta-llama/llama-models/tree/main/models/llama4",
  "source_weights_link": "https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct",
  "created_at": "2025-07-19T19:49:05.595636+00:00",
  "updated_at": "2025-07-19T19:49:05.595636+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/meta/models/llama-4-scout/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 863,
    "benchmark_id": "chartqa",
    "model_id": "llama-4-scout",
    "score": 0.888,
    "normalized_score": 0.888,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.804916+00:00",
    "updated_at": "2025-07-19T19:56:12.804916+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 885,
    "benchmark_id": "docvqa",
    "model_id": "llama-4-scout",
    "score": 0.944,
    "normalized_score": 0.944,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot (ANLS)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.842838+00:00",
    "updated_at": "2025-07-19T19:56:12.842838+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 295,
    "benchmark_id": "gpqa",
    "model_id": "llama-4-scout",
    "score": 0.572,
    "normalized_score": 0.572,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot (accuracy)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.668436+00:00",
    "updated_at": "2025-07-19T19:56:11.668436+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1116,
    "benchmark_id": "livecodebench",
    "model_id": "llama-4-scout",
    "score": 0.328,
    "normalized_score": 0.328,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.328074+00:00",
    "updated_at": "2025-07-19T19:56:13.328074+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 398,
    "benchmark_id": "math",
    "model_id": "llama-4-scout",
    "score": 0.503,
    "normalized_score": 0.503,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "4-shot em_maj1@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.852669+00:00",
    "updated_at": "2025-07-19T19:56:11.852669+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 525,
    "benchmark_id": "mathvista",
    "model_id": "llama-4-scout",
    "score": 0.707,
    "normalized_score": 0.707,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.089981+00:00",
    "updated_at": "2025-07-19T19:56:12.089981+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1180,
    "benchmark_id": "mbpp",
    "model_id": "llama-4-scout",
    "score": 0.678,
    "normalized_score": 0.678,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.487376+00:00",
    "updated_at": "2025-07-19T19:56:13.487376+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1287,
    "benchmark_id": "mgsm",
    "model_id": "llama-4-scout",
    "score": 0.906,
    "normalized_score": 0.906,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot (average/em)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.695659+00:00",
    "updated_at": "2025-07-19T19:56:13.695659+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 83,
    "benchmark_id": "mmlu",
    "model_id": "llama-4-scout",
    "score": 0.796,
    "normalized_score": 0.796,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot macro_avg/acc_char",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.258246+00:00",
    "updated_at": "2025-07-19T19:56:11.258246+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 188,
    "benchmark_id": "mmlu-pro",
    "model_id": "llama-4-scout",
    "score": 0.743,
    "normalized_score": 0.743,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot (macro_avg/acc)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.461726+00:00",
    "updated_at": "2025-07-19T19:56:11.461726+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 568,
    "benchmark_id": "mmmu",
    "model_id": "llama-4-scout",
    "score": 0.694,
    "normalized_score": 0.694,
    "is_self_reported": true,
    "self_reported_source_link": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.169227+00:00",
    "updated_at": "2025-07-19T19:56:12.169227+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1592,
    "benchmark_id": "tydiqa",
    "model_id": "llama-4-scout",
    "score": 0.315,
    "normalized_score": 0.315,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "1-shot average/f1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.477364+00:00",
    "updated_at": "2025-07-19T19:56:14.477364+00:00",
    "benchmark_name": "TydiQA"
  }
]

================================================
FILE: data/organizations/meta/models/llama-4-scout/model.json
================================================
{
  "model_id": "llama-4-scout",
  "name": "Llama 4 Scout",
  "organization_id": "meta",
  "fine_tuned_from_model_id": null,
  "description": "Llama 4 Scout is a natively multimodal model capable of processing both text and images. It features a 17 billion activated parameter (109B total) mixture-of-experts (MoE) architecture with 16 experts, supporting a wide range of multimodal tasks such as conversational interaction, image analysis, and code generation. The model includes a 10 million token context window.",
  "release_date": "2025-04-05",
  "announcement_date": "2025-04-05",
  "license_id": "llama_4_community_license_agreement",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 109000000000,
  "training_tokens": 40000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
  "source_playground": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct",
  "source_paper": null,
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/meta-llama/llama-models/tree/main/models/llama4",
  "source_weights_link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct",
  "created_at": "2025-07-19T19:49:05.599841+00:00",
  "updated_at": "2025-07-19T19:49:05.599841+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/meta/organization.json
================================================
{
  "organization_id": "meta",
  "name": "Meta",
  "website": "https://meta.com",
  "description": "Social media company with AI research",
  "country": "US",
  "created_at": "2025-07-19T19:49:05.572641+00:00",
  "updated_at": "2025-07-19T19:49:05.572641+00:00"
}


================================================
FILE: data/organizations/microsoft/models/phi-3.5-mini-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 13,
    "benchmark_id": "arc-c",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.846,
    "normalized_score": 0.846,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.111398+00:00",
    "updated_at": "2025-07-19T19:56:11.111398+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1448,
    "benchmark_id": "arena-hard",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.37,
    "normalized_score": 0.37,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.088299+00:00",
    "updated_at": "2025-07-19T19:56:14.088299+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 1078,
    "benchmark_id": "big-bench-hard",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.69,
    "normalized_score": 0.69,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot chain-of-thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.245591+00:00",
    "updated_at": "2025-07-19T19:56:13.245591+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 1025,
    "benchmark_id": "boolq",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.78,
    "normalized_score": 0.78,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "2-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.132882+00:00",
    "updated_at": "2025-07-19T19:56:13.132882+00:00",
    "benchmark_name": "BoolQ"
  },
  {
    "model_benchmark_id": 1504,
    "benchmark_id": "govreport",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.259,
    "normalized_score": 0.259,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.222697+00:00",
    "updated_at": "2025-07-19T19:56:14.222697+00:00",
    "benchmark_name": "GovReport"
  },
  {
    "model_benchmark_id": 285,
    "benchmark_id": "gpqa",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.304,
    "normalized_score": 0.304,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot chain-of-thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.651230+00:00",
    "updated_at": "2025-07-19T19:56:11.651230+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 987,
    "benchmark_id": "gsm8k",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.862,
    "normalized_score": 0.862,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "8-shot chain-of-thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.070240+00:00",
    "updated_at": "2025-07-19T19:56:13.070240+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 43,
    "benchmark_id": "hellaswag",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.694,
    "normalized_score": 0.694,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.173447+00:00",
    "updated_at": "2025-07-19T19:56:11.173447+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 777,
    "benchmark_id": "humaneval",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.628,
    "normalized_score": 0.628,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.631199+00:00",
    "updated_at": "2025-07-19T19:56:12.631199+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 392,
    "benchmark_id": "math",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.485,
    "normalized_score": 0.485,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot chain-of-thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.842901+00:00",
    "updated_at": "2025-07-19T19:56:11.842901+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1178,
    "benchmark_id": "mbpp",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.696,
    "normalized_score": 0.696,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.481045+00:00",
    "updated_at": "2025-07-19T19:56:13.481045+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1494,
    "benchmark_id": "mega-mlqa",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.617,
    "normalized_score": 0.617,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.191909+00:00",
    "updated_at": "2025-07-19T19:56:14.191909+00:00",
    "benchmark_name": "MEGA MLQA"
  },
  {
    "model_benchmark_id": 1496,
    "benchmark_id": "mega-tydi-qa",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.622,
    "normalized_score": 0.622,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.197084+00:00",
    "updated_at": "2025-07-19T19:56:14.197084+00:00",
    "benchmark_name": "MEGA TyDi QA"
  },
  {
    "model_benchmark_id": 1498,
    "benchmark_id": "mega-udpos",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.465,
    "normalized_score": 0.465,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.203616+00:00",
    "updated_at": "2025-07-19T19:56:14.203616+00:00",
    "benchmark_name": "MEGA UDPOS"
  },
  {
    "model_benchmark_id": 1500,
    "benchmark_id": "mega-xcopa",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.631,
    "normalized_score": 0.631,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.210364+00:00",
    "updated_at": "2025-07-19T19:56:14.210364+00:00",
    "benchmark_name": "MEGA XCOPA"
  },
  {
    "model_benchmark_id": 1502,
    "benchmark_id": "mega-xstorycloze",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.735,
    "normalized_score": 0.735,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.217597+00:00",
    "updated_at": "2025-07-19T19:56:14.217597+00:00",
    "benchmark_name": "MEGA XStoryCloze"
  },
  {
    "model_benchmark_id": 1282,
    "benchmark_id": "mgsm",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.479,
    "normalized_score": 0.479,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot chain-of-thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.687534+00:00",
    "updated_at": "2025-07-19T19:56:13.687534+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 75,
    "benchmark_id": "mmlu",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.69,
    "normalized_score": 0.69,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.240966+00:00",
    "updated_at": "2025-07-19T19:56:11.240966+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 180,
    "benchmark_id": "mmlu-pro",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.474,
    "normalized_score": 0.474,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot chain-of-thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.447960+00:00",
    "updated_at": "2025-07-19T19:56:11.450171+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1476,
    "benchmark_id": "mmmlu",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.554,
    "normalized_score": 0.554,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.148935+00:00",
    "updated_at": "2025-07-19T19:56:14.148935+00:00",
    "benchmark_name": "MMMLU"
  },
  {
    "model_benchmark_id": 1471,
    "benchmark_id": "openbookqa",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.792,
    "normalized_score": 0.792,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.136354+00:00",
    "updated_at": "2025-07-19T19:56:14.136354+00:00",
    "benchmark_name": "OpenBookQA"
  },
  {
    "model_benchmark_id": 1034,
    "benchmark_id": "piqa",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.81,
    "normalized_score": 0.81,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.154444+00:00",
    "updated_at": "2025-07-19T19:56:13.154444+00:00",
    "benchmark_name": "PIQA"
  },
  {
    "model_benchmark_id": 1488,
    "benchmark_id": "qasper",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.419,
    "normalized_score": 0.419,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.173290+00:00",
    "updated_at": "2025-07-19T19:56:14.173290+00:00",
    "benchmark_name": "Qasper"
  },
  {
    "model_benchmark_id": 1506,
    "benchmark_id": "qmsum",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.213,
    "normalized_score": 0.213,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.228389+00:00",
    "updated_at": "2025-07-19T19:56:14.228389+00:00",
    "benchmark_name": "QMSum"
  },
  {
    "model_benchmark_id": 1492,
    "benchmark_id": "repoqa",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.77,
    "normalized_score": 0.77,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "average",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.186426+00:00",
    "updated_at": "2025-07-19T19:56:14.186426+00:00",
    "benchmark_name": "RepoQA"
  },
  {
    "model_benchmark_id": 1490,
    "benchmark_id": "ruler",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.841,
    "normalized_score": 0.841,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "128k",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.179307+00:00",
    "updated_at": "2025-07-19T19:56:14.179307+00:00",
    "benchmark_name": "RULER"
  },
  {
    "model_benchmark_id": 1043,
    "benchmark_id": "social-iqa",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.747,
    "normalized_score": 0.747,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.177860+00:00",
    "updated_at": "2025-07-19T19:56:13.177860+00:00",
    "benchmark_name": "Social IQa"
  },
  {
    "model_benchmark_id": 825,
    "benchmark_id": "squality",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.243,
    "normalized_score": 0.243,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.722570+00:00",
    "updated_at": "2025-07-19T19:56:12.722570+00:00",
    "benchmark_name": "SQuALITY"
  },
  {
    "model_benchmark_id": 1508,
    "benchmark_id": "summscreenfd",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.16,
    "normalized_score": 0.16,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.234498+00:00",
    "updated_at": "2025-07-19T19:56:14.234498+00:00",
    "benchmark_name": "SummScreenFD"
  },
  {
    "model_benchmark_id": 134,
    "benchmark_id": "truthfulqa",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.64,
    "normalized_score": 0.64,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.346508+00:00",
    "updated_at": "2025-07-19T19:56:11.346508+00:00",
    "benchmark_name": "TruthfulQA"
  },
  {
    "model_benchmark_id": 1063,
    "benchmark_id": "winogrande",
    "model_id": "phi-3.5-mini-instruct",
    "score": 0.685,
    "normalized_score": 0.685,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.217697+00:00",
    "updated_at": "2025-07-19T19:56:13.217697+00:00",
    "benchmark_name": "Winogrande"
  }
]

================================================
FILE: data/organizations/microsoft/models/phi-3.5-mini-instruct/model.json
================================================
{
  "model_id": "phi-3.5-mini-instruct",
  "name": "Phi-3.5-mini-instruct",
  "organization_id": "microsoft",
  "fine_tuned_from_model_id": null,
  "description": "Phi-3.5-mini-instruct is a 3.8B-parameter model that supports up to 128K context tokens, with improved multilingual capabilities across over 20 languages. It underwent additional training and safety post-training to enhance instruction-following, reasoning, math, and code generation. Ideal for environments with memory or latency constraints, it uses an MIT license.",
  "release_date": "2024-08-23",
  "announcement_date": "2024-08-23",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 3800000000,
  "training_tokens": 3400000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
  "source_playground": null,
  "source_paper": "https://arxiv.org/abs/2404.14219",
  "source_scorecard_blog_link": "https://techcommunity.microsoft.com/blog/azure-ai-services-blog/discover-the-new-multi-lingual-high-quality-phi-3-5-slms/4225280",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
  "created_at": "2025-07-19T19:49:05.559796+00:00",
  "updated_at": "2025-07-19T19:49:05.559796+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/microsoft/models/phi-3.5-moe-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 12,
    "benchmark_id": "arc-c",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.91,
    "normalized_score": 0.91,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.108027+00:00",
    "updated_at": "2025-07-19T19:56:11.108027+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1447,
    "benchmark_id": "arena-hard",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.379,
    "normalized_score": 0.379,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.086453+00:00",
    "updated_at": "2025-07-19T19:56:14.086453+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 1077,
    "benchmark_id": "big-bench-hard",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.791,
    "normalized_score": 0.791,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot chain-of-thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.244054+00:00",
    "updated_at": "2025-07-19T19:56:13.244054+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 1024,
    "benchmark_id": "boolq",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.846,
    "normalized_score": 0.846,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "2-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.130867+00:00",
    "updated_at": "2025-07-19T19:56:13.130867+00:00",
    "benchmark_name": "BoolQ"
  },
  {
    "model_benchmark_id": 1503,
    "benchmark_id": "govreport",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.264,
    "normalized_score": 0.264,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.221191+00:00",
    "updated_at": "2025-07-19T19:56:14.221191+00:00",
    "benchmark_name": "GovReport"
  },
  {
    "model_benchmark_id": 284,
    "benchmark_id": "gpqa",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.368,
    "normalized_score": 0.368,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot chain-of-thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.649286+00:00",
    "updated_at": "2025-07-19T19:56:11.649286+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 986,
    "benchmark_id": "gsm8k",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.887,
    "normalized_score": 0.887,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "8-shot chain-of-thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.068601+00:00",
    "updated_at": "2025-07-19T19:56:13.068601+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 42,
    "benchmark_id": "hellaswag",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.838,
    "normalized_score": 0.838,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.171621+00:00",
    "updated_at": "2025-07-19T19:56:11.171621+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 776,
    "benchmark_id": "humaneval",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.707,
    "normalized_score": 0.707,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.629465+00:00",
    "updated_at": "2025-07-19T19:56:12.629465+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 391,
    "benchmark_id": "math",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.595,
    "normalized_score": 0.595,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot chain-of-thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.841295+00:00",
    "updated_at": "2025-07-19T19:56:11.841295+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1177,
    "benchmark_id": "mbpp",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.808,
    "normalized_score": 0.808,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.479387+00:00",
    "updated_at": "2025-07-19T19:56:13.479387+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1493,
    "benchmark_id": "mega-mlqa",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.653,
    "normalized_score": 0.653,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.190086+00:00",
    "updated_at": "2025-07-19T19:56:14.190086+00:00",
    "benchmark_name": "MEGA MLQA"
  },
  {
    "model_benchmark_id": 1495,
    "benchmark_id": "mega-tydi-qa",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.671,
    "normalized_score": 0.671,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.195123+00:00",
    "updated_at": "2025-07-19T19:56:14.195123+00:00",
    "benchmark_name": "MEGA TyDi QA"
  },
  {
    "model_benchmark_id": 1497,
    "benchmark_id": "mega-udpos",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.604,
    "normalized_score": 0.604,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.201497+00:00",
    "updated_at": "2025-07-19T19:56:14.201497+00:00",
    "benchmark_name": "MEGA UDPOS"
  },
  {
    "model_benchmark_id": 1499,
    "benchmark_id": "mega-xcopa",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.766,
    "normalized_score": 0.766,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.208476+00:00",
    "updated_at": "2025-07-19T19:56:14.208476+00:00",
    "benchmark_name": "MEGA XCOPA"
  },
  {
    "model_benchmark_id": 1501,
    "benchmark_id": "mega-xstorycloze",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.828,
    "normalized_score": 0.828,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.214764+00:00",
    "updated_at": "2025-07-19T19:56:14.214764+00:00",
    "benchmark_name": "MEGA XStoryCloze"
  },
  {
    "model_benchmark_id": 1281,
    "benchmark_id": "mgsm",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.587,
    "normalized_score": 0.587,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot chain-of-thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.686017+00:00",
    "updated_at": "2025-07-19T19:56:13.686017+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 74,
    "benchmark_id": "mmlu",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.789,
    "normalized_score": 0.789,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.239087+00:00",
    "updated_at": "2025-07-19T19:56:11.239087+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 178,
    "benchmark_id": "mmlu-pro",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.453,
    "normalized_score": 0.453,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.444580+00:00",
    "updated_at": "2025-07-19T19:56:11.446076+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1475,
    "benchmark_id": "mmmlu",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.699,
    "normalized_score": 0.699,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.147234+00:00",
    "updated_at": "2025-07-19T19:56:14.147234+00:00",
    "benchmark_name": "MMMLU"
  },
  {
    "model_benchmark_id": 1470,
    "benchmark_id": "openbookqa",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.896,
    "normalized_score": 0.896,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.134275+00:00",
    "updated_at": "2025-07-19T19:56:14.134275+00:00",
    "benchmark_name": "OpenBookQA"
  },
  {
    "model_benchmark_id": 1033,
    "benchmark_id": "piqa",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.886,
    "normalized_score": 0.886,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.152199+00:00",
    "updated_at": "2025-07-19T19:56:13.152199+00:00",
    "benchmark_name": "PIQA"
  },
  {
    "model_benchmark_id": 1487,
    "benchmark_id": "qasper",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.4,
    "normalized_score": 0.4,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.171579+00:00",
    "updated_at": "2025-07-19T19:56:14.171579+00:00",
    "benchmark_name": "Qasper"
  },
  {
    "model_benchmark_id": 1505,
    "benchmark_id": "qmsum",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.199,
    "normalized_score": 0.199,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.226358+00:00",
    "updated_at": "2025-07-19T19:56:14.226358+00:00",
    "benchmark_name": "QMSum"
  },
  {
    "model_benchmark_id": 1491,
    "benchmark_id": "repoqa",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.85,
    "normalized_score": 0.85,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "average",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.184432+00:00",
    "updated_at": "2025-07-19T19:56:14.184432+00:00",
    "benchmark_name": "RepoQA"
  },
  {
    "model_benchmark_id": 1489,
    "benchmark_id": "ruler",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.871,
    "normalized_score": 0.871,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "long context (128K) evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.177557+00:00",
    "updated_at": "2025-07-19T19:56:14.177557+00:00",
    "benchmark_name": "RULER"
  },
  {
    "model_benchmark_id": 1042,
    "benchmark_id": "social-iqa",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.78,
    "normalized_score": 0.78,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.176106+00:00",
    "updated_at": "2025-07-19T19:56:13.176106+00:00",
    "benchmark_name": "Social IQa"
  },
  {
    "model_benchmark_id": 824,
    "benchmark_id": "squality",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.241,
    "normalized_score": 0.241,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.720914+00:00",
    "updated_at": "2025-07-19T19:56:12.720914+00:00",
    "benchmark_name": "SQuALITY"
  },
  {
    "model_benchmark_id": 1507,
    "benchmark_id": "summscreenfd",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.169,
    "normalized_score": 0.169,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.232655+00:00",
    "updated_at": "2025-07-19T19:56:14.232655+00:00",
    "benchmark_name": "SummScreenFD"
  },
  {
    "model_benchmark_id": 133,
    "benchmark_id": "truthfulqa",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.775,
    "normalized_score": 0.775,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.344788+00:00",
    "updated_at": "2025-07-19T19:56:11.344788+00:00",
    "benchmark_name": "TruthfulQA"
  },
  {
    "model_benchmark_id": 1062,
    "benchmark_id": "winogrande",
    "model_id": "phi-3.5-moe-instruct",
    "score": 0.813,
    "normalized_score": 0.813,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.215763+00:00",
    "updated_at": "2025-07-19T19:56:13.215763+00:00",
    "benchmark_name": "Winogrande"
  }
]

================================================
FILE: data/organizations/microsoft/models/phi-3.5-moe-instruct/model.json
================================================
{
  "model_id": "phi-3.5-moe-instruct",
  "name": "Phi-3.5-MoE-instruct",
  "organization_id": "microsoft",
  "fine_tuned_from_model_id": null,
  "description": "Phi-3.5-MoE-instruct is a mixture-of-experts model with ~42B total parameters (6.6B active) and a 128K context window. It excels at reasoning, math, coding, and multilingual tasks, outperforming larger dense models in many benchmarks. It underwent a thorough safety post-training process (SFT + DPO) and is licensed under MIT. This model is ideal for scenarios where efficiency and high performance are both required, particularly in multi-lingual or reasoning-intensive tasks.",
  "release_date": "2024-08-23",
  "announcement_date": "2024-08-23",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 60000000000,
  "training_tokens": 4900000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
  "source_playground": null,
  "source_paper": "https://arxiv.org/abs/2404.14219",
  "source_scorecard_blog_link": "https://techcommunity.microsoft.com/blog/azure-ai-services-blog/discover-the-new-multi-lingual-high-quality-phi-3-5-slms/4225280",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
  "created_at": "2025-07-19T19:49:05.555819+00:00",
  "updated_at": "2025-07-19T19:49:05.555819+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/microsoft/models/phi-3.5-vision-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1250,
    "benchmark_id": "ai2d",
    "model_id": "phi-3.5-vision-instruct",
    "score": 0.781,
    "normalized_score": 0.781,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.626694+00:00",
    "updated_at": "2025-07-19T19:56:13.626694+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 858,
    "benchmark_id": "chartqa",
    "model_id": "phi-3.5-vision-instruct",
    "score": 0.818,
    "normalized_score": 0.818,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.795942+00:00",
    "updated_at": "2025-07-19T19:56:12.795942+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 1520,
    "benchmark_id": "intergps",
    "model_id": "phi-3.5-vision-instruct",
    "score": 0.363,
    "normalized_score": 0.363,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.261813+00:00",
    "updated_at": "2025-07-19T19:56:14.261813+00:00",
    "benchmark_name": "InterGPS"
  },
  {
    "model_benchmark_id": 520,
    "benchmark_id": "mathvista",
    "model_id": "phi-3.5-vision-instruct",
    "score": 0.439,
    "normalized_score": 0.439,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.080462+00:00",
    "updated_at": "2025-07-19T19:56:12.080462+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1509,
    "benchmark_id": "mmbench",
    "model_id": "phi-3.5-vision-instruct",
    "score": 0.819,
    "normalized_score": 0.819,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.238017+00:00",
    "updated_at": "2025-07-19T19:56:14.238017+00:00",
    "benchmark_name": "MMBench"
  },
  {
    "model_benchmark_id": 563,
    "benchmark_id": "mmmu",
    "model_id": "phi-3.5-vision-instruct",
    "score": 0.43,
    "normalized_score": 0.43,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.158730+00:00",
    "updated_at": "2025-07-19T19:56:12.158730+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1522,
    "benchmark_id": "pope",
    "model_id": "phi-3.5-vision-instruct",
    "score": 0.861,
    "normalized_score": 0.861,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.266959+00:00",
    "updated_at": "2025-07-19T19:56:14.266959+00:00",
    "benchmark_name": "POPE"
  },
  {
    "model_benchmark_id": 1519,
    "benchmark_id": "scienceqa",
    "model_id": "phi-3.5-vision-instruct",
    "score": 0.913,
    "normalized_score": 0.913,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.258220+00:00",
    "updated_at": "2025-07-19T19:56:14.258220+00:00",
    "benchmark_name": "ScienceQA"
  },
  {
    "model_benchmark_id": 906,
    "benchmark_id": "textvqa",
    "model_id": "phi-3.5-vision-instruct",
    "score": 0.72,
    "normalized_score": 0.72,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.888892+00:00",
    "updated_at": "2025-07-19T19:56:12.888892+00:00",
    "benchmark_name": "TextVQA"
  }
]

================================================
FILE: data/organizations/microsoft/models/phi-3.5-vision-instruct/model.json
================================================
{
  "model_id": "phi-3.5-vision-instruct",
  "name": "Phi-3.5-vision-instruct",
  "organization_id": "microsoft",
  "fine_tuned_from_model_id": null,
  "description": "Phi-3.5-vision-instruct is a 4.2B-parameter open multimodal model with up to 128K context tokens. It emphasizes multi-frame image understanding and reasoning, boosting performance on single-image benchmarks while enabling multi-image comparison, summarization, and even video analysis. The model underwent safety post-training for improved instruction-following, alignment, and robust handling of visual and text inputs, and is released under the MIT license.",
  "release_date": "2024-08-23",
  "announcement_date": "2024-08-23",
  "license_id": "mit",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 4200000000,
  "training_tokens": 500000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",
  "source_playground": null,
  "source_paper": "https://arxiv.org/abs/2404.14219",
  "source_scorecard_blog_link": "https://techcommunity.microsoft.com/blog/azure-ai-services-blog/discover-the-new-multi-lingual-high-quality-phi-3-5-slms/4225280",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",
  "created_at": "2025-07-19T19:49:05.563203+00:00",
  "updated_at": "2025-07-19T19:49:05.563203+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/microsoft/models/phi-4/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1445,
    "benchmark_id": "arena-hard",
    "model_id": "phi-4",
    "score": 0.754,
    "normalized_score": 0.754,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.08905",
    "verified_by_llmstats": false,
    "analysis_method": "simple-evals",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.082804+00:00",
    "updated_at": "2025-07-19T19:56:14.082804+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 947,
    "benchmark_id": "drop",
    "model_id": "phi-4",
    "score": 0.755,
    "normalized_score": 0.755,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.08905",
    "verified_by_llmstats": false,
    "analysis_method": "simple-evals",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.999411+00:00",
    "updated_at": "2025-07-19T19:56:12.999411+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 282,
    "benchmark_id": "gpqa",
    "model_id": "phi-4",
    "score": 0.561,
    "normalized_score": 0.561,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.08905",
    "verified_by_llmstats": false,
    "analysis_method": "simple-evals",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.644574+00:00",
    "updated_at": "2025-07-19T19:56:11.644574+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 775,
    "benchmark_id": "humaneval",
    "model_id": "phi-4",
    "score": 0.826,
    "normalized_score": 0.826,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.08905",
    "verified_by_llmstats": false,
    "analysis_method": "simple-evals",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.628035+00:00",
    "updated_at": "2025-07-19T19:56:12.628035+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1437,
    "benchmark_id": "humaneval+",
    "model_id": "phi-4",
    "score": 0.828,
    "normalized_score": 0.828,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.08905",
    "verified_by_llmstats": false,
    "analysis_method": "simple-evals",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.064824+00:00",
    "updated_at": "2025-07-19T19:56:14.064824+00:00",
    "benchmark_name": "HumanEval+"
  },
  {
    "model_benchmark_id": 611,
    "benchmark_id": "ifeval",
    "model_id": "phi-4",
    "score": 0.63,
    "normalized_score": 0.63,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.08905",
    "verified_by_llmstats": false,
    "analysis_method": "simple-evals",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.261770+00:00",
    "updated_at": "2025-07-19T19:56:12.261770+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 746,
    "benchmark_id": "livebench",
    "model_id": "phi-4",
    "score": 0.476,
    "normalized_score": 0.476,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.08905",
    "verified_by_llmstats": false,
    "analysis_method": "simple-evals",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.569213+00:00",
    "updated_at": "2025-07-19T19:56:12.569213+00:00",
    "benchmark_name": "LiveBench"
  },
  {
    "model_benchmark_id": 389,
    "benchmark_id": "math",
    "model_id": "phi-4",
    "score": 0.804,
    "normalized_score": 0.804,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.08905",
    "verified_by_llmstats": false,
    "analysis_method": "simple-evals",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.837602+00:00",
    "updated_at": "2025-07-19T19:56:11.837602+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1279,
    "benchmark_id": "mgsm",
    "model_id": "phi-4",
    "score": 0.806,
    "normalized_score": 0.806,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.08905",
    "verified_by_llmstats": false,
    "analysis_method": "simple-evals",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.681417+00:00",
    "updated_at": "2025-07-19T19:56:13.681417+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 72,
    "benchmark_id": "mmlu",
    "model_id": "phi-4",
    "score": 0.848,
    "normalized_score": 0.848,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.08905",
    "verified_by_llmstats": false,
    "analysis_method": "simple-evals",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.236043+00:00",
    "updated_at": "2025-07-19T19:56:11.236043+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 176,
    "benchmark_id": "mmlu-pro",
    "model_id": "phi-4",
    "score": 0.704,
    "normalized_score": 0.704,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.08905",
    "verified_by_llmstats": false,
    "analysis_method": "simple-evals",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.441164+00:00",
    "updated_at": "2025-07-19T19:56:11.441164+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1466,
    "benchmark_id": "phibench",
    "model_id": "phi-4",
    "score": 0.562,
    "normalized_score": 0.562,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.08905",
    "verified_by_llmstats": false,
    "analysis_method": "simple-evals",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.124860+00:00",
    "updated_at": "2025-07-19T19:56:14.124860+00:00",
    "benchmark_name": "PhiBench"
  },
  {
    "model_benchmark_id": 233,
    "benchmark_id": "simpleqa",
    "model_id": "phi-4",
    "score": 0.03,
    "normalized_score": 0.03,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2412.08905",
    "verified_by_llmstats": false,
    "analysis_method": "simple-evals",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.546523+00:00",
    "updated_at": "2025-07-19T19:56:11.546523+00:00",
    "benchmark_name": "SimpleQA"
  }
]

================================================
FILE: data/organizations/microsoft/models/phi-4/model.json
================================================
{
  "model_id": "phi-4",
  "name": "Phi 4",
  "organization_id": "microsoft",
  "fine_tuned_from_model_id": null,
  "description": "phi-4 is a state-of-the-art open model built to excel at advanced reasoning, coding, and knowledge tasks. It leverages a blend of synthetic data, filtered web data, academic texts, and supervised fine-tuning for precision, alignment, and safety.",
  "release_date": "2024-12-12",
  "announcement_date": "2024-12-12",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": "2024-06-01",
  "param_count": 14700000000,
  "training_tokens": 9800000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/microsoft/phi-4",
  "source_playground": null,
  "source_paper": "https://arxiv.org/pdf/2412.08905",
  "source_scorecard_blog_link": "https://techcommunity.microsoft.com/blog/aiplatformblog/introducing-phi-4-microsoft%E2%80%99s-newest-small-language-model-specializing-in-comple/4357090",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/microsoft/phi-4",
  "created_at": "2025-07-19T19:49:05.549276+00:00",
  "updated_at": "2025-07-19T19:49:05.549276+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/microsoft/models/phi-4-mini/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 11,
    "benchmark_id": "arc-c",
    "model_id": "phi-4-mini",
    "score": 0.837,
    "normalized_score": 0.837,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.105059+00:00",
    "updated_at": "2025-07-19T19:56:11.105059+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1446,
    "benchmark_id": "arena-hard",
    "model_id": "phi-4-mini",
    "score": 0.328,
    "normalized_score": 0.328,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.084727+00:00",
    "updated_at": "2025-07-19T19:56:14.084727+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 1076,
    "benchmark_id": "big-bench-hard",
    "model_id": "phi-4-mini",
    "score": 0.704,
    "normalized_score": 0.704,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.242363+00:00",
    "updated_at": "2025-07-19T19:56:13.242363+00:00",
    "benchmark_name": "BIG-Bench Hard"
  },
  {
    "model_benchmark_id": 1023,
    "benchmark_id": "boolq",
    "model_id": "phi-4-mini",
    "score": 0.812,
    "normalized_score": 0.812,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "2-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.129244+00:00",
    "updated_at": "2025-07-19T19:56:13.129244+00:00",
    "benchmark_name": "BoolQ"
  },
  {
    "model_benchmark_id": 283,
    "benchmark_id": "gpqa",
    "model_id": "phi-4-mini",
    "score": 0.252,
    "normalized_score": 0.252,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.646470+00:00",
    "updated_at": "2025-07-19T19:56:11.646470+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 985,
    "benchmark_id": "gsm8k",
    "model_id": "phi-4-mini",
    "score": 0.886,
    "normalized_score": 0.886,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "8-shot, CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.066927+00:00",
    "updated_at": "2025-07-19T19:56:13.066927+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 41,
    "benchmark_id": "hellaswag",
    "model_id": "phi-4-mini",
    "score": 0.691,
    "normalized_score": 0.691,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.169983+00:00",
    "updated_at": "2025-07-19T19:56:11.169983+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 390,
    "benchmark_id": "math",
    "model_id": "phi-4-mini",
    "score": 0.64,
    "normalized_score": 0.64,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.839081+00:00",
    "updated_at": "2025-07-19T19:56:11.839081+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1280,
    "benchmark_id": "mgsm",
    "model_id": "phi-4-mini",
    "score": 0.639,
    "normalized_score": 0.639,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.683394+00:00",
    "updated_at": "2025-07-19T19:56:13.683394+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 73,
    "benchmark_id": "mmlu",
    "model_id": "phi-4-mini",
    "score": 0.673,
    "normalized_score": 0.673,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.237489+00:00",
    "updated_at": "2025-07-19T19:56:11.237489+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 177,
    "benchmark_id": "mmlu-pro",
    "model_id": "phi-4-mini",
    "score": 0.528,
    "normalized_score": 0.528,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.443019+00:00",
    "updated_at": "2025-07-19T19:56:11.443019+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1473,
    "benchmark_id": "multilingual-mmlu",
    "model_id": "phi-4-mini",
    "score": 0.493,
    "normalized_score": 0.493,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.141886+00:00",
    "updated_at": "2025-07-19T19:56:14.141886+00:00",
    "benchmark_name": "Multilingual MMLU"
  },
  {
    "model_benchmark_id": 1469,
    "benchmark_id": "openbookqa",
    "model_id": "phi-4-mini",
    "score": 0.792,
    "normalized_score": 0.792,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.132301+00:00",
    "updated_at": "2025-07-19T19:56:14.132301+00:00",
    "benchmark_name": "OpenBookQA"
  },
  {
    "model_benchmark_id": 1032,
    "benchmark_id": "piqa",
    "model_id": "phi-4-mini",
    "score": 0.776,
    "normalized_score": 0.776,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.150113+00:00",
    "updated_at": "2025-07-19T19:56:13.150113+00:00",
    "benchmark_name": "PIQA"
  },
  {
    "model_benchmark_id": 1041,
    "benchmark_id": "social-iqa",
    "model_id": "phi-4-mini",
    "score": 0.725,
    "normalized_score": 0.725,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.172567+00:00",
    "updated_at": "2025-07-19T19:56:13.172567+00:00",
    "benchmark_name": "Social IQa"
  },
  {
    "model_benchmark_id": 132,
    "benchmark_id": "truthfulqa",
    "model_id": "phi-4-mini",
    "score": 0.664,
    "normalized_score": 0.664,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "MC2, 10-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.343180+00:00",
    "updated_at": "2025-07-19T19:56:11.343180+00:00",
    "benchmark_name": "TruthfulQA"
  },
  {
    "model_benchmark_id": 149,
    "benchmark_id": "winogrande",
    "model_id": "phi-4-mini",
    "score": 0.67,
    "normalized_score": 0.67,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.382335+00:00",
    "updated_at": "2025-07-19T19:56:11.382335+00:00",
    "benchmark_name": "Winogrande"
  }
]

================================================
FILE: data/organizations/microsoft/models/phi-4-mini/model.json
================================================
{
  "model_id": "phi-4-mini",
  "name": "Phi 4 Mini",
  "organization_id": "microsoft",
  "fine_tuned_from_model_id": null,
  "description": "Phi 4 Mini Instruct is a lightweight (3.8B parameters) open model built upon synthetic data and filtered web data, focusing on high-quality reasoning. It supports a 128K token context length and is enhanced for instruction adherence and safety via supervised fine-tuning and direct preference optimization.",
  "release_date": "2025-02-01",
  "announcement_date": "2025-02-01",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": "2024-06-01",
  "param_count": 3840000000,
  "training_tokens": 5000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": null,
  "source_paper": "https://arxiv.org/pdf/2503.01743",
  "source_scorecard_blog_link": "https://azure.microsoft.com/en-us/blog/empowering-innovation-the-next-generation-of-the-phi-family/",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/microsoft/Phi-4-mini-instruct",
  "created_at": "2025-07-19T19:49:05.552796+00:00",
  "updated_at": "2025-07-19T19:49:05.552796+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/microsoft/models/phi-4-mini-reasoning/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1436,
    "benchmark_id": "aime",
    "model_id": "phi-4-mini-reasoning",
    "score": 0.575,
    "normalized_score": 0.575,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-reasoning",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.061299+00:00",
    "updated_at": "2025-07-19T19:56:14.061299+00:00",
    "benchmark_name": "AIME"
  },
  {
    "model_benchmark_id": 281,
    "benchmark_id": "gpqa",
    "model_id": "phi-4-mini-reasoning",
    "score": 0.52,
    "normalized_score": 0.52,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-reasoning",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.642870+00:00",
    "updated_at": "2025-07-19T19:56:11.642870+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 494,
    "benchmark_id": "math-500",
    "model_id": "phi-4-mini-reasoning",
    "score": 0.946,
    "normalized_score": 0.946,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-mini-reasoning",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.032863+00:00",
    "updated_at": "2025-07-19T19:56:12.032863+00:00",
    "benchmark_name": "MATH-500"
  }
]

================================================
FILE: data/organizations/microsoft/models/phi-4-mini-reasoning/model.json
================================================
{
  "model_id": "phi-4-mini-reasoning",
  "name": "Phi 4 Mini Reasoning",
  "organization_id": "microsoft",
  "fine_tuned_from_model_id": null,
  "description": "Phi-4-mini-reasoning is designed for multi-step, logic-intensive mathematical problem-solving tasks under memory/compute constrained environments and latency bound scenarios. Some of the use cases include formal proof generation, symbolic computation, advanced word problems, and a wide range of mathematical reasoning scenarios. These models excel at maintaining context across steps, applying structured logic, and delivering accurate, reliable solutions in domains that require deep analytical thinking.",
  "release_date": "2025-04-30",
  "announcement_date": "2025-04-30",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": "2025-02-01",
  "param_count": 3800000000,
  "training_tokens": 150000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://learn.microsoft.com/en-us/windows/ai/apis/phi-silica?tabs=csharp0,csharp1,csharp2,csharp3",
  "source_playground": null,
  "source_paper": "https://arxiv.org/pdf/2504.21233",
  "source_scorecard_blog_link": "https://azure.microsoft.com/en-us/blog/one-year-of-phi-small-language-models-making-big-leaps-in-ai/",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/microsoft/Phi-4-mini-reasoning",
  "created_at": "2025-07-19T19:49:05.545846+00:00",
  "updated_at": "2025-07-19T19:49:05.545846+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/microsoft/models/phi-4-multimodal-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1251,
    "benchmark_id": "ai2d",
    "model_id": "phi-4-multimodal-instruct",
    "score": 0.823,
    "normalized_score": 0.823,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.628230+00:00",
    "updated_at": "2025-07-19T19:56:13.628230+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 1545,
    "benchmark_id": "blink",
    "model_id": "phi-4-multimodal-instruct",
    "score": 0.613,
    "normalized_score": 0.613,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.329567+00:00",
    "updated_at": "2025-07-19T19:56:14.329567+00:00",
    "benchmark_name": "BLINK"
  },
  {
    "model_benchmark_id": 859,
    "benchmark_id": "chartqa",
    "model_id": "phi-4-multimodal-instruct",
    "score": 0.814,
    "normalized_score": 0.814,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.797898+00:00",
    "updated_at": "2025-07-19T19:56:12.797898+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 881,
    "benchmark_id": "docvqa",
    "model_id": "phi-4-multimodal-instruct",
    "score": 0.932,
    "normalized_score": 0.932,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.836095+00:00",
    "updated_at": "2025-07-19T19:56:12.836095+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 1241,
    "benchmark_id": "infovqa",
    "model_id": "phi-4-multimodal-instruct",
    "score": 0.727,
    "normalized_score": 0.727,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.609397+00:00",
    "updated_at": "2025-07-19T19:56:13.609397+00:00",
    "benchmark_name": "InfoVQA"
  },
  {
    "model_benchmark_id": 1521,
    "benchmark_id": "intergps",
    "model_id": "phi-4-multimodal-instruct",
    "score": 0.486,
    "normalized_score": 0.486,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "testmini",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.263464+00:00",
    "updated_at": "2025-07-19T19:56:14.263464+00:00",
    "benchmark_name": "InterGPS"
  },
  {
    "model_benchmark_id": 521,
    "benchmark_id": "mathvista",
    "model_id": "phi-4-multimodal-instruct",
    "score": 0.624,
    "normalized_score": 0.624,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "testmini",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.082453+00:00",
    "updated_at": "2025-07-19T19:56:12.082453+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1510,
    "benchmark_id": "mmbench",
    "model_id": "phi-4-multimodal-instruct",
    "score": 0.867,
    "normalized_score": 0.867,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "dev-en",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.240071+00:00",
    "updated_at": "2025-07-19T19:56:14.240071+00:00",
    "benchmark_name": "MMBench"
  },
  {
    "model_benchmark_id": 564,
    "benchmark_id": "mmmu",
    "model_id": "phi-4-multimodal-instruct",
    "score": 0.551,
    "normalized_score": 0.551,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.161302+00:00",
    "updated_at": "2025-07-19T19:56:12.161302+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1528,
    "benchmark_id": "mmmu-pro",
    "model_id": "phi-4-multimodal-instruct",
    "score": 0.385,
    "normalized_score": 0.385,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "std/vision",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.285447+00:00",
    "updated_at": "2025-07-19T19:56:14.285447+00:00",
    "benchmark_name": "MMMU-Pro"
  },
  {
    "model_benchmark_id": 1538,
    "benchmark_id": "ocrbench",
    "model_id": "phi-4-multimodal-instruct",
    "score": 0.844,
    "normalized_score": 0.844,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.309778+00:00",
    "updated_at": "2025-07-19T19:56:14.309778+00:00",
    "benchmark_name": "OCRBench"
  },
  {
    "model_benchmark_id": 1523,
    "benchmark_id": "pope",
    "model_id": "phi-4-multimodal-instruct",
    "score": 0.856,
    "normalized_score": 0.856,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.268923+00:00",
    "updated_at": "2025-07-19T19:56:14.268923+00:00",
    "benchmark_name": "POPE"
  },
  {
    "model_benchmark_id": 1537,
    "benchmark_id": "scienceqa-visual",
    "model_id": "phi-4-multimodal-instruct",
    "score": 0.975,
    "normalized_score": 0.975,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "img-test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.303456+00:00",
    "updated_at": "2025-07-19T19:56:14.303456+00:00",
    "benchmark_name": "ScienceQA Visual"
  },
  {
    "model_benchmark_id": 907,
    "benchmark_id": "textvqa",
    "model_id": "phi-4-multimodal-instruct",
    "score": 0.756,
    "normalized_score": 0.756,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Standard Evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.890738+00:00",
    "updated_at": "2025-07-19T19:56:12.890738+00:00",
    "benchmark_name": "TextVQA"
  },
  {
    "model_benchmark_id": 1383,
    "benchmark_id": "video-mme",
    "model_id": "phi-4-multimodal-instruct",
    "score": 0.55,
    "normalized_score": 0.55,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
    "verified_by_llmstats": false,
    "analysis_method": "16 frames",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.911859+00:00",
    "updated_at": "2025-07-19T19:56:13.911859+00:00",
    "benchmark_name": "Video-MME"
  }
]

================================================
FILE: data/organizations/microsoft/models/phi-4-multimodal-instruct/model.json
================================================
{
  "model_id": "phi-4-multimodal-instruct",
  "name": "Phi-4-multimodal-instruct",
  "organization_id": "microsoft",
  "fine_tuned_from_model_id": null,
  "description": "Phi-4-multimodal-instruct is a lightweight (5.57B parameters) open multimodal foundation model that leverages research and datasets from Phi-3.5 and 4.0. It processes text, image, and audio inputs to generate text outputs, supporting a 128K token context length. Enhanced via SFT, DPO, and RLHF for instruction following and safety.",
  "release_date": "2025-02-01",
  "announcement_date": "2025-02-01",
  "license_id": "mit",
  "multimodal": true,
  "knowledge_cutoff": "2024-06-01",
  "param_count": 5600000000,
  "training_tokens": 5000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://ai.azure.com/explore/models?selectedCollection=phi&tid=72f988bf-86f1-41af-91ab-2d7cd011db47",
  "source_paper": "https://arxiv.org/abs/2503.01743",
  "source_scorecard_blog_link": "https://azure.microsoft.com/en-us/blog/empowering-innovation-the-next-generation-of-the-phi-family/",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
  "created_at": "2025-07-19T19:49:05.571307+00:00",
  "updated_at": "2025-07-19T19:49:05.571307+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/microsoft/models/phi-4-reasoning/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 450,
    "benchmark_id": "aime-2024",
    "model_id": "phi-4-reasoning",
    "score": 0.753,
    "normalized_score": 0.753,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.955706+00:00",
    "updated_at": "2025-07-19T19:56:11.955706+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 688,
    "benchmark_id": "aime-2025",
    "model_id": "phi-4-reasoning",
    "score": 0.629,
    "normalized_score": 0.629,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.444086+00:00",
    "updated_at": "2025-07-19T19:56:12.444086+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1450,
    "benchmark_id": "arena-hard",
    "model_id": "phi-4-reasoning",
    "score": 0.733,
    "normalized_score": 0.733,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.091856+00:00",
    "updated_at": "2025-07-19T19:56:14.091856+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 1527,
    "benchmark_id": "flenqa",
    "model_id": "phi-4-reasoning",
    "score": 0.977,
    "normalized_score": 0.977,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "3K-token subset",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.281300+00:00",
    "updated_at": "2025-07-19T19:56:14.281300+00:00",
    "benchmark_name": "FlenQA"
  },
  {
    "model_benchmark_id": 287,
    "benchmark_id": "gpqa",
    "model_id": "phi-4-reasoning",
    "score": 0.658,
    "normalized_score": 0.658,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.654843+00:00",
    "updated_at": "2025-07-19T19:56:11.654843+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1439,
    "benchmark_id": "humaneval+",
    "model_id": "phi-4-reasoning",
    "score": 0.929,
    "normalized_score": 0.929,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.068831+00:00",
    "updated_at": "2025-07-19T19:56:14.068831+00:00",
    "benchmark_name": "HumanEval+"
  },
  {
    "model_benchmark_id": 613,
    "benchmark_id": "ifeval",
    "model_id": "phi-4-reasoning",
    "score": 0.834,
    "normalized_score": 0.834,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Strict",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.265033+00:00",
    "updated_at": "2025-07-19T19:56:12.265033+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1114,
    "benchmark_id": "livecodebench",
    "model_id": "phi-4-reasoning",
    "score": 0.538,
    "normalized_score": 0.538,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning",
    "verified_by_llmstats": false,
    "analysis_method": "8/1/24\u20132/1/25",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.324523+00:00",
    "updated_at": "2025-07-19T19:56:13.324523+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 183,
    "benchmark_id": "mmlu-pro",
    "model_id": "phi-4-reasoning",
    "score": 0.743,
    "normalized_score": 0.743,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.453150+00:00",
    "updated_at": "2025-07-19T19:56:11.453150+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1525,
    "benchmark_id": "omnimath",
    "model_id": "phi-4-reasoning",
    "score": 0.766,
    "normalized_score": 0.766,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.276205+00:00",
    "updated_at": "2025-07-19T19:56:14.276205+00:00",
    "benchmark_name": "OmniMath"
  },
  {
    "model_benchmark_id": 1468,
    "benchmark_id": "phibench",
    "model_id": "phi-4-reasoning",
    "score": 0.706,
    "normalized_score": 0.706,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "2.21",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.127989+00:00",
    "updated_at": "2025-07-19T19:56:14.127989+00:00",
    "benchmark_name": "PhiBench"
  }
]

================================================
FILE: data/organizations/microsoft/models/phi-4-reasoning/model.json
================================================
{
  "model_id": "phi-4-reasoning",
  "name": "Phi 4 Reasoning",
  "organization_id": "microsoft",
  "fine_tuned_from_model_id": "phi-4",
  "description": "Phi-4-reasoning is a state-of-the-art open-weight reasoning model finetuned from Phi-4 using supervised fine-tuning on a dataset of chain-of-thought traces and reinforcement learning. It focuses on math, science, and coding skills.",
  "release_date": "2025-04-30",
  "announcement_date": "2025-04-30",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": "2025-03-01",
  "param_count": 14000000000,
  "training_tokens": 16000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://learn.microsoft.com/en-us/windows/ai/apis/phi-silica?tabs=csharp0,csharp1,csharp2,csharp3",
  "source_playground": null,
  "source_paper": "https://arxiv.org/abs/2504.21318",
  "source_scorecard_blog_link": "https://azure.microsoft.com/en-us/blog/one-year-of-phi-small-language-models-making-big-leaps-in-ai/",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/microsoft/Phi-4-reasoning",
  "created_at": "2025-07-19T19:49:05.879382+00:00",
  "updated_at": "2025-07-19T19:49:05.879382+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/microsoft/models/phi-4-reasoning-plus/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 449,
    "benchmark_id": "aime-2024",
    "model_id": "phi-4-reasoning-plus",
    "score": 0.813,
    "normalized_score": 0.813,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.953709+00:00",
    "updated_at": "2025-07-19T19:56:11.953709+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 687,
    "benchmark_id": "aime-2025",
    "model_id": "phi-4-reasoning-plus",
    "score": 0.78,
    "normalized_score": 0.78,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.440995+00:00",
    "updated_at": "2025-07-19T19:56:12.440995+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1449,
    "benchmark_id": "arena-hard",
    "model_id": "phi-4-reasoning-plus",
    "score": 0.79,
    "normalized_score": 0.79,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.090173+00:00",
    "updated_at": "2025-07-19T19:56:14.090173+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 1526,
    "benchmark_id": "flenqa",
    "model_id": "phi-4-reasoning-plus",
    "score": 0.979,
    "normalized_score": 0.979,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "3K-token subset",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.279654+00:00",
    "updated_at": "2025-07-19T19:56:14.279654+00:00",
    "benchmark_name": "FlenQA"
  },
  {
    "model_benchmark_id": 286,
    "benchmark_id": "gpqa",
    "model_id": "phi-4-reasoning-plus",
    "score": 0.689,
    "normalized_score": 0.689,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.652983+00:00",
    "updated_at": "2025-07-19T19:56:11.652983+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1438,
    "benchmark_id": "humaneval+",
    "model_id": "phi-4-reasoning-plus",
    "score": 0.923,
    "normalized_score": 0.923,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.066904+00:00",
    "updated_at": "2025-07-19T19:56:14.066904+00:00",
    "benchmark_name": "HumanEval+"
  },
  {
    "model_benchmark_id": 612,
    "benchmark_id": "ifeval",
    "model_id": "phi-4-reasoning-plus",
    "score": 0.849,
    "normalized_score": 0.849,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Strict",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.263243+00:00",
    "updated_at": "2025-07-19T19:56:12.263243+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1113,
    "benchmark_id": "livecodebench",
    "model_id": "phi-4-reasoning-plus",
    "score": 0.531,
    "normalized_score": 0.531,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "8/1/24\u20132/1/25",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.322076+00:00",
    "updated_at": "2025-07-19T19:56:13.322076+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 182,
    "benchmark_id": "mmlu-pro",
    "model_id": "phi-4-reasoning-plus",
    "score": 0.76,
    "normalized_score": 0.76,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.451685+00:00",
    "updated_at": "2025-07-19T19:56:11.451685+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1524,
    "benchmark_id": "omnimath",
    "model_id": "phi-4-reasoning-plus",
    "score": 0.819,
    "normalized_score": 0.819,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.274539+00:00",
    "updated_at": "2025-07-19T19:56:14.274539+00:00",
    "benchmark_name": "OmniMath"
  },
  {
    "model_benchmark_id": 1467,
    "benchmark_id": "phibench",
    "model_id": "phi-4-reasoning-plus",
    "score": 0.742,
    "normalized_score": 0.742,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
    "verified_by_llmstats": false,
    "analysis_method": "2.21",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.126449+00:00",
    "updated_at": "2025-07-19T19:56:14.126449+00:00",
    "benchmark_name": "PhiBench"
  }
]

================================================
FILE: data/organizations/microsoft/models/phi-4-reasoning-plus/model.json
================================================
{
  "model_id": "phi-4-reasoning-plus",
  "name": "Phi 4 Reasoning Plus",
  "organization_id": "microsoft",
  "fine_tuned_from_model_id": null,
  "description": "Phi-4-reasoning-plus is a state-of-the-art open-weight reasoning model finetuned from Phi-4 using supervised fine-tuning and reinforcement learning. It focuses on math, science, and coding skills. This 'plus' version has higher accuracy due to additional RL training but may have higher latency.",
  "release_date": "2025-04-30",
  "announcement_date": "2025-04-30",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": "2025-03-01",
  "param_count": 14000000000,
  "training_tokens": 16000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://learn.microsoft.com/en-us/windows/ai/apis/phi-silica?tabs=csharp0,csharp1,csharp2,csharp3",
  "source_playground": null,
  "source_paper": "https://arxiv.org/abs/2504.21318",
  "source_scorecard_blog_link": "https://azure.microsoft.com/en-us/blog/one-year-of-phi-small-language-models-making-big-leaps-in-ai/",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/microsoft/Phi-4-reasoning-plus",
  "created_at": "2025-07-19T19:49:05.567534+00:00",
  "updated_at": "2025-07-19T19:49:05.567534+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/microsoft/organization.json
================================================
{
  "organization_id": "microsoft",
  "name": "Microsoft",
  "website": "https://microsoft.com",
  "description": "Technology company",
  "country": "US",
  "created_at": "2025-07-19T19:49:05.543205+00:00",
  "updated_at": "2025-07-19T19:49:05.543205+00:00"
}


================================================
FILE: data/organizations/mistral/models/codestral-22b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1823,
    "benchmark_id": "cruxeval-o",
    "model_id": "codestral-22b",
    "score": 0.513,
    "normalized_score": 0.513,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/codestral/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.151317+00:00",
    "updated_at": "2025-07-19T19:56:15.151317+00:00",
    "benchmark_name": "CruxEval-O"
  },
  {
    "model_benchmark_id": 809,
    "benchmark_id": "humaneval",
    "model_id": "codestral-22b",
    "score": 0.811,
    "normalized_score": 0.811,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/codestral/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.685855+00:00",
    "updated_at": "2025-07-19T19:56:12.685855+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1827,
    "benchmark_id": "humaneval-average",
    "model_id": "codestral-22b",
    "score": 0.615,
    "normalized_score": 0.615,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/codestral/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.174206+00:00",
    "updated_at": "2025-07-19T19:56:15.174206+00:00",
    "benchmark_name": "HumanEval-Average"
  },
  {
    "model_benchmark_id": 1826,
    "benchmark_id": "humanevalfim-average",
    "model_id": "codestral-22b",
    "score": 0.916,
    "normalized_score": 0.916,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/codestral/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.169908+00:00",
    "updated_at": "2025-07-19T19:56:15.169908+00:00",
    "benchmark_name": "HumanEvalFIM-Average"
  },
  {
    "model_benchmark_id": 1196,
    "benchmark_id": "mbpp",
    "model_id": "codestral-22b",
    "score": 0.782,
    "normalized_score": 0.782,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/codestral/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.517772+00:00",
    "updated_at": "2025-07-19T19:56:13.517772+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1824,
    "benchmark_id": "repobench",
    "model_id": "codestral-22b",
    "score": 0.34,
    "normalized_score": 0.34,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/codestral/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.155008+00:00",
    "updated_at": "2025-07-19T19:56:15.155008+00:00",
    "benchmark_name": "RepoBench"
  },
  {
    "model_benchmark_id": 1825,
    "benchmark_id": "spider",
    "model_id": "codestral-22b",
    "score": 0.635,
    "normalized_score": 0.635,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/codestral/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.159626+00:00",
    "updated_at": "2025-07-19T19:56:15.159626+00:00",
    "benchmark_name": "Spider"
  }
]

================================================
FILE: data/organizations/mistral/models/codestral-22b/model.json
================================================
{
  "model_id": "codestral-22b",
  "name": "Codestral-22B",
  "organization_id": "mistral",
  "fine_tuned_from_model_id": null,
  "description": "A 22B parameter code generation model trained on 80+ programming languages including Python, Java, C, C++, JavaScript, and Bash. Supports both instruction-following and fill-in-the-middle (FIM) capabilities for code completion and generation tasks.",
  "release_date": "2024-05-29",
  "announcement_date": "2024-05-29",
  "license_id": "mnpl_0_1",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 22200000000,
  "training_tokens": null,
  "available_in_zeroeval": false,
  "source_api_ref": "https://docs.mistral.ai/api/",
  "source_playground": "https://chat.mistral.ai/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://mistral.ai/news/codestral/",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/mistralai/Codestral-22B-v0.1",
  "created_at": "2025-07-19T19:49:05.805621+00:00",
  "updated_at": "2025-07-19T19:49:05.805621+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/mistral/models/devstral-medium-2507/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1352,
    "benchmark_id": "swe-bench-verified",
    "model_id": "devstral-medium-2507",
    "score": 0.616,
    "normalized_score": 0.616,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/devstral-2507",
    "verified_by_llmstats": false,
    "analysis_method": "N/A",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.845635+00:00",
    "updated_at": "2025-07-19T19:56:13.845635+00:00",
    "benchmark_name": "SWE-Bench Verified"
  }
]

================================================
FILE: data/organizations/mistral/models/devstral-medium-2507/model.json
================================================
{
  "model_id": "devstral-medium-2507",
  "name": "Devstral Medium",
  "organization_id": "mistral",
  "fine_tuned_from_model_id": null,
  "description": "Devstral Medium builds upon the strengths of Devstral Small and takes performance to the next level with a score of 61.6% on SWE-Bench Verified. Devstral Medium is available through the Mistral public API, and offers exceptional performance at a competitive price point, making it an ideal choice for businesses and developers looking for a high-quality, cost-effective model.",
  "release_date": "2025-07-10",
  "announcement_date": "2025-07-10",
  "license_id": "proprietary",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://console.mistral.ai",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://mistral.ai/news/devstral-2507",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.783461+00:00",
  "updated_at": "2025-07-19T19:49:05.783461+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/mistral/models/devstral-small-2507/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1353,
    "benchmark_id": "swe-bench-verified",
    "model_id": "devstral-small-2507",
    "score": 0.536,
    "normalized_score": 0.536,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Devstral-Small-2507",
    "verified_by_llmstats": false,
    "analysis_method": "OpenHands scaffold",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.847228+00:00",
    "updated_at": "2025-07-19T19:56:13.847228+00:00",
    "benchmark_name": "SWE-Bench Verified"
  }
]

================================================
FILE: data/organizations/mistral/models/devstral-small-2507/model.json
================================================
{
  "model_id": "devstral-small-2507",
  "name": "Devstral Small 1.1",
  "organization_id": "mistral",
  "fine_tuned_from_model_id": null,
  "description": "Devstral Small 1.1 (also called devstral-small-2507) is based on the Mistral-Small-3.1 foundation model and contains approximately 24 billion parameters. It supports a 128k token context window, which allows it to handle multi-file code inputs and long prompts typical in software engineering workflows. The model is fine-tuned specifically for structured outputs, including XML and function-calling formats. This makes it compatible with agent frameworks such as OpenHands and suitable for tasks like program navigation, multi-step edits, and code search. It is licensed under Apache 2.0 and available for both research and commercial use.",
  "release_date": "2025-07-11",
  "announcement_date": "2025-07-11",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 24000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://console.mistral.ai",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://huggingface.co/mistralai/Devstral-Small-2507",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/mistralai/Devstral-Small-2507/blob/main/model.safetensors.index.json",
  "created_at": "2025-07-19T19:49:05.797947+00:00",
  "updated_at": "2025-07-19T19:49:05.797947+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/mistral/models/magistral-medium/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 665,
    "benchmark_id": "aider-polyglot",
    "model_id": "magistral-medium",
    "score": 0.471,
    "normalized_score": 0.471,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2506.10910",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.379075+00:00",
    "updated_at": "2025-07-19T19:56:12.379075+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 480,
    "benchmark_id": "aime-2024",
    "model_id": "magistral-medium",
    "score": 0.736,
    "normalized_score": 0.736,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2506.10910",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.011044+00:00",
    "updated_at": "2025-07-19T19:56:12.011044+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 704,
    "benchmark_id": "aime-2025",
    "model_id": "magistral-medium",
    "score": 0.649,
    "normalized_score": 0.649,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2506.10910",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.473748+00:00",
    "updated_at": "2025-07-19T19:56:12.473748+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 343,
    "benchmark_id": "gpqa",
    "model_id": "magistral-medium",
    "score": 0.708,
    "normalized_score": 0.708,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2506.10910",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.745089+00:00",
    "updated_at": "2025-07-19T19:56:11.745089+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 724,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "magistral-medium",
    "score": 0.09,
    "normalized_score": 0.09,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2506.10910",
    "verified_by_llmstats": false,
    "analysis_method": "text subset",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.525031+00:00",
    "updated_at": "2025-07-19T19:56:12.525031+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 1145,
    "benchmark_id": "livecodebench",
    "model_id": "magistral-medium",
    "score": 0.503,
    "normalized_score": 0.503,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/pdf/2506.10910",
    "verified_by_llmstats": false,
    "analysis_method": "v6",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.408465+00:00",
    "updated_at": "2025-07-19T19:56:13.410002+00:00",
    "benchmark_name": "LiveCodeBench"
  }
]

================================================
FILE: data/organizations/mistral/models/magistral-medium/model.json
================================================
{
  "model_id": "magistral-medium",
  "name": "Magistral Medium",
  "organization_id": "mistral",
  "fine_tuned_from_model_id": null,
  "description": "Trained solely with reinforcement learning on top of Mistral Medium 3, Magistral Medium is a reasoning model that achieves strong performance on complex math and code tasks without relying on distillation from existing reasoning models. The training uses an RLVR framework with modifications to GRPO, enabling improved reasoning ability and multilingual consistency.",
  "release_date": "2025-06-10",
  "announcement_date": "2025-06-10",
  "license_id": "apache_2_0",
  "multimodal": true,
  "knowledge_cutoff": "2025-06-01",
  "param_count": 24000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.mistral.ai/api/",
  "source_playground": "https://chat.mistral.ai/",
  "source_paper": "https://arxiv.org/pdf/2506.10910",
  "source_scorecard_blog_link": "https://mistral.ai/news/magistral",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.780565+00:00",
  "updated_at": "2025-07-19T19:49:05.780565+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/mistral/models/magistral-small-2506/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 479,
    "benchmark_id": "aime-2024",
    "model_id": "magistral-small-2506",
    "score": 0.7068,
    "normalized_score": 0.7068,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Magistral-Small-2506",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.009597+00:00",
    "updated_at": "2025-07-19T19:56:12.009597+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 703,
    "benchmark_id": "aime-2025",
    "model_id": "magistral-small-2506",
    "score": 0.6276,
    "normalized_score": 0.6276,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Magistral-Small-2506",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.471565+00:00",
    "updated_at": "2025-07-19T19:56:12.471565+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 342,
    "benchmark_id": "gpqa",
    "model_id": "magistral-small-2506",
    "score": 0.6818,
    "normalized_score": 0.6818,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Magistral-Small-2506",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.743610+00:00",
    "updated_at": "2025-07-19T19:56:11.743610+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1144,
    "benchmark_id": "livecodebench",
    "model_id": "magistral-small-2506",
    "score": 0.513,
    "normalized_score": 0.513,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/codestral/",
    "verified_by_llmstats": false,
    "analysis_method": "v5",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.406640+00:00",
    "updated_at": "2025-07-19T19:56:13.406640+00:00",
    "benchmark_name": "LiveCodeBench"
  }
]

================================================
FILE: data/organizations/mistral/models/magistral-small-2506/model.json
================================================
{
  "model_id": "magistral-small-2506",
  "name": "Magistral Small 2506",
  "organization_id": "mistral",
  "fine_tuned_from_model_id": null,
  "description": "Building upon Mistral Small 3.1 (2503), with added reasoning capabilities, undergoing SFT from Magistral Medium traces and RL on top, it's a small, efficient reasoning model with 24B parameters. Magistral Small can be deployed locally, fitting within a single RTX 4090 or a 32GB RAM MacBook once quantized.",
  "release_date": "2025-06-10",
  "announcement_date": "2025-06-10",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": "2025-06-01",
  "param_count": 24000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.mistral.ai/api/",
  "source_playground": "https://chat.mistral.ai/",
  "source_paper": "https://arxiv.org/pdf/2506.10910",
  "source_scorecard_blog_link": "https://mistral.ai/news/magistral",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/mistralai/Magistral-Small-2506",
  "created_at": "2025-07-19T19:49:05.777162+00:00",
  "updated_at": "2025-07-19T19:49:05.777162+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/mistral/models/ministral-8b-instruct-2410/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1410,
    "benchmark_id": "agieval",
    "model_id": "ministral-8b-instruct-2410",
    "score": 0.483,
    "normalized_score": 0.483,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.978647+00:00",
    "updated_at": "2025-07-19T19:56:13.978647+00:00",
    "benchmark_name": "AGIEval"
  },
  {
    "model_benchmark_id": 30,
    "benchmark_id": "arc-c",
    "model_id": "ministral-8b-instruct-2410",
    "score": 0.719,
    "normalized_score": 0.719,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.142536+00:00",
    "updated_at": "2025-07-19T19:56:11.142536+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1464,
    "benchmark_id": "arena-hard",
    "model_id": "ministral-8b-instruct-2410",
    "score": 0.709,
    "normalized_score": 0.709,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410",
    "verified_by_llmstats": false,
    "analysis_method": "",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.118772+00:00",
    "updated_at": "2025-07-19T19:56:14.118772+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 1820,
    "benchmark_id": "french-mmlu",
    "model_id": "ministral-8b-instruct-2410",
    "score": 0.575,
    "normalized_score": 0.575,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.137792+00:00",
    "updated_at": "2025-07-19T19:56:15.137792+00:00",
    "benchmark_name": "French MMLU"
  },
  {
    "model_benchmark_id": 806,
    "benchmark_id": "humaneval",
    "model_id": "ministral-8b-instruct-2410",
    "score": 0.348,
    "normalized_score": 0.348,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.681246+00:00",
    "updated_at": "2025-07-19T19:56:12.681246+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 422,
    "benchmark_id": "math",
    "model_id": "ministral-8b-instruct-2410",
    "score": 0.545,
    "normalized_score": 0.545,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.895272+00:00",
    "updated_at": "2025-07-19T19:56:11.895272+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1821,
    "benchmark_id": "mbpp-pass@1",
    "model_id": "ministral-8b-instruct-2410",
    "score": 0.7,
    "normalized_score": 0.7,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.141858+00:00",
    "updated_at": "2025-07-19T19:56:15.141858+00:00",
    "benchmark_name": "MBPP pass@1"
  },
  {
    "model_benchmark_id": 112,
    "benchmark_id": "mmlu",
    "model_id": "ministral-8b-instruct-2410",
    "score": 0.65,
    "normalized_score": 0.65,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.309619+00:00",
    "updated_at": "2025-07-19T19:56:11.309619+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1612,
    "benchmark_id": "mt-bench",
    "model_id": "ministral-8b-instruct-2410",
    "score": 0.83,
    "normalized_score": 0.83,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.535003+00:00",
    "updated_at": "2025-07-19T19:56:14.535003+00:00",
    "benchmark_name": "MT-Bench"
  },
  {
    "model_benchmark_id": 253,
    "benchmark_id": "triviaqa",
    "model_id": "ministral-8b-instruct-2410",
    "score": 0.655,
    "normalized_score": 0.655,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.582765+00:00",
    "updated_at": "2025-07-19T19:56:11.582765+00:00",
    "benchmark_name": "TriviaQA"
  },
  {
    "model_benchmark_id": 155,
    "benchmark_id": "winogrande",
    "model_id": "ministral-8b-instruct-2410",
    "score": 0.753,
    "normalized_score": 0.753,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.394106+00:00",
    "updated_at": "2025-07-19T19:56:11.394106+00:00",
    "benchmark_name": "Winogrande"
  }
]

================================================
FILE: data/organizations/mistral/models/ministral-8b-instruct-2410/model.json
================================================
{
  "model_id": "ministral-8b-instruct-2410",
  "name": "Ministral 8B Instruct",
  "organization_id": "mistral",
  "fine_tuned_from_model_id": null,
  "description": "The Ministral-8B-Instruct-2410 is an instruct fine-tuned model for local intelligence, on-device computing, and at-the-edge use cases, significantly outperforming existing models of similar size.",
  "release_date": "2024-10-16",
  "announcement_date": "2024-10-16",
  "license_id": "mistral_research_license",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 8019808256,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://mistral.ai/news/ministraux/",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/mistralai/Ministral-8B-Instruct-2410",
  "created_at": "2025-07-19T19:49:05.786083+00:00",
  "updated_at": "2025-07-19T19:49:05.786083+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/mistral/models/mistral-large-2-2407/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1014,
    "benchmark_id": "gsm8k",
    "model_id": "mistral-large-2-2407",
    "score": 0.93,
    "normalized_score": 0.93,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.113392+00:00",
    "updated_at": "2025-07-19T19:56:13.113392+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 810,
    "benchmark_id": "humaneval",
    "model_id": "mistral-large-2-2407",
    "score": 0.92,
    "normalized_score": 0.92,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.687406+00:00",
    "updated_at": "2025-07-19T19:56:12.687406+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 116,
    "benchmark_id": "mmlu",
    "model_id": "mistral-large-2-2407",
    "score": 0.84,
    "normalized_score": 0.84,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/mistral-large-2407/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.316024+00:00",
    "updated_at": "2025-07-19T19:56:11.316024+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1828,
    "benchmark_id": "mmlu-french",
    "model_id": "mistral-large-2-2407",
    "score": 0.828,
    "normalized_score": 0.828,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.178056+00:00",
    "updated_at": "2025-07-19T19:56:15.178056+00:00",
    "benchmark_name": "MMLU French"
  },
  {
    "model_benchmark_id": 1615,
    "benchmark_id": "mt-bench",
    "model_id": "mistral-large-2-2407",
    "score": 0.863,
    "normalized_score": 0.863,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.541051+00:00",
    "updated_at": "2025-07-19T19:56:14.541051+00:00",
    "benchmark_name": "MT-Bench"
  }
]

================================================
FILE: data/organizations/mistral/models/mistral-large-2-2407/model.json
================================================
{
  "model_id": "mistral-large-2-2407",
  "name": "Mistral Large 2",
  "organization_id": "mistral",
  "fine_tuned_from_model_id": null,
  "description": "A 123B parameter model with strong capabilities in code generation, mathematics, and reasoning. Features enhanced multilingual support across dozens of languages, 128k context window, and advanced function calling capabilities. Excels in instruction-following and maintains concise outputs.",
  "release_date": "2024-07-24",
  "announcement_date": "2024-07-24",
  "license_id": "mistral_research_license",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 123000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.mistral.ai/",
  "source_playground": "https://chat.mistral.ai/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://mistral.ai/news/mistral-large-2407/",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
  "created_at": "2025-07-19T19:49:05.813974+00:00",
  "updated_at": "2025-07-19T19:49:05.813974+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/mistral/models/mistral-nemo-instruct-2407/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1819,
    "benchmark_id": "commonsenseqa",
    "model_id": "mistral-nemo-instruct-2407",
    "score": 0.704,
    "normalized_score": 0.704,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.133096+00:00",
    "updated_at": "2025-07-19T19:56:15.133096+00:00",
    "benchmark_name": "CommonSenseQA"
  },
  {
    "model_benchmark_id": 54,
    "benchmark_id": "hellaswag",
    "model_id": "mistral-nemo-instruct-2407",
    "score": 0.835,
    "normalized_score": 0.835,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.196732+00:00",
    "updated_at": "2025-07-19T19:56:11.196732+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 111,
    "benchmark_id": "mmlu",
    "model_id": "mistral-nemo-instruct-2407",
    "score": 0.68,
    "normalized_score": 0.68,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.308247+00:00",
    "updated_at": "2025-07-19T19:56:11.308247+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1050,
    "benchmark_id": "natural-questions",
    "model_id": "mistral-nemo-instruct-2407",
    "score": 0.312,
    "normalized_score": 0.312,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.191770+00:00",
    "updated_at": "2025-07-19T19:56:13.191770+00:00",
    "benchmark_name": "Natural Questions"
  },
  {
    "model_benchmark_id": 1472,
    "benchmark_id": "openbookqa",
    "model_id": "mistral-nemo-instruct-2407",
    "score": 0.606,
    "normalized_score": 0.606,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.138075+00:00",
    "updated_at": "2025-07-19T19:56:14.138075+00:00",
    "benchmark_name": "OpenBookQA"
  },
  {
    "model_benchmark_id": 252,
    "benchmark_id": "triviaqa",
    "model_id": "mistral-nemo-instruct-2407",
    "score": 0.738,
    "normalized_score": 0.738,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.581108+00:00",
    "updated_at": "2025-07-19T19:56:11.581108+00:00",
    "benchmark_name": "TriviaQA"
  },
  {
    "model_benchmark_id": 146,
    "benchmark_id": "truthfulqa",
    "model_id": "mistral-nemo-instruct-2407",
    "score": 0.503,
    "normalized_score": 0.503,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.369082+00:00",
    "updated_at": "2025-07-19T19:56:11.369082+00:00",
    "benchmark_name": "TruthfulQA"
  },
  {
    "model_benchmark_id": 154,
    "benchmark_id": "winogrande",
    "model_id": "mistral-nemo-instruct-2407",
    "score": 0.768,
    "normalized_score": 0.768,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.392106+00:00",
    "updated_at": "2025-07-19T19:56:11.392106+00:00",
    "benchmark_name": "Winogrande"
  }
]

================================================
FILE: data/organizations/mistral/models/mistral-nemo-instruct-2407/model.json
================================================
{
  "model_id": "mistral-nemo-instruct-2407",
  "name": "Mistral NeMo Instruct",
  "organization_id": "mistral",
  "fine_tuned_from_model_id": null,
  "description": "A state-of-the-art 12B multilingual model with a 128k context window, designed for global applications and strong in multiple languages.",
  "release_date": "2024-07-18",
  "announcement_date": "2024-07-18",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 12000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.mistral.ai/getting-started/models/models_overview/",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://mistral.ai/news/mistral-nemo/",
  "source_repo_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407",
  "source_weights_link": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407",
  "created_at": "2025-07-19T19:49:05.773595+00:00",
  "updated_at": "2025-07-19T19:49:05.773595+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/mistral/models/mistral-small-2409/model.json
================================================
{
  "model_id": "mistral-small-2409",
  "name": "Mistral Small",
  "organization_id": "mistral",
  "fine_tuned_from_model_id": null,
  "description": "An enterprise-grade 22B parameter model optimized for tasks like translation, summarization, and sentiment analysis. Offers significant improvements in human alignment, reasoning capabilities, and code generation compared to previous versions.",
  "release_date": "2024-09-17",
  "announcement_date": "2024-09-17",
  "license_id": "mistral_research_license",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 22000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.mistral.ai/api/",
  "source_playground": "https://console.mistral.ai/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://mistral.ai/news/september-24-release/",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/mistralai/Mistral-Small-Instruct-2409",
  "created_at": "2025-07-19T19:49:05.809465+00:00",
  "updated_at": "2025-07-19T19:49:05.809465+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/mistral/models/mistral-small-24b-base-2501/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1411,
    "benchmark_id": "agieval",
    "model_id": "mistral-small-24b-base-2501",
    "score": 0.658,
    "normalized_score": 0.658,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.980585+00:00",
    "updated_at": "2025-07-19T19:56:13.980585+00:00",
    "benchmark_name": "AGIEval"
  },
  {
    "model_benchmark_id": 31,
    "benchmark_id": "arc-c",
    "model_id": "mistral-small-24b-base-2501",
    "score": 0.9129,
    "normalized_score": 0.9129,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.143960+00:00",
    "updated_at": "2025-07-19T19:56:11.143960+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 345,
    "benchmark_id": "gpqa",
    "model_id": "mistral-small-24b-base-2501",
    "score": 0.3437,
    "normalized_score": 0.3437,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot, CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.748111+00:00",
    "updated_at": "2025-07-19T19:56:11.748111+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1013,
    "benchmark_id": "gsm8k",
    "model_id": "mistral-small-24b-base-2501",
    "score": 0.8073,
    "normalized_score": 0.8073,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot, maj@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.111924+00:00",
    "updated_at": "2025-07-19T19:56:13.111924+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 424,
    "benchmark_id": "math",
    "model_id": "mistral-small-24b-base-2501",
    "score": 0.4598,
    "normalized_score": 0.4598,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot, MaJ",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.898806+00:00",
    "updated_at": "2025-07-19T19:56:11.898806+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1195,
    "benchmark_id": "mbpp",
    "model_id": "mistral-small-24b-base-2501",
    "score": 0.6964,
    "normalized_score": 0.6964,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.516399+00:00",
    "updated_at": "2025-07-19T19:56:13.516399+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 113,
    "benchmark_id": "mmlu",
    "model_id": "mistral-small-24b-base-2501",
    "score": 0.8073,
    "normalized_score": 0.8073,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.311218+00:00",
    "updated_at": "2025-07-19T19:56:11.311218+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 217,
    "benchmark_id": "mmlu-pro",
    "model_id": "mistral-small-24b-base-2501",
    "score": 0.5437,
    "normalized_score": 0.5437,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.511957+00:00",
    "updated_at": "2025-07-19T19:56:11.511957+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 254,
    "benchmark_id": "triviaqa",
    "model_id": "mistral-small-24b-base-2501",
    "score": 0.8032,
    "normalized_score": 0.8032,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.585944+00:00",
    "updated_at": "2025-07-19T19:56:11.585944+00:00",
    "benchmark_name": "TriviaQA"
  }
]

================================================
FILE: data/organizations/mistral/models/mistral-small-24b-base-2501/model.json
================================================
{
  "model_id": "mistral-small-24b-base-2501",
  "name": "Mistral Small 3 24B Base",
  "organization_id": "mistral",
  "fine_tuned_from_model_id": null,
  "description": "Mistral Small 3 is competitive with larger models such as Llama 3.3 70B or Qwen 32B, and is an excellent open replacement for opaque proprietary models like GPT4o-mini. Mistral Small 3 is on par with Llama 3.3 70B instruct, while being more than 3x faster on the same hardware.",
  "release_date": "2025-01-30",
  "announcement_date": "2025-01-30",
  "license_id": "apache_2_0",
  "multimodal": true,
  "knowledge_cutoff": "2023-10-01",
  "param_count": 23600000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://console.mistral.ai/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://mistral.ai/news/mistral-small-3",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Base-2501",
  "created_at": "2025-07-19T19:49:05.791166+00:00",
  "updated_at": "2025-07-19T19:49:05.791166+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/mistral/models/mistral-small-24b-instruct-2501/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1465,
    "benchmark_id": "arena-hard",
    "model_id": "mistral-small-24b-instruct-2501",
    "score": 0.876,
    "normalized_score": 0.876,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.120697+00:00",
    "updated_at": "2025-07-19T19:56:14.120697+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 344,
    "benchmark_id": "gpqa",
    "model_id": "mistral-small-24b-instruct-2501",
    "score": 0.453,
    "normalized_score": 0.453,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501",
    "verified_by_llmstats": false,
    "analysis_method": "5 shot COT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.746578+00:00",
    "updated_at": "2025-07-19T19:56:11.746578+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 807,
    "benchmark_id": "humaneval",
    "model_id": "mistral-small-24b-instruct-2501",
    "score": 0.848,
    "normalized_score": 0.848,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501",
    "verified_by_llmstats": false,
    "analysis_method": "5 shot COT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.682647+00:00",
    "updated_at": "2025-07-19T19:56:12.682647+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 630,
    "benchmark_id": "ifeval",
    "model_id": "mistral-small-24b-instruct-2501",
    "score": 0.829,
    "normalized_score": 0.829,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.295754+00:00",
    "updated_at": "2025-07-19T19:56:12.295754+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 423,
    "benchmark_id": "math",
    "model_id": "mistral-small-24b-instruct-2501",
    "score": 0.706,
    "normalized_score": 0.706,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501",
    "verified_by_llmstats": false,
    "analysis_method": "instruct",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.896887+00:00",
    "updated_at": "2025-07-19T19:56:11.896887+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 216,
    "benchmark_id": "mmlu-pro",
    "model_id": "mistral-small-24b-instruct-2501",
    "score": 0.663,
    "normalized_score": 0.663,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501",
    "verified_by_llmstats": false,
    "analysis_method": "5 shot COT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.510254+00:00",
    "updated_at": "2025-07-19T19:56:11.510254+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1613,
    "benchmark_id": "mt-bench",
    "model_id": "mistral-small-24b-instruct-2501",
    "score": 0.835,
    "normalized_score": 0.835,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.537073+00:00",
    "updated_at": "2025-07-19T19:56:14.537073+00:00",
    "benchmark_name": "MT-Bench"
  },
  {
    "model_benchmark_id": 1818,
    "benchmark_id": "wild-bench",
    "model_id": "mistral-small-24b-instruct-2501",
    "score": 0.522,
    "normalized_score": 0.522,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.128734+00:00",
    "updated_at": "2025-07-19T19:56:15.128734+00:00",
    "benchmark_name": "Wild Bench"
  }
]

================================================
FILE: data/organizations/mistral/models/mistral-small-24b-instruct-2501/model.json
================================================
{
  "model_id": "mistral-small-24b-instruct-2501",
  "name": "Mistral Small 3 24B Instruct",
  "organization_id": "mistral",
  "fine_tuned_from_model_id": null,
  "description": "Mistral Small 3 is a 24B-parameter LLM licensed under Apache-2.0. It focuses on low-latency, high-efficiency instruction following, maintaining performance comparable to larger models. It provides quick, accurate responses for conversational agents, function calling, and domain-specific fine-tuning. Suitable for local inference when quantized, it rivals models 2\u20133\u00d7 its size while using significantly fewer compute resources.",
  "release_date": "2025-01-30",
  "announcement_date": "2025-01-30",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": "2023-10-01",
  "param_count": 24000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.mistral.ai/api/",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://mistral.ai/news/mistral-small-3/",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501",
  "created_at": "2025-07-19T19:49:05.788628+00:00",
  "updated_at": "2025-07-19T19:49:05.788628+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/mistral/models/mistral-small-3.1-24b-base-2503/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 346,
    "benchmark_id": "gpqa",
    "model_id": "mistral-small-3.1-24b-base-2503",
    "score": 0.375,
    "normalized_score": 0.375,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.749533+00:00",
    "updated_at": "2025-07-19T19:56:11.749533+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 114,
    "benchmark_id": "mmlu",
    "model_id": "mistral-small-3.1-24b-base-2503",
    "score": 0.8101,
    "normalized_score": 0.8101,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.312907+00:00",
    "updated_at": "2025-07-19T19:56:11.312907+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 218,
    "benchmark_id": "mmlu-pro",
    "model_id": "mistral-small-3.1-24b-base-2503",
    "score": 0.5603,
    "normalized_score": 0.5603,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.513719+00:00",
    "updated_at": "2025-07-19T19:56:11.513719+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 587,
    "benchmark_id": "mmmu",
    "model_id": "mistral-small-3.1-24b-base-2503",
    "score": 0.5927,
    "normalized_score": 0.5927,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503",
    "verified_by_llmstats": false,
    "analysis_method": "CoT accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.207080+00:00",
    "updated_at": "2025-07-19T19:56:12.207080+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 255,
    "benchmark_id": "triviaqa",
    "model_id": "mistral-small-3.1-24b-base-2503",
    "score": 0.805,
    "normalized_score": 0.805,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.587622+00:00",
    "updated_at": "2025-07-19T19:56:11.587622+00:00",
    "benchmark_name": "TriviaQA"
  }
]


================================================
FILE: data/organizations/mistral/models/mistral-small-3.1-24b-base-2503/model.json
================================================
{
  "model_id": "mistral-small-3.1-24b-base-2503",
  "name": "Mistral Small 3.1 24B Base",
  "organization_id": "mistral",
  "fine_tuned_from_model_id": null,
  "description": "Pretrained base model version of Mistral Small 3.1. Features improved text performance, multimodal understanding, multilingual capabilities, and an expanded 128k token context window compared to Mistral Small 3. Designed for fine-tuning.",
  "release_date": "2025-03-17",
  "announcement_date": "2025-03-17",
  "license_id": "apache_2_0",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 24000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://console.mistral.ai/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://mistral.ai/news/mistral-small-3-1",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503",
  "created_at": "2025-07-19T19:49:05.793911+00:00",
  "updated_at": "2025-07-19T19:49:05.793911+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/mistral/models/mistral-small-3.1-24b-instruct-2503/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 340,
    "benchmark_id": "gpqa",
    "model_id": "mistral-small-3.1-24b-instruct-2503",
    "score": 0.4596,
    "normalized_score": 0.4596,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond, 5-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.740584+00:00",
    "updated_at": "2025-07-19T19:56:11.741944+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 805,
    "benchmark_id": "humaneval",
    "model_id": "mistral-small-3.1-24b-instruct-2503",
    "score": 0.8841,
    "normalized_score": 0.8841,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.677771+00:00",
    "updated_at": "2025-07-19T19:56:12.677771+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 421,
    "benchmark_id": "math",
    "model_id": "mistral-small-3.1-24b-instruct-2503",
    "score": 0.693,
    "normalized_score": 0.693,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.893255+00:00",
    "updated_at": "2025-07-19T19:56:11.893255+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1194,
    "benchmark_id": "mbpp",
    "model_id": "mistral-small-3.1-24b-instruct-2503",
    "score": 0.7471,
    "normalized_score": 0.7471,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.514872+00:00",
    "updated_at": "2025-07-19T19:56:13.514872+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 110,
    "benchmark_id": "mmlu",
    "model_id": "mistral-small-3.1-24b-instruct-2503",
    "score": 0.8062,
    "normalized_score": 0.8062,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.306426+00:00",
    "updated_at": "2025-07-19T19:56:11.306426+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 215,
    "benchmark_id": "mmlu-pro",
    "model_id": "mistral-small-3.1-24b-instruct-2503",
    "score": 0.6676,
    "normalized_score": 0.6676,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.508555+00:00",
    "updated_at": "2025-07-19T19:56:11.508555+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 585,
    "benchmark_id": "mmmu",
    "model_id": "mistral-small-3.1-24b-instruct-2503",
    "score": 0.5927,
    "normalized_score": 0.5927,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    "verified_by_llmstats": false,
    "analysis_method": "CoT accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.203401+00:00",
    "updated_at": "2025-07-19T19:56:12.203401+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 237,
    "benchmark_id": "simpleqa",
    "model_id": "mistral-small-3.1-24b-instruct-2503",
    "score": 0.1043,
    "normalized_score": 0.1043,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    "verified_by_llmstats": false,
    "analysis_method": "TotalAcc, Correct",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.552923+00:00",
    "updated_at": "2025-07-19T19:56:11.552923+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 251,
    "benchmark_id": "triviaqa",
    "model_id": "mistral-small-3.1-24b-instruct-2503",
    "score": 0.805,
    "normalized_score": 0.805,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.579482+00:00",
    "updated_at": "2025-07-19T19:56:11.579482+00:00",
    "benchmark_name": "TriviaQA"
  }
]

================================================
FILE: data/organizations/mistral/models/mistral-small-3.1-24b-instruct-2503/model.json
================================================
{
  "model_id": "mistral-small-3.1-24b-instruct-2503",
  "name": "Mistral Small 3.1 24B Instruct",
  "organization_id": "mistral",
  "fine_tuned_from_model_id": null,
  "description": "Building upon Mistral Small 3 (2501), Mistral Small 3.1 (2503) adds state-of-the-art vision understanding and enhances long context capabilities up to 128k tokens without compromising text performance. With 24 billion parameters, this model achieves top-tier capabilities in both text and vision tasks.",
  "release_date": "2025-03-17",
  "announcement_date": "2025-03-17",
  "license_id": "apache_2_0",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 24000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://console.mistral.ai/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://mistral.ai/news/mistral-small-3-1",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503",
  "created_at": "2025-07-19T19:49:05.770816+00:00",
  "updated_at": "2025-07-19T19:49:05.770816+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/mistral/models/mistral-small-3.2-24b-instruct-2506/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 16767,
    "benchmark_id": "ai2d",
    "model_id": "mistral-small-3.2-24b-instruct-2506",
    "score": 0.9291,
    "normalized_score": 0.9291,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:15.105841+00:00",
    "updated_at": "2025-08-03T22:06:15.105841+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 16768,
    "benchmark_id": "arena-hard",
    "model_id": "mistral-small-3.2-24b-instruct-2506",
    "score": 0.431,
    "normalized_score": 0.431,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
    "verified_by_llmstats": false,
    "analysis_method": "v2",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:15.107885+00:00",
    "updated_at": "2025-08-03T22:06:15.107885+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 16769,
    "benchmark_id": "chartqa",
    "model_id": "mistral-small-3.2-24b-instruct-2506",
    "score": 0.874,
    "normalized_score": 0.874,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:15.109760+00:00",
    "updated_at": "2025-08-03T22:06:15.109760+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 16770,
    "benchmark_id": "docvqa",
    "model_id": "mistral-small-3.2-24b-instruct-2506",
    "score": 0.9486,
    "normalized_score": 0.9486,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:15.111977+00:00",
    "updated_at": "2025-08-03T22:06:15.111977+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 16771,
    "benchmark_id": "gpqa",
    "model_id": "mistral-small-3.2-24b-instruct-2506",
    "score": 0.4422,
    "normalized_score": 0.4422,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:15.113518+00:00",
    "updated_at": "2025-08-03T22:06:15.113518+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 16772,
    "benchmark_id": "gpqa",
    "model_id": "mistral-small-3.2-24b-instruct-2506",
    "score": 0.4613,
    "normalized_score": 0.4613,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:15.115179+00:00",
    "updated_at": "2025-08-03T22:06:15.115179+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 16773,
    "benchmark_id": "humaneval-plus",
    "model_id": "mistral-small-3.2-24b-instruct-2506",
    "score": 0.929,
    "normalized_score": 0.929,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@5",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:15.116763+00:00",
    "updated_at": "2025-08-03T22:06:15.116763+00:00",
    "benchmark_name": "HumanEval Plus"
  },
  {
    "model_benchmark_id": 16774,
    "benchmark_id": "if",
    "model_id": "mistral-small-3.2-24b-instruct-2506",
    "score": 0.8478,
    "normalized_score": 0.8478,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:15.118250+00:00",
    "updated_at": "2025-08-03T22:06:15.118250+00:00",
    "benchmark_name": "IF"
  },
  {
    "model_benchmark_id": 16775,
    "benchmark_id": "math",
    "model_id": "mistral-small-3.2-24b-instruct-2506",
    "score": 0.6942,
    "normalized_score": 0.6942,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:15.119723+00:00",
    "updated_at": "2025-08-03T22:06:15.119723+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 16776,
    "benchmark_id": "mathvista",
    "model_id": "mistral-small-3.2-24b-instruct-2506",
    "score": 0.6709,
    "normalized_score": 0.6709,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:15.121246+00:00",
    "updated_at": "2025-08-03T22:06:15.121246+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 16777,
    "benchmark_id": "mbpp-plus",
    "model_id": "mistral-small-3.2-24b-instruct-2506",
    "score": 0.7833,
    "normalized_score": 0.7833,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@5",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:15.122828+00:00",
    "updated_at": "2025-08-03T22:06:15.122828+00:00",
    "benchmark_name": "MBPP Plus"
  },
  {
    "model_benchmark_id": 16778,
    "benchmark_id": "mmlu",
    "model_id": "mistral-small-3.2-24b-instruct-2506",
    "score": 0.805,
    "normalized_score": 0.805,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:15.124220+00:00",
    "updated_at": "2025-08-03T22:06:15.124220+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 16779,
    "benchmark_id": "mmlu-pro",
    "model_id": "mistral-small-3.2-24b-instruct-2506",
    "score": 0.6906,
    "normalized_score": 0.6906,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:15.125972+00:00",
    "updated_at": "2025-08-03T22:06:15.125972+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 16780,
    "benchmark_id": "mmmu",
    "model_id": "mistral-small-3.2-24b-instruct-2506",
    "score": 0.625,
    "normalized_score": 0.625,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
    "verified_by_llmstats": false,
    "analysis_method": "-",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:15.127425+00:00",
    "updated_at": "2025-08-03T22:06:15.127425+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 16781,
    "benchmark_id": "simpleqa",
    "model_id": "mistral-small-3.2-24b-instruct-2506",
    "score": 0.121,
    "normalized_score": 0.121,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
    "verified_by_llmstats": false,
    "analysis_method": "TotalAcc",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:15.129114+00:00",
    "updated_at": "2025-08-03T22:06:15.129114+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 16782,
    "benchmark_id": "wild-bench",
    "model_id": "mistral-small-3.2-24b-instruct-2506",
    "score": 0.6533,
    "normalized_score": 0.6533,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
    "verified_by_llmstats": false,
    "analysis_method": "v2",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:15.130665+00:00",
    "updated_at": "2025-08-03T22:06:15.130665+00:00",
    "benchmark_name": "Wild Bench"
  }
]


================================================
FILE: data/organizations/mistral/models/mistral-small-3.2-24b-instruct-2506/model.json
================================================
{
  "model_id": "mistral-small-3.2-24b-instruct-2506",
  "name": "Mistral Small 3.2 24B Instruct",
  "organization_id": "mistral",
  "fine_tuned_from_model_id": "mistral-small-3.1-24b-base-2503",
  "description": "Mistral-Small-3.2-24B-Instruct-2506 is a minor update of Mistral-Small-3.1-24B-Instruct-2503.",
  "release_date": "2025-06-20",
  "announcement_date": "2025-06-20",
  "license_id": "apache_2_0",
  "multimodal": true,
  "knowledge_cutoff": "2023-10-01",
  "param_count": 23600000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://console.mistral.ai/",
  "source_paper": null,
  "source_scorecard_blog_link": null,
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506",
  "created_at": "2025-08-03T22:06:11.933573+00:00",
  "updated_at": "2025-08-03T22:06:11.933573+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/mistral/models/pixtral-12b-2409/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 874,
    "benchmark_id": "chartqa",
    "model_id": "pixtral-12b-2409",
    "score": 0.818,
    "normalized_score": 0.818,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/",
    "verified_by_llmstats": false,
    "analysis_method": "Chain of Thought (CoT)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.822444+00:00",
    "updated_at": "2025-07-19T19:56:12.822444+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 899,
    "benchmark_id": "docvqa",
    "model_id": "pixtral-12b-2409",
    "score": 0.907,
    "normalized_score": 0.907,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/",
    "verified_by_llmstats": false,
    "analysis_method": "ANLS",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.871485+00:00",
    "updated_at": "2025-07-19T19:56:12.871485+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 808,
    "benchmark_id": "humaneval",
    "model_id": "pixtral-12b-2409",
    "score": 0.72,
    "normalized_score": 0.72,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.684555+00:00",
    "updated_at": "2025-07-19T19:56:12.684555+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 631,
    "benchmark_id": "ifeval",
    "model_id": "pixtral-12b-2409",
    "score": 0.613,
    "normalized_score": 0.613,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/",
    "verified_by_llmstats": false,
    "analysis_method": "Text Instruction Following Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.297384+00:00",
    "updated_at": "2025-07-19T19:56:12.297384+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 425,
    "benchmark_id": "math",
    "model_id": "pixtral-12b-2409",
    "score": 0.481,
    "normalized_score": 0.481,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.900275+00:00",
    "updated_at": "2025-07-19T19:56:11.900275+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 537,
    "benchmark_id": "mathvista",
    "model_id": "pixtral-12b-2409",
    "score": 0.58,
    "normalized_score": 0.58,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/",
    "verified_by_llmstats": false,
    "analysis_method": "Chain of Thought (CoT)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.111272+00:00",
    "updated_at": "2025-07-19T19:56:12.111272+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1822,
    "benchmark_id": "mm-if-eval",
    "model_id": "pixtral-12b-2409",
    "score": 0.527,
    "normalized_score": 0.527,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/",
    "verified_by_llmstats": false,
    "analysis_method": "Multimodal Instruction Following Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.145578+00:00",
    "updated_at": "2025-07-19T19:56:15.145578+00:00",
    "benchmark_name": "MM IF-Eval"
  },
  {
    "model_benchmark_id": 115,
    "benchmark_id": "mmlu",
    "model_id": "pixtral-12b-2409",
    "score": 0.692,
    "normalized_score": 0.692,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.314507+00:00",
    "updated_at": "2025-07-19T19:56:11.314507+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1733,
    "benchmark_id": "mm-mt-bench",
    "model_id": "pixtral-12b-2409",
    "score": 0.605,
    "normalized_score": 0.605,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/",
    "verified_by_llmstats": false,
    "analysis_method": "Multimodal MT-Bench Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.887276+00:00",
    "updated_at": "2025-07-19T19:56:14.887276+00:00",
    "benchmark_name": "MM-MT-Bench"
  },
  {
    "model_benchmark_id": 588,
    "benchmark_id": "mmmu",
    "model_id": "pixtral-12b-2409",
    "score": 0.525,
    "normalized_score": 0.525,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/",
    "verified_by_llmstats": false,
    "analysis_method": "Chain of Thought (CoT)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.209409+00:00",
    "updated_at": "2025-07-19T19:56:12.209409+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1614,
    "benchmark_id": "mt-bench",
    "model_id": "pixtral-12b-2409",
    "score": 0.768,
    "normalized_score": 0.768,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/",
    "verified_by_llmstats": false,
    "analysis_method": "Text MT-Bench Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.539185+00:00",
    "updated_at": "2025-07-19T19:56:14.539185+00:00",
    "benchmark_name": "MT-Bench"
  },
  {
    "model_benchmark_id": 1575,
    "benchmark_id": "vqav2",
    "model_id": "pixtral-12b-2409",
    "score": 0.786,
    "normalized_score": 0.786,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-12b/",
    "verified_by_llmstats": false,
    "analysis_method": "VQA Match",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.416120+00:00",
    "updated_at": "2025-07-19T19:56:14.416120+00:00",
    "benchmark_name": "VQAv2"
  }
]

================================================
FILE: data/organizations/mistral/models/pixtral-12b-2409/model.json
================================================
{
  "model_id": "pixtral-12b-2409",
  "name": "Pixtral-12B",
  "organization_id": "mistral",
  "fine_tuned_from_model_id": null,
  "description": "A 12B parameter multimodal model with a 400M parameter vision encoder, capable of understanding both natural images and documents. Excels at multimodal tasks while maintaining strong text-only performance. Supports variable image sizes and multiple images in context.",
  "release_date": "2024-09-17",
  "announcement_date": "2024-09-17",
  "license_id": "apache_2_0",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 12400000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.mistral.ai/platform/endpoints/",
  "source_playground": "https://chat.mistral.ai",
  "source_paper": null,
  "source_scorecard_blog_link": "https://mistral.ai/news/pixtral-12b/",
  "source_repo_link": "https://huggingface.co/mistralai/Pixtral-12B-2409",
  "source_weights_link": "https://huggingface.co/mistralai/Pixtral-12B-2409",
  "created_at": "2025-07-19T19:49:05.802013+00:00",
  "updated_at": "2025-07-19T19:49:05.802013+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/mistral/models/pixtral-large/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1261,
    "benchmark_id": "ai2d",
    "model_id": "pixtral-large",
    "score": 0.938,
    "normalized_score": 0.938,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-large/",
    "verified_by_llmstats": false,
    "analysis_method": "BBox",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.645378+00:00",
    "updated_at": "2025-07-19T19:56:13.645378+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 873,
    "benchmark_id": "chartqa",
    "model_id": "pixtral-large",
    "score": 0.881,
    "normalized_score": 0.881,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-large/",
    "verified_by_llmstats": false,
    "analysis_method": "CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.820802+00:00",
    "updated_at": "2025-07-19T19:56:12.820802+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 898,
    "benchmark_id": "docvqa",
    "model_id": "pixtral-large",
    "score": 0.933,
    "normalized_score": 0.933,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-large/",
    "verified_by_llmstats": false,
    "analysis_method": "ANLS",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.869454+00:00",
    "updated_at": "2025-07-19T19:56:12.869454+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 536,
    "benchmark_id": "mathvista",
    "model_id": "pixtral-large",
    "score": 0.694,
    "normalized_score": 0.694,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-large/",
    "verified_by_llmstats": false,
    "analysis_method": "CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.109764+00:00",
    "updated_at": "2025-07-19T19:56:12.109764+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1732,
    "benchmark_id": "mm-mt-bench",
    "model_id": "pixtral-large",
    "score": 0.74,
    "normalized_score": 0.74,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-large/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4o Judge",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.885715+00:00",
    "updated_at": "2025-07-19T19:56:14.885715+00:00",
    "benchmark_name": "MM-MT-Bench"
  },
  {
    "model_benchmark_id": 586,
    "benchmark_id": "mmmu",
    "model_id": "pixtral-large",
    "score": 0.64,
    "normalized_score": 0.64,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-large/",
    "verified_by_llmstats": false,
    "analysis_method": "CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.205240+00:00",
    "updated_at": "2025-07-19T19:56:12.205240+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1574,
    "benchmark_id": "vqav2",
    "model_id": "pixtral-large",
    "score": 0.809,
    "normalized_score": 0.809,
    "is_self_reported": true,
    "self_reported_source_link": "https://mistral.ai/news/pixtral-large/",
    "verified_by_llmstats": false,
    "analysis_method": "VQA Match",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.414450+00:00",
    "updated_at": "2025-07-19T19:56:14.414450+00:00",
    "benchmark_name": "VQAv2"
  }
]

================================================
FILE: data/organizations/mistral/models/pixtral-large/model.json
================================================
{
  "model_id": "pixtral-large",
  "name": "Pixtral Large",
  "organization_id": "mistral",
  "fine_tuned_from_model_id": "mistral-large-2-2407",
  "description": "A 124B parameter multimodal model built on top of Mistral Large 2, featuring frontier-level image understanding capabilities. Excels at understanding documents, charts, and natural images while maintaining strong text-only performance. Features a 123B multimodal decoder and 1B parameter vision encoder with a 128K context window supporting up to 30 high-resolution images.",
  "release_date": "2024-11-18",
  "announcement_date": "2024-11-18",
  "license_id": "mistral_research_license_(mrl)_for_research;_mistral_commercial_license_for_commercial_use",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 124000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://mistral.ai/",
  "source_playground": "https://chat.mistral.ai/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://mistral.ai/news/pixtral-large/",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/mistralai/Pixtral-Large-Instruct-2411",
  "created_at": "2025-07-19T19:49:05.913427+00:00",
  "updated_at": "2025-07-19T19:49:05.913427+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/mistral/organization.json
================================================
{
  "organization_id": "mistral",
  "name": "Mistral AI",
  "website": "https://mistral.ai",
  "description": "French AI company",
  "country": "FR",
  "created_at": "2025-07-19T19:49:05.769198+00:00",
  "updated_at": "2025-07-19T19:49:05.769198+00:00"
}


================================================
FILE: data/organizations/moonshotai/models/kimi-k1.5/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 444,
    "benchmark_id": "aime-2024",
    "model_id": "kimi-k1.5",
    "score": 0.775,
    "normalized_score": 0.775,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.945090+00:00",
    "updated_at": "2025-07-19T19:56:11.945090+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 435,
    "benchmark_id": "c-eval",
    "model_id": "kimi-k1.5",
    "score": 0.883,
    "normalized_score": 0.883,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5",
    "verified_by_llmstats": false,
    "analysis_method": "Exact Match",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.922484+00:00",
    "updated_at": "2025-07-19T19:56:11.922484+00:00",
    "benchmark_name": "C-Eval"
  },
  {
    "model_benchmark_id": 599,
    "benchmark_id": "cluewsc",
    "model_id": "kimi-k1.5",
    "score": 0.914,
    "normalized_score": 0.914,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5",
    "verified_by_llmstats": false,
    "analysis_method": "Exact Match",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.236097+00:00",
    "updated_at": "2025-07-19T19:56:12.236097+00:00",
    "benchmark_name": "CLUEWSC"
  },
  {
    "model_benchmark_id": 602,
    "benchmark_id": "ifeval",
    "model_id": "kimi-k1.5",
    "score": 0.872,
    "normalized_score": 0.872,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5",
    "verified_by_llmstats": false,
    "analysis_method": "Exact Match",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.244895+00:00",
    "updated_at": "2025-07-19T19:56:12.244895+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 514,
    "benchmark_id": "livecodebench-v5-24.12-25.2",
    "model_id": "kimi-k1.5",
    "score": 0.625,
    "normalized_score": 0.625,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.068737+00:00",
    "updated_at": "2025-07-19T19:56:12.068737+00:00",
    "benchmark_name": "LiveCodeBench v5 24.12-25.2"
  },
  {
    "model_benchmark_id": 492,
    "benchmark_id": "math-500",
    "model_id": "kimi-k1.5",
    "score": 0.962,
    "normalized_score": 0.962,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5",
    "verified_by_llmstats": false,
    "analysis_method": "Exact Match",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.029931+00:00",
    "updated_at": "2025-07-19T19:56:12.029931+00:00",
    "benchmark_name": "MATH-500"
  },
  {
    "model_benchmark_id": 515,
    "benchmark_id": "mathvista",
    "model_id": "kimi-k1.5",
    "score": 0.749,
    "normalized_score": 0.749,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.071814+00:00",
    "updated_at": "2025-07-19T19:56:12.071814+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 58,
    "benchmark_id": "mmlu",
    "model_id": "kimi-k1.5",
    "score": 0.874,
    "normalized_score": 0.874,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5",
    "verified_by_llmstats": false,
    "analysis_method": "Exact Match",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.207582+00:00",
    "updated_at": "2025-07-19T19:56:11.207582+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 549,
    "benchmark_id": "mmmu",
    "model_id": "kimi-k1.5",
    "score": 0.7,
    "normalized_score": 0.7,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-k1.5",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.132422+00:00",
    "updated_at": "2025-07-19T19:56:12.132422+00:00",
    "benchmark_name": "MMMU"
  }
]

================================================
FILE: data/organizations/moonshotai/models/kimi-k1.5/model.json
================================================
{
  "model_id": "kimi-k1.5",
  "name": "Kimi-k1.5",
  "organization_id": "moonshotai",
  "fine_tuned_from_model_id": null,
  "description": "Kimi k1.5 is a next-generation multimodal large language model developed by Moonshot AI. It incorporates advanced reinforcement learning (RL) and scalable multimodal reasoning, delivering state-of-the-art performance in math, code, vision, and long-context reasoning tasks.",
  "release_date": "2025-01-20",
  "announcement_date": "2025-01-20",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.moonshot.cn/docs/api-reference",
  "source_playground": "https://kimi.ai/",
  "source_paper": "https://arxiv.org/abs/2501.12599",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/MoonshotAI/Kimi-k1.5",
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.426406+00:00",
  "updated_at": "2025-07-19T19:49:05.426406+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/moonshotai/models/kimi-k2-0905/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 9001,
    "benchmark_id": "gpqa",
    "model_id": "kimi-k2-0905",
    "score": 0.758,
    "normalized_score": 0.758,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshot.cn/blog/kimi-k2-0905",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2024-09-05T00:00:00.000000+00:00",
    "updated_at": "2024-09-15T00:00:00.000000+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 9002,
    "benchmark_id": "mmlu",
    "model_id": "kimi-k2-0905",
    "score": 0.902,
    "normalized_score": 0.902,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshot.cn/blog/kimi-k2-0905",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2024-09-05T00:00:00.000000+00:00",
    "updated_at": "2024-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 9003,
    "benchmark_id": "math",
    "model_id": "kimi-k2-0905",
    "score": 0.891,
    "normalized_score": 0.891,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshot.cn/blog/kimi-k2-0905",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2024-09-05T00:00:00.000000+00:00",
    "updated_at": "2024-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 9004,
    "benchmark_id": "humaneval",
    "model_id": "kimi-k2-0905",
    "score": 0.945,
    "normalized_score": 0.945,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshot.cn/blog/kimi-k2-0905",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2024-09-05T00:00:00.000000+00:00",
    "updated_at": "2024-09-15T00:00:00.000000+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 9005,
    "benchmark_id": "mmlu-pro",
    "model_id": "kimi-k2-0905",
    "score": 0.825,
    "normalized_score": 0.825,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshot.cn/blog/kimi-k2-0905",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2024-09-05T00:00:00.000000+00:00",
    "updated_at": "2024-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 9006,
    "benchmark_id": "aime-2024",
    "model_id": "kimi-k2-0905",
    "score": 0.72,
    "normalized_score": 0.72,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshot.cn/blog/kimi-k2-0905",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2024-09-05T00:00:00.000000+00:00",
    "updated_at": "2024-09-15T00:00:00.000000+00:00",
    "benchmark_name": "AIME 2024"
  }
]


================================================
FILE: data/organizations/moonshotai/models/kimi-k2-0905/model.json
================================================
{
  "model_id": "kimi-k2-0905",
  "name": "Kimi K2 0905",
  "organization_id": "moonshotai",
  "fine_tuned_from_model_id": "kimi-k2-instruct",
  "description": "Kimi K2 0905 is the September update of Kimi K2 0711. It is a large-scale Mixture-of-Experts (MoE) language model developed by Moonshot AI, featuring 1 trillion total parameters with 32 billion active per forward pass. It supports long-context inference up to 256k tokens, extended from the previous 128k. This update improves agentic coding with higher accuracy and better generalization across scaffolds, and enhances frontend coding with more aesthetic and functional outputs for web, 3D, and related tasks. The model is trained with a novel stack incorporating the MuonClip optimizer for stable large-scale MoE training.",
  "release_date": "2025-09-05",
  "announcement_date": "2025-09-05",
  "license_id": "proprietary",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 1000000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.moonshot.cn/",
  "source_playground": "https://kimi.moonshot.cn/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://moonshot.cn/blog/kimi-k2-0905",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-09-14T00:00:00.000000+00:00",
  "updated_at": "2025-09-14T00:00:00.000000+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/moonshotai/models/kimi-k2-base/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 434,
    "benchmark_id": "c-eval",
    "model_id": "kimi-k2-base",
    "score": 0.925,
    "normalized_score": 0.925,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.920573+00:00",
    "updated_at": "2025-07-19T19:56:11.920573+00:00",
    "benchmark_name": "C-Eval"
  },
  {
    "model_benchmark_id": 440,
    "benchmark_id": "csimpleqa",
    "model_id": "kimi-k2-base",
    "score": 0.776,
    "normalized_score": 0.776,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2",
    "verified_by_llmstats": false,
    "analysis_method": "Correct",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.934566+00:00",
    "updated_at": "2025-07-19T19:56:11.934566+00:00",
    "benchmark_name": "CSimpleQA"
  },
  {
    "model_benchmark_id": 369,
    "benchmark_id": "evalplus",
    "model_id": "kimi-k2-base",
    "score": 0.803,
    "normalized_score": 0.803,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.796250+00:00",
    "updated_at": "2025-07-19T19:56:11.796250+00:00",
    "benchmark_name": "EvalPlus"
  },
  {
    "model_benchmark_id": 256,
    "benchmark_id": "gpqa",
    "model_id": "kimi-k2-base",
    "score": 0.481,
    "normalized_score": 0.481,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond Avg@8",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.591508+00:00",
    "updated_at": "2025-07-19T19:56:11.591508+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 158,
    "benchmark_id": "gsm8k",
    "model_id": "kimi-k2-base",
    "score": 0.921,
    "normalized_score": 0.921,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.403308+00:00",
    "updated_at": "2025-07-19T19:56:11.403308+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 367,
    "benchmark_id": "livecodebench-v6",
    "model_id": "kimi-k2-base",
    "score": 0.263,
    "normalized_score": 0.263,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.789592+00:00",
    "updated_at": "2025-07-19T19:56:11.789592+00:00",
    "benchmark_name": "LiveCodeBench v6"
  },
  {
    "model_benchmark_id": 373,
    "benchmark_id": "math",
    "model_id": "kimi-k2-base",
    "score": 0.702,
    "normalized_score": 0.702,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.808795+00:00",
    "updated_at": "2025-07-19T19:56:11.808795+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 57,
    "benchmark_id": "mmlu",
    "model_id": "kimi-k2-base",
    "score": 0.878,
    "normalized_score": 0.878,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.205746+00:00",
    "updated_at": "2025-07-19T19:56:11.205746+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 161,
    "benchmark_id": "mmlu-pro",
    "model_id": "kimi-k2-base",
    "score": 0.692,
    "normalized_score": 0.692,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.410852+00:00",
    "updated_at": "2025-07-19T19:56:11.410852+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 221,
    "benchmark_id": "mmlu-redux-2.0",
    "model_id": "kimi-k2-base",
    "score": 0.902,
    "normalized_score": 0.902,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.520883+00:00",
    "updated_at": "2025-07-19T19:56:11.520883+00:00",
    "benchmark_name": "MMLU-redux-2.0"
  },
  {
    "model_benchmark_id": 222,
    "benchmark_id": "simpleqa",
    "model_id": "kimi-k2-base",
    "score": 0.353,
    "normalized_score": 0.353,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2",
    "verified_by_llmstats": false,
    "analysis_method": "Correct",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.524097+00:00",
    "updated_at": "2025-07-19T19:56:11.524097+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 364,
    "benchmark_id": "supergpqa",
    "model_id": "kimi-k2-base",
    "score": 0.447,
    "normalized_score": 0.447,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.781413+00:00",
    "updated_at": "2025-07-19T19:56:11.781413+00:00",
    "benchmark_name": "SuperGPQA"
  },
  {
    "model_benchmark_id": 243,
    "benchmark_id": "triviaqa",
    "model_id": "kimi-k2-base",
    "score": 0.851,
    "normalized_score": 0.851,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/MoonshotAI/Kimi-K2",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.566226+00:00",
    "updated_at": "2025-07-19T19:56:11.566226+00:00",
    "benchmark_name": "TriviaQA"
  }
]

================================================
FILE: data/organizations/moonshotai/models/kimi-k2-base/model.json
================================================
{
  "model_id": "kimi-k2-base",
  "name": "Kimi K2 Base",
  "organization_id": "moonshotai",
  "fine_tuned_from_model_id": null,
  "description": "Kimi K2 base model is a state-of-the-art mixture-of-experts (MoE) language model with 32 billion activated parameters and 1 trillion total parameters. Trained on 15.5 trillion tokens with the MuonClip optimizer, this is the foundation model before instruction tuning. It demonstrates strong performance on knowledge, reasoning, and coding benchmarks while being optimized for agentic capabilities.",
  "release_date": "2025-07-11",
  "announcement_date": "2025-07-11",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 1000000000000,
  "training_tokens": 15500000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.moonshot.ai",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://moonshotai.github.io/Kimi-K2/",
  "source_repo_link": "https://github.com/MoonshotAI/Kimi-K2",
  "source_weights_link": "https://huggingface.co/moonshotai/Kimi-K2-Base",
  "created_at": "2025-07-19T19:49:05.422399+00:00",
  "updated_at": "2025-07-19T19:49:05.422399+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/moonshotai/models/kimi-k2-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 676,
    "benchmark_id": "acebench",
    "model_id": "kimi-k2-instruct",
    "score": 0.765,
    "normalized_score": 0.765,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.408910+00:00",
    "updated_at": "2025-07-19T19:56:12.408910+00:00",
    "benchmark_name": "AceBench"
  },
  {
    "model_benchmark_id": 657,
    "benchmark_id": "aider-polyglot",
    "model_id": "kimi-k2-instruct",
    "score": 0.6,
    "normalized_score": 0.6,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.362819+00:00",
    "updated_at": "2025-07-19T19:56:12.362819+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 445,
    "benchmark_id": "aime-2024",
    "model_id": "kimi-k2-instruct",
    "score": 0.696,
    "normalized_score": 0.696,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@64",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.946639+00:00",
    "updated_at": "2025-07-19T19:56:11.946639+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 677,
    "benchmark_id": "aime-2025",
    "model_id": "kimi-k2-instruct",
    "score": 0.495,
    "normalized_score": 0.495,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@64",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.412395+00:00",
    "updated_at": "2025-07-19T19:56:12.412395+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 715,
    "benchmark_id": "autologi",
    "model_id": "kimi-k2-instruct",
    "score": 0.895,
    "normalized_score": 0.895,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.506457+00:00",
    "updated_at": "2025-07-19T19:56:12.506457+00:00",
    "benchmark_name": "AutoLogi"
  },
  {
    "model_benchmark_id": 757,
    "benchmark_id": "cbnsl",
    "model_id": "kimi-k2-instruct",
    "score": 0.956,
    "normalized_score": 0.956,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.594017+00:00",
    "updated_at": "2025-07-19T19:56:12.594017+00:00",
    "benchmark_name": "CBNSL"
  },
  {
    "model_benchmark_id": 709,
    "benchmark_id": "cnmo-2024",
    "model_id": "kimi-k2-instruct",
    "score": 0.743,
    "normalized_score": 0.743,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@16",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.489469+00:00",
    "updated_at": "2025-07-19T19:56:12.489469+00:00",
    "benchmark_name": "CNMO 2024"
  },
  {
    "model_benchmark_id": 441,
    "benchmark_id": "csimpleqa",
    "model_id": "kimi-k2-instruct",
    "score": 0.784,
    "normalized_score": 0.784,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Correct",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.936097+00:00",
    "updated_at": "2025-07-19T19:56:11.936097+00:00",
    "benchmark_name": "CSimpleQA"
  },
  {
    "model_benchmark_id": 257,
    "benchmark_id": "gpqa",
    "model_id": "kimi-k2-instruct",
    "score": 0.751,
    "normalized_score": 0.751,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond Avg@8",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.593256+00:00",
    "updated_at": "2025-07-19T19:56:11.593256+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 159,
    "benchmark_id": "gsm8k",
    "model_id": "kimi-k2-instruct",
    "score": 0.973,
    "normalized_score": 0.973,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.405113+00:00",
    "updated_at": "2025-07-19T19:56:11.405113+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 707,
    "benchmark_id": "hmmt-2025",
    "model_id": "kimi-k2-instruct",
    "score": 0.388,
    "normalized_score": 0.388,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@32",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.482540+00:00",
    "updated_at": "2025-07-19T19:56:12.482540+00:00",
    "benchmark_name": "HMMT 2025"
  },
  {
    "model_benchmark_id": 758,
    "benchmark_id": "humaneval",
    "model_id": "kimi-k2-instruct",
    "score": 0.933,
    "normalized_score": 0.933,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.598519+00:00",
    "updated_at": "2025-07-19T19:56:12.598519+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 819,
    "benchmark_id": "humaneval-er",
    "model_id": "kimi-k2-instruct",
    "score": 0.811,
    "normalized_score": 0.811,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.707650+00:00",
    "updated_at": "2025-07-19T19:56:12.707650+00:00",
    "benchmark_name": "HumanEval-ER"
  },
  {
    "model_benchmark_id": 716,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "kimi-k2-instruct",
    "score": 0.047,
    "normalized_score": 0.047,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy (Text Only)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.510122+00:00",
    "updated_at": "2025-07-19T19:56:12.510122+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 603,
    "benchmark_id": "ifeval",
    "model_id": "kimi-k2-instruct",
    "score": 0.898,
    "normalized_score": 0.898,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Prompt Strict",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.247003+00:00",
    "updated_at": "2025-07-19T19:56:12.247003+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 745,
    "benchmark_id": "livebench",
    "model_id": "kimi-k2-instruct",
    "score": 0.764,
    "normalized_score": 0.764,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.567525+00:00",
    "updated_at": "2025-07-19T19:56:12.567525+00:00",
    "benchmark_name": "LiveBench"
  },
  {
    "model_benchmark_id": 368,
    "benchmark_id": "livecodebench-v6",
    "model_id": "kimi-k2-instruct",
    "score": 0.537,
    "normalized_score": 0.537,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.791826+00:00",
    "updated_at": "2025-07-19T19:56:11.791826+00:00",
    "benchmark_name": "LiveCodeBench v6"
  },
  {
    "model_benchmark_id": 493,
    "benchmark_id": "math-500",
    "model_id": "kimi-k2-instruct",
    "score": 0.974,
    "normalized_score": 0.974,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.031465+00:00",
    "updated_at": "2025-07-19T19:56:12.031465+00:00",
    "benchmark_name": "MATH-500"
  },
  {
    "model_benchmark_id": 59,
    "benchmark_id": "mmlu",
    "model_id": "kimi-k2-instruct",
    "score": 0.895,
    "normalized_score": 0.895,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.209924+00:00",
    "updated_at": "2025-07-19T19:56:11.209924+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 162,
    "benchmark_id": "mmlu-pro",
    "model_id": "kimi-k2-instruct",
    "score": 0.811,
    "normalized_score": 0.811,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.412849+00:00",
    "updated_at": "2025-07-19T19:56:11.412849+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 727,
    "benchmark_id": "mmlu-redux",
    "model_id": "kimi-k2-instruct",
    "score": 0.927,
    "normalized_score": 0.927,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.531649+00:00",
    "updated_at": "2025-07-19T19:56:12.531649+00:00",
    "benchmark_name": "MMLU-Redux"
  },
  {
    "model_benchmark_id": 739,
    "benchmark_id": "multichallenge",
    "model_id": "kimi-k2-instruct",
    "score": 0.541,
    "normalized_score": 0.541,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.554319+00:00",
    "updated_at": "2025-07-19T19:56:12.554319+00:00",
    "benchmark_name": "MultiChallenge"
  },
  {
    "model_benchmark_id": 639,
    "benchmark_id": "multipl-e",
    "model_id": "kimi-k2-instruct",
    "score": 0.857,
    "normalized_score": 0.857,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.314432+00:00",
    "updated_at": "2025-07-19T19:56:12.314432+00:00",
    "benchmark_name": "MultiPL-E"
  },
  {
    "model_benchmark_id": 820,
    "benchmark_id": "musr",
    "model_id": "kimi-k2-instruct",
    "score": 0.764,
    "normalized_score": 0.764,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.711252+00:00",
    "updated_at": "2025-07-19T19:56:12.711252+00:00",
    "benchmark_name": "MuSR"
  },
  {
    "model_benchmark_id": 638,
    "benchmark_id": "ojbench",
    "model_id": "kimi-k2-instruct",
    "score": 0.271,
    "normalized_score": 0.271,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.310963+00:00",
    "updated_at": "2025-07-19T19:56:12.310963+00:00",
    "benchmark_name": "OJBench"
  },
  {
    "model_benchmark_id": 713,
    "benchmark_id": "polymath-en",
    "model_id": "kimi-k2-instruct",
    "score": 0.651,
    "normalized_score": 0.651,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@4",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.499339+00:00",
    "updated_at": "2025-07-19T19:56:12.499339+00:00",
    "benchmark_name": "PolyMath-en"
  },
  {
    "model_benchmark_id": 223,
    "benchmark_id": "simpleqa",
    "model_id": "kimi-k2-instruct",
    "score": 0.31,
    "normalized_score": 0.31,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Correct",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.526736+00:00",
    "updated_at": "2025-07-19T19:56:11.526736+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 365,
    "benchmark_id": "supergpqa",
    "model_id": "kimi-k2-instruct",
    "score": 0.572,
    "normalized_score": 0.572,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.782850+00:00",
    "updated_at": "2025-07-19T19:56:11.782850+00:00",
    "benchmark_name": "SuperGPQA"
  },
  {
    "model_benchmark_id": 651,
    "benchmark_id": "swe-bench-multilingual",
    "model_id": "kimi-k2-instruct",
    "score": 0.473,
    "normalized_score": 0.473,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Single Attempt",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.343981+00:00",
    "updated_at": "2025-07-19T19:56:12.343981+00:00",
    "benchmark_name": "SWE-bench Multilingual"
  },
  {
    "model_benchmark_id": 649,
    "benchmark_id": "swe-bench-verified-(agentic-coding)",
    "model_id": "kimi-k2-instruct",
    "score": 0.658,
    "normalized_score": 0.658,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Single Attempt",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.333761+00:00",
    "updated_at": "2025-07-19T19:56:12.333761+00:00",
    "benchmark_name": "SWE-bench Verified (Agentic Coding)"
  },
  {
    "model_benchmark_id": 648,
    "benchmark_id": "swe-bench-verified-(agentless)",
    "model_id": "kimi-k2-instruct",
    "score": 0.518,
    "normalized_score": 0.518,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Single Patch without Test",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.330548+00:00",
    "updated_at": "2025-07-19T19:56:12.330548+00:00",
    "benchmark_name": "SWE-bench Verified (Agentless)"
  },
  {
    "model_benchmark_id": 650,
    "benchmark_id": "swe-bench-verified-(multiple-attempts)",
    "model_id": "kimi-k2-instruct",
    "score": 0.716,
    "normalized_score": 0.716,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Multiple Attempts with parallel test-time compute",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.339305+00:00",
    "updated_at": "2025-07-19T19:56:12.339305+00:00",
    "benchmark_name": "SWE-bench Verified (Multiple Attempts)"
  },
  {
    "model_benchmark_id": 674,
    "benchmark_id": "tau2-airline",
    "model_id": "kimi-k2-instruct",
    "score": 0.565,
    "normalized_score": 0.565,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@4",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.401229+00:00",
    "updated_at": "2025-07-19T19:56:12.401229+00:00",
    "benchmark_name": "Tau2 airline"
  },
  {
    "model_benchmark_id": 673,
    "benchmark_id": "tau2-retail",
    "model_id": "kimi-k2-instruct",
    "score": 0.706,
    "normalized_score": 0.706,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@4",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.395604+00:00",
    "updated_at": "2025-07-19T19:56:12.395604+00:00",
    "benchmark_name": "Tau2 retail"
  },
  {
    "model_benchmark_id": 675,
    "benchmark_id": "tau2-telecom",
    "model_id": "kimi-k2-instruct",
    "score": 0.658,
    "normalized_score": 0.658,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@4",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.405145+00:00",
    "updated_at": "2025-07-19T19:56:12.405145+00:00",
    "benchmark_name": "Tau2 telecom"
  },
  {
    "model_benchmark_id": 652,
    "benchmark_id": "terminal-bench",
    "model_id": "kimi-k2-instruct",
    "score": 0.3,
    "normalized_score": 0.3,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Inhouse Framework",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.348003+00:00",
    "updated_at": "2025-07-19T19:56:12.348003+00:00",
    "benchmark_name": "Terminal-bench"
  },
  {
    "model_benchmark_id": 656,
    "benchmark_id": "terminus",
    "model_id": "kimi-k2-instruct",
    "score": 0.25,
    "normalized_score": 0.25,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.358921+00:00",
    "updated_at": "2025-07-19T19:56:12.358921+00:00",
    "benchmark_name": "Terminus"
  },
  {
    "model_benchmark_id": 714,
    "benchmark_id": "zebralogic",
    "model_id": "kimi-k2-instruct",
    "score": 0.89,
    "normalized_score": 0.89,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.502879+00:00",
    "updated_at": "2025-07-19T19:56:12.502879+00:00",
    "benchmark_name": "ZebraLogic"
  }
]

================================================
FILE: data/organizations/moonshotai/models/kimi-k2-instruct/model.json
================================================
{
  "model_id": "kimi-k2-instruct",
  "name": "Kimi K2 Instruct",
  "organization_id": "moonshotai",
  "fine_tuned_from_model_id": "kimi-k2-base",
  "description": "Kimi K2 is a state-of-the-art mixture-of-experts (MoE) language model with 32 billion activated parameters and 1 trillion total parameters. Trained with the MuonClip optimizer, it achieves exceptional performance across frontier knowledge, reasoning, and coding tasks while being meticulously optimized for agentic capabilities. The instruct variant is post-trained for drop-in, general-purpose chat and agentic experiences without long thinking.",
  "release_date": "2025-07-11",
  "announcement_date": "2025-07-11",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 1000000000000,
  "training_tokens": 15500000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.moonshot.ai",
  "source_playground": "https://kimi.com",
  "source_paper": null,
  "source_scorecard_blog_link": "https://moonshotai.github.io/Kimi-K2/",
  "source_repo_link": "https://github.com/MoonshotAI/Kimi-K2",
  "source_weights_link": "https://huggingface.co/moonshotai/Kimi-K2-Instruct",
  "created_at": "2025-07-19T19:49:05.875884+00:00",
  "updated_at": "2025-07-19T19:49:05.875884+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/moonshotai/models/kimi-k2-instruct-0905/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 10001,
    "benchmark_id": "swe-bench-verified",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.658,
    "normalized_score": 0.658,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Agentic Coding - Single Attempt",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "65.8% single attempt, 71.6% multiple",
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Swe Bench Verified"
  },
  {
    "model_benchmark_id": 10002,
    "benchmark_id": "swe-bench-multilingual",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.473,
    "normalized_score": 0.473,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Agentic Coding - Single Attempt",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Swe Bench Multilingual"
  },
  {
    "model_benchmark_id": 10003,
    "benchmark_id": "terminal-bench",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.25,
    "normalized_score": 0.25,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Terminus",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Terminal Bench"
  },
  {
    "model_benchmark_id": 10004,
    "benchmark_id": "livecodebench",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.537,
    "normalized_score": 0.537,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "v6 (Aug 24-May 25) Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Livecodebench"
  },
  {
    "model_benchmark_id": 10005,
    "benchmark_id": "ojbench",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.271,
    "normalized_score": 0.271,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Ojbench"
  },
  {
    "model_benchmark_id": 10006,
    "benchmark_id": "multipl-e",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.857,
    "normalized_score": 0.857,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Multiple"
  },
  {
    "model_benchmark_id": 10007,
    "benchmark_id": "aider-polyglot",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.6,
    "normalized_score": 0.6,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Aider Polyglot"
  },
  {
    "model_benchmark_id": 10008,
    "benchmark_id": "tau2-retail",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.706,
    "normalized_score": 0.706,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@4",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Tau2 Retail"
  },
  {
    "model_benchmark_id": 10009,
    "benchmark_id": "tau2-airline",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.565,
    "normalized_score": 0.565,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@4",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Tau2 Airline"
  },
  {
    "model_benchmark_id": 10010,
    "benchmark_id": "tau2-telecom",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.658,
    "normalized_score": 0.658,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@4",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Tau2 Telecom"
  },
  {
    "model_benchmark_id": 10011,
    "benchmark_id": "acebench",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.765,
    "normalized_score": 0.765,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Acebench"
  },
  {
    "model_benchmark_id": 10012,
    "benchmark_id": "aime-2024",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.696,
    "normalized_score": 0.696,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@64",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Aime 2024"
  },
  {
    "model_benchmark_id": 10013,
    "benchmark_id": "aime-2025",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.495,
    "normalized_score": 0.495,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@64",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Aime 2025"
  },
  {
    "model_benchmark_id": 10014,
    "benchmark_id": "math-500",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.974,
    "normalized_score": 0.974,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Math 500"
  },
  {
    "model_benchmark_id": 10015,
    "benchmark_id": "hmmt-2025",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.388,
    "normalized_score": 0.388,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@32",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Hmmt 2025"
  },
  {
    "model_benchmark_id": 10016,
    "benchmark_id": "cnmo-2024",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.743,
    "normalized_score": 0.743,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@16",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Cnmo 2024"
  },
  {
    "model_benchmark_id": 10017,
    "benchmark_id": "polymath-en",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.651,
    "normalized_score": 0.651,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@4",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Polymath En"
  },
  {
    "model_benchmark_id": 10018,
    "benchmark_id": "zebralogic",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.89,
    "normalized_score": 0.89,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Zebralogic"
  },
  {
    "model_benchmark_id": 10019,
    "benchmark_id": "autologi",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.895,
    "normalized_score": 0.895,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Autologi"
  },
  {
    "model_benchmark_id": 10020,
    "benchmark_id": "gpqa",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.751,
    "normalized_score": 0.751,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond - Avg@8",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Gpqa"
  },
  {
    "model_benchmark_id": 10021,
    "benchmark_id": "supergpqa",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.572,
    "normalized_score": 0.572,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Supergpqa"
  },
  {
    "model_benchmark_id": 10022,
    "benchmark_id": "hle",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.047,
    "normalized_score": 0.047,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Text Only",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Hle"
  },
  {
    "model_benchmark_id": 10023,
    "benchmark_id": "mmlu",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.895,
    "normalized_score": 0.895,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Mmlu"
  },
  {
    "model_benchmark_id": 10024,
    "benchmark_id": "mmlu-redux",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.927,
    "normalized_score": 0.927,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Mmlu Redux"
  },
  {
    "model_benchmark_id": 10025,
    "benchmark_id": "mmlu-pro",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.811,
    "normalized_score": 0.811,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Mmlu Pro"
  },
  {
    "model_benchmark_id": 10026,
    "benchmark_id": "ifeval",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.898,
    "normalized_score": 0.898,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Prompt Strict",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Ifeval"
  },
  {
    "model_benchmark_id": 10027,
    "benchmark_id": "multichallenge",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.541,
    "normalized_score": 0.541,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Multichallenge"
  },
  {
    "model_benchmark_id": 10028,
    "benchmark_id": "simpleqa",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.31,
    "normalized_score": 0.31,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "Correct",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Simpleqa"
  },
  {
    "model_benchmark_id": 10029,
    "benchmark_id": "livebench",
    "model_id": "kimi-k2-instruct-0905",
    "score": 0.764,
    "normalized_score": 0.764,
    "is_self_reported": true,
    "self_reported_source_link": "https://moonshotai.github.io/Kimi-K2/",
    "verified_by_llmstats": false,
    "analysis_method": "2024/11/25 Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-05T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Livebench"
  }
]


================================================
FILE: data/organizations/moonshotai/models/kimi-k2-instruct-0905/model.json
================================================
{
  "model_id": "kimi-k2-instruct-0905",
  "name": "Kimi K2-Instruct-0905",
  "organization_id": "moonshotai",
  "model_family_id": null,
  "fine_tuned_from_model_id": null,
  "description": "Kimi K2-Instruct-0905 is the latest, most capable version of Kimi K2, achieving state-of-the-art performance in frontier knowledge, math, and coding among non-thinking models. This Mixture-of-Experts model features 32 billion activated parameters and 1 trillion total parameters, meticulously optimized for agentic tasks. Key features include enhanced agentic coding intelligence, extended context length to 256K tokens, and a hybrid architecture trained with MuonClip optimizer on 15.5T tokens. The model achieves 65.8% on SWE-bench Verified (single attempt), 47.3% on SWE-bench Multilingual, and excels at tool use with 70.6% on Tau2-retail. It is a reflex-grade model without long thinking, designed to act and execute complex tasks seamlessly.",
  "release_date": "2025-09-05",
  "announcement_date": "2025-09-05",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 1000000000000,
  "training_tokens": 15500000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.moonshot.ai",
  "source_playground": "https://kimi.moonshot.cn/",
  "source_paper": "https://github.com/MoonshotAI/Kimi-K2/blob/main/tech_report.pdf",
  "source_scorecard_blog_link": "https://moonshotai.github.io/Kimi-K2/",
  "source_repo_link": "https://github.com/MoonshotAI/Kimi-K2",
  "source_weights_link": "https://huggingface.co/MoonshotAI",
  "created_at": "2025-09-05T00:00:00.000000+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00"
}


================================================
FILE: data/organizations/moonshotai/organization.json
================================================
{
  "organization_id": "moonshotai",
  "name": "Moonshot AI",
  "website": "https://moonshot.cn",
  "description": "Chinese AI company developing the Kimi series of large language models, including state-of-the-art mixture-of-experts models with long-context capabilities",
  "country": "CN",
  "created_at": "2025-07-19T19:49:05.419295+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00"
}


================================================
FILE: data/organizations/nvidia/models/llama-3.1-nemotron-70b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 24,
    "benchmark_id": "arc-c",
    "model_id": "llama-3.1-nemotron-70b-instruct",
    "score": 0.692,
    "normalized_score": 0.692,
    "is_self_reported": true,
    "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.133318+00:00",
    "updated_at": "2025-07-19T19:56:11.133318+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1005,
    "benchmark_id": "gsm8k",
    "model_id": "llama-3.1-nemotron-70b-instruct",
    "score": 0.9143,
    "normalized_score": 0.9143,
    "is_self_reported": true,
    "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.099846+00:00",
    "updated_at": "2025-07-19T19:56:13.099846+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 1811,
    "benchmark_id": "gsm8k-chat",
    "model_id": "llama-3.1-nemotron-70b-instruct",
    "score": 0.8188,
    "normalized_score": 0.8188,
    "is_self_reported": true,
    "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/",
    "verified_by_llmstats": false,
    "analysis_method": "Chat evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.104394+00:00",
    "updated_at": "2025-07-19T19:56:15.104394+00:00",
    "benchmark_name": "GSM8K Chat"
  },
  {
    "model_benchmark_id": 50,
    "benchmark_id": "hellaswag",
    "model_id": "llama-3.1-nemotron-70b-instruct",
    "score": 0.8558,
    "normalized_score": 0.8558,
    "is_self_reported": true,
    "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.188734+00:00",
    "updated_at": "2025-07-19T19:56:11.188734+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 1812,
    "benchmark_id": "instruct-humaneval",
    "model_id": "llama-3.1-nemotron-70b-instruct",
    "score": 0.7384,
    "normalized_score": 0.7384,
    "is_self_reported": true,
    "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/",
    "verified_by_llmstats": false,
    "analysis_method": "Code evaluation (n=20)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.108307+00:00",
    "updated_at": "2025-07-19T19:56:15.108307+00:00",
    "benchmark_name": "Instruct HumanEval"
  },
  {
    "model_benchmark_id": 102,
    "benchmark_id": "mmlu",
    "model_id": "llama-3.1-nemotron-70b-instruct",
    "score": 0.802,
    "normalized_score": 0.802,
    "is_self_reported": true,
    "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.292516+00:00",
    "updated_at": "2025-07-19T19:56:11.292516+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1810,
    "benchmark_id": "mmlu-chat",
    "model_id": "llama-3.1-nemotron-70b-instruct",
    "score": 0.8058,
    "normalized_score": 0.8058,
    "is_self_reported": true,
    "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/",
    "verified_by_llmstats": false,
    "analysis_method": "Chat evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.100072+00:00",
    "updated_at": "2025-07-19T19:56:15.100072+00:00",
    "benchmark_name": "MMLU Chat"
  },
  {
    "model_benchmark_id": 1611,
    "benchmark_id": "mt-bench",
    "model_id": "llama-3.1-nemotron-70b-instruct",
    "score": 0.0899,
    "normalized_score": 0.0899,
    "is_self_reported": true,
    "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/",
    "verified_by_llmstats": false,
    "analysis_method": "Chat evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.532800+00:00",
    "updated_at": "2025-07-19T19:56:14.532800+00:00",
    "benchmark_name": "MT-Bench"
  },
  {
    "model_benchmark_id": 143,
    "benchmark_id": "truthfulqa",
    "model_id": "llama-3.1-nemotron-70b-instruct",
    "score": 0.5863,
    "normalized_score": 0.5863,
    "is_self_reported": true,
    "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.363751+00:00",
    "updated_at": "2025-07-19T19:56:11.363751+00:00",
    "benchmark_name": "TruthfulQA"
  },
  {
    "model_benchmark_id": 153,
    "benchmark_id": "winogrande",
    "model_id": "llama-3.1-nemotron-70b-instruct",
    "score": 0.8453,
    "normalized_score": 0.8453,
    "is_self_reported": true,
    "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.390043+00:00",
    "updated_at": "2025-07-19T19:56:11.390043+00:00",
    "benchmark_name": "Winogrande"
  },
  {
    "model_benchmark_id": 1809,
    "benchmark_id": "xlsum-english",
    "model_id": "llama-3.1-nemotron-70b-instruct",
    "score": 0.3161,
    "normalized_score": 0.3161,
    "is_self_reported": true,
    "self_reported_source_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.094560+00:00",
    "updated_at": "2025-07-19T19:56:15.094560+00:00",
    "benchmark_name": "XLSum English"
  }
]

================================================
FILE: data/organizations/nvidia/models/llama-3.1-nemotron-70b-instruct/model.json
================================================
{
  "model_id": "llama-3.1-nemotron-70b-instruct",
  "name": "Llama 3.1 Nemotron 70B Instruct",
  "organization_id": "nvidia",
  "fine_tuned_from_model_id": "llama-3.1-70b-instruct",
  "description": "A large language model customized by NVIDIA to improve the helpfulness of LLM generated responses. It is a fine-tuned version of Llama 3.1 70B Instruct. The model was trained using RLHF (REINFORCE) with HelpSteer2-Preference prompts.",
  "release_date": "2024-10-01",
  "announcement_date": "2024-10-01",
  "license_id": "llama_3_1_community_license",
  "multimodal": false,
  "knowledge_cutoff": "2023-12-01",
  "param_count": 70000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-70b-instruct",
  "source_playground": null,
  "source_paper": "https://arxiv.org/abs/2410.01257",
  "source_scorecard_blog_link": "https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct",
  "created_at": "2025-07-19T19:49:05.908923+00:00",
  "updated_at": "2025-07-19T19:49:05.908923+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/nvidia/models/llama-3.1-nemotron-nano-8b-v1/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 698,
    "benchmark_id": "aime-2025",
    "model_id": "llama-3.1-nemotron-nano-8b-v1",
    "score": 0.471,
    "normalized_score": 0.471,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.461794+00:00",
    "updated_at": "2025-07-19T19:56:12.461794+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1586,
    "benchmark_id": "bfcl-v2",
    "model_id": "llama-3.1-nemotron-nano-8b-v1",
    "score": 0.636,
    "normalized_score": 0.636,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Score, Reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.454860+00:00",
    "updated_at": "2025-07-19T19:56:14.454860+00:00",
    "benchmark_name": "BFCL v2"
  },
  {
    "model_benchmark_id": 327,
    "benchmark_id": "gpqa",
    "model_id": "llama-3.1-nemotron-nano-8b-v1",
    "score": 0.541,
    "normalized_score": 0.541,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond, Pass@1, Reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.719213+00:00",
    "updated_at": "2025-07-19T19:56:11.719213+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 627,
    "benchmark_id": "ifeval",
    "model_id": "llama-3.1-nemotron-nano-8b-v1",
    "score": 0.793,
    "normalized_score": 0.793,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Strict Accuracy, Reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.289960+00:00",
    "updated_at": "2025-07-19T19:56:12.289960+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 510,
    "benchmark_id": "math-500",
    "model_id": "llama-3.1-nemotron-nano-8b-v1",
    "score": 0.954,
    "normalized_score": 0.954,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.059893+00:00",
    "updated_at": "2025-07-19T19:56:12.059893+00:00",
    "benchmark_name": "MATH-500"
  },
  {
    "model_benchmark_id": 1193,
    "benchmark_id": "mbpp",
    "model_id": "llama-3.1-nemotron-nano-8b-v1",
    "score": 0.846,
    "normalized_score": 0.846,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, Pass@1, Reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.512976+00:00",
    "updated_at": "2025-07-19T19:56:13.512976+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1610,
    "benchmark_id": "mt-bench",
    "model_id": "llama-3.1-nemotron-nano-8b-v1",
    "score": 0.81,
    "normalized_score": 0.81,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Score, Reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.530016+00:00",
    "updated_at": "2025-07-19T19:56:14.530016+00:00",
    "benchmark_name": "MT-Bench"
  }
]

================================================
FILE: data/organizations/nvidia/models/llama-3.1-nemotron-nano-8b-v1/model.json
================================================
{
  "model_id": "llama-3.1-nemotron-nano-8b-v1",
  "name": "Llama 3.1 Nemotron Nano 8B V1",
  "organization_id": "nvidia",
  "fine_tuned_from_model_id": null,
  "description": "Llama-3.1-Nemotron-Nano-8B-v1 is a large language model (LLM) which is a derivative of Meta Llama-3.1-8B-Instruct (AKA the reference model). It is a reasoning model that is post trained for reasoning, human chat preferences, and tasks, such as RAG and tool calling.",
  "release_date": "2025-03-18",
  "announcement_date": "2025-03-18",
  "license_id": "llama_3_1_community_license",
  "multimodal": false,
  "knowledge_cutoff": "2023-12-31",
  "param_count": 8000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1",
  "source_paper": "https://arxiv.org/abs/2502.00203",
  "source_scorecard_blog_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
  "created_at": "2025-07-19T19:49:05.733231+00:00",
  "updated_at": "2025-07-19T19:49:05.733231+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/nvidia/models/llama-3.1-nemotron-ultra-253b-v1/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 699,
    "benchmark_id": "aime-2025",
    "model_id": "llama-3.1-nemotron-ultra-253b-v1",
    "score": 0.725,
    "normalized_score": 0.725,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.463355+00:00",
    "updated_at": "2025-07-19T19:56:12.463355+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1587,
    "benchmark_id": "bfcl-v2",
    "model_id": "llama-3.1-nemotron-ultra-253b-v1",
    "score": 0.741,
    "normalized_score": 0.741,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Score, Reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.456840+00:00",
    "updated_at": "2025-07-19T19:56:14.456840+00:00",
    "benchmark_name": "BFCL v2"
  },
  {
    "model_benchmark_id": 328,
    "benchmark_id": "gpqa",
    "model_id": "llama-3.1-nemotron-ultra-253b-v1",
    "score": 0.7601,
    "normalized_score": 0.7601,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.721348+00:00",
    "updated_at": "2025-07-19T19:56:11.721348+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 628,
    "benchmark_id": "ifeval",
    "model_id": "llama-3.1-nemotron-ultra-253b-v1",
    "score": 0.8945,
    "normalized_score": 0.8945,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Strict Accuracy, Reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.292359+00:00",
    "updated_at": "2025-07-19T19:56:12.292359+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1143,
    "benchmark_id": "livecodebench",
    "model_id": "llama-3.1-nemotron-ultra-253b-v1",
    "score": 0.6631,
    "normalized_score": 0.6631,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.404565+00:00",
    "updated_at": "2025-07-19T19:56:13.404565+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 511,
    "benchmark_id": "math-500",
    "model_id": "llama-3.1-nemotron-ultra-253b-v1",
    "score": 0.97,
    "normalized_score": 0.97,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.061892+00:00",
    "updated_at": "2025-07-19T19:56:12.061892+00:00",
    "benchmark_name": "MATH-500"
  }
]

================================================
FILE: data/organizations/nvidia/models/llama-3.1-nemotron-ultra-253b-v1/model.json
================================================
{
  "model_id": "llama-3.1-nemotron-ultra-253b-v1",
  "name": "Llama 3.1 Nemotron Ultra 253B v1",
  "organization_id": "nvidia",
  "fine_tuned_from_model_id": null,
  "description": "A 253B parameter derivative of Meta Llama 3.1 405B Instruct, developed by NVIDIA using Neural Architecture Search (NAS) and vertical compression. It underwent multi-phase post-training (SFT for Math, Code, Reasoning, Chat, Tool Calling; RL with GRPO) to enhance reasoning and instruction-following. Optimized for accuracy/efficiency tradeoff on NVIDIA GPUs. Supports 128k context.",
  "release_date": "2025-04-07",
  "announcement_date": "2025-04-07",
  "license_id": "llama_3_1_community_license",
  "multimodal": false,
  "knowledge_cutoff": "2023-12-01",
  "param_count": 253000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1",
  "source_paper": "https://arxiv.org/abs/2502.00203",
  "source_scorecard_blog_link": "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1/modelcard",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/nvidia/Llama-3_1-Nemotron-Ultra-253B-v1",
  "created_at": "2025-07-19T19:49:05.735588+00:00",
  "updated_at": "2025-07-19T19:49:05.735588+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/nvidia/models/llama-3.3-nemotron-super-49b-v1/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 697,
    "benchmark_id": "aime-2025",
    "model_id": "llama-3.3-nemotron-super-49b-v1",
    "score": 0.584,
    "normalized_score": 0.584,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Reasoning On",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.459628+00:00",
    "updated_at": "2025-07-19T19:56:12.459628+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1461,
    "benchmark_id": "arena-hard",
    "model_id": "llama-3.3-nemotron-super-49b-v1",
    "score": 0.883,
    "normalized_score": 0.883,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Score, Reasoning Off",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.113375+00:00",
    "updated_at": "2025-07-19T19:56:14.113375+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 1585,
    "benchmark_id": "bfcl-v2",
    "model_id": "llama-3.3-nemotron-super-49b-v1",
    "score": 0.737,
    "normalized_score": 0.737,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Score, Reasoning On",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.452681+00:00",
    "updated_at": "2025-07-19T19:56:14.452681+00:00",
    "benchmark_name": "BFCL v2"
  },
  {
    "model_benchmark_id": 326,
    "benchmark_id": "gpqa",
    "model_id": "llama-3.3-nemotron-super-49b-v1",
    "score": 0.6667,
    "normalized_score": 0.6667,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Reasoning On",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.717785+00:00",
    "updated_at": "2025-07-19T19:56:11.717785+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 509,
    "benchmark_id": "math-500",
    "model_id": "llama-3.3-nemotron-super-49b-v1",
    "score": 0.966,
    "normalized_score": 0.966,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Reasoning On",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.058280+00:00",
    "updated_at": "2025-07-19T19:56:12.058280+00:00",
    "benchmark_name": "MATH-500"
  },
  {
    "model_benchmark_id": 1192,
    "benchmark_id": "mbpp",
    "model_id": "llama-3.3-nemotron-super-49b-v1",
    "score": 0.913,
    "normalized_score": 0.913,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1, Reasoning On",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.511549+00:00",
    "updated_at": "2025-07-19T19:56:13.511549+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1609,
    "benchmark_id": "mt-bench",
    "model_id": "llama-3.3-nemotron-super-49b-v1",
    "score": 0.917,
    "normalized_score": 0.917,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Score, Reasoning On",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.527840+00:00",
    "updated_at": "2025-07-19T19:56:14.527840+00:00",
    "benchmark_name": "MT-Bench"
  }
]

================================================
FILE: data/organizations/nvidia/models/llama-3.3-nemotron-super-49b-v1/model.json
================================================
{
  "model_id": "llama-3.3-nemotron-super-49b-v1",
  "name": "Llama-3.3 Nemotron Super 49B v1",
  "organization_id": "nvidia",
  "fine_tuned_from_model_id": null,
  "description": "Llama-3.3-Nemotron-Super-49B-v1 is a large language model (LLM) derived from Meta Llama-3.3-70B-Instruct. It's post-trained for reasoning, chat, RAG, and tool calling, offering a balance between accuracy and efficiency (optimized for single H100). It underwent multi-phase post-training including SFT and RL (RLOO, RPO).",
  "release_date": "2025-03-18",
  "announcement_date": "2025-03-18",
  "license_id": "llama_3_1_community_license",
  "multimodal": false,
  "knowledge_cutoff": "2023-12-31",
  "param_count": 49900000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1",
  "source_paper": "https://arxiv.org/abs/2502.00203",
  "source_scorecard_blog_link": "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1",
  "created_at": "2025-07-19T19:49:05.730826+00:00",
  "updated_at": "2025-07-19T19:49:05.730826+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/nvidia/models/nemotron-nano-9b-v2/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 12345,
    "benchmark_id": "aime-2025",
    "model_id": "nvidia-nemotron-nano-9b-v2",
    "score": 0.721,
    "normalized_score": 0.721,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Score, Reasoning On",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-04T16:07:30.482+00:00",
    "updated_at": "2025-10-04T16:07:30.482+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 12345,
    "benchmark_id": "math-500",
    "model_id": "nvidia-nemotron-nano-9b-v2",
    "score": 0.978,
    "normalized_score": 0.978,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Score, Reasoning On",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-04T16:07:30.482+00:00",
    "updated_at": "2025-10-04T16:07:30.482+00:00",
    "benchmark_name": "MATH-500"
  },
  {
    "model_benchmark_id": 12345,
    "benchmark_id": "gpqa",
    "model_id": "nvidia-nemotron-nano-9b-v2",
    "score": 0.640,
    "normalized_score": 0.640,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Score, Reasoning On",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-04T16:07:30.482+00:00",
    "updated_at": "2025-10-04T16:07:30.482+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 12345,
    "benchmark_id": "livecodebench",
    "model_id": "nvidia-nemotron-nano-9b-v2",
    "score": 0.711,
    "normalized_score": 0.711,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Score, Reasoning On",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-04T16:07:30.482+00:00",
    "updated_at": "2025-10-04T16:07:30.482+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 12345,
    "benchmark_id": "bfcl-v3-multiturn",
    "model_id": "nvidia-nemotron-nano-9b-v2",
    "score": 0.669,
    "normalized_score": 0.669,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Score, Reasoning On",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-04T16:07:30.482+00:00",
    "updated_at": "2025-10-04T16:07:30.482+00:00",
    "benchmark_name": "BFCL v3"
  },
  {
    "model_benchmark_id": 12345,
    "benchmark_id": "ifeval",
    "model_id": "nvidia-nemotron-nano-9b-v2",
    "score": 0.903,
    "normalized_score": 0.903,
    "is_self_reported": true,
    "self_reported_source_link": "https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard",
    "verified_by_llmstats": false,
    "analysis_method": "Score, Reasoning On",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-04T16:07:30.482+00:00",
    "updated_at": "2025-10-04T16:07:30.482+00:00",
    "benchmark_name": "IFEval"
  }
]

================================================
FILE: data/organizations/nvidia/models/nemotron-nano-9b-v2/model.json
================================================
{
  "model_id": "nvidia-nemotron-nano-9b-v2",
  "name": "Nemotron Nano 9B v2",
  "organization_id": "nvidia",
  "fine_tuned_from_model_id": null,
  "description": "NVIDIA-Nemotron-Nano-9B-v2 is a large language model (LLM) trained from scratch by NVIDIA, and designed as a unified model for both reasoning and non-reasoning tasks. It responds to user queries and tasks by first generating a reasoning trace and then concluding with a final response. The model's reasoning capabilities can be controlled via a system prompt. If the user prefers the model to provide its final answer without intermediate reasoning traces, it can be configured to do so, albeit with a slight decrease in accuracy for harder prompts that require reasoning. Conversely, allowing the model to generate reasoning traces first generally results in higher-quality final solutions to queries and tasks.",
  "release_date": "2025-08-18",
  "announcement_date": "2025-08-18",
  "license_id": "nvidia_open_model_license_agreement",
  "multimodal": false,
  "knowledge_cutoff": "2024-09",
  "param_count": 8900000000,
  "training_tokens": 21100000000000,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2",
  "source_paper": "https://arxiv.org/abs/2508.14444",
  "source_scorecard_blog_link": "https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard",
  "source_repo_link": null,
  "source_weights_link": "https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2",
  "created_at": "2025-10-02T21:51:16.835+00:00",
  "updated_at": "2025-10-02T21:51:16.835+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/nvidia/organization.json
================================================
{
  "organization_id": "nvidia",
  "name": "NVIDIA",
  "website": "https://nvidia.com",
  "description": "GPU and AI company",
  "country": "US",
  "created_at": "2025-07-19T19:49:05.728519+00:00",
  "updated_at": "2025-07-19T19:49:05.728519+00:00"
}


================================================
FILE: data/organizations/openai/models/gpt-3.5-turbo-0125/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 963,
    "benchmark_id": "drop",
    "model_id": "gpt-3.5-turbo-0125",
    "score": 0.702,
    "normalized_score": 0.702,
    "is_self_reported": false,
    "self_reported_source_link": "https://example.com/benchmark-image",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.025267+00:00",
    "updated_at": "2025-07-19T19:56:13.025267+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 359,
    "benchmark_id": "gpqa",
    "model_id": "gpt-3.5-turbo-0125",
    "score": 0.308,
    "normalized_score": 0.308,
    "is_self_reported": false,
    "self_reported_source_link": "https://example.com/benchmark-image",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.770449+00:00",
    "updated_at": "2025-07-19T19:56:11.770449+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 815,
    "benchmark_id": "humaneval",
    "model_id": "gpt-3.5-turbo-0125",
    "score": 0.68,
    "normalized_score": 0.68,
    "is_self_reported": false,
    "self_reported_source_link": "https://example.com/benchmark-image",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.697970+00:00",
    "updated_at": "2025-07-19T19:56:12.697970+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 429,
    "benchmark_id": "math",
    "model_id": "gpt-3.5-turbo-0125",
    "score": 0.431,
    "normalized_score": 0.431,
    "is_self_reported": false,
    "self_reported_source_link": "https://example.com/benchmark-image",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.906977+00:00",
    "updated_at": "2025-07-19T19:56:11.906977+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 547,
    "benchmark_id": "mathvista",
    "model_id": "gpt-3.5-turbo-0125",
    "score": 0.0,
    "normalized_score": 0.0,
    "is_self_reported": false,
    "self_reported_source_link": "https://example.com/benchmark-image",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.127494+00:00",
    "updated_at": "2025-07-19T19:56:12.127494+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1299,
    "benchmark_id": "mgsm",
    "model_id": "gpt-3.5-turbo-0125",
    "score": 0.563,
    "normalized_score": 0.563,
    "is_self_reported": false,
    "self_reported_source_link": "https://example.com/benchmark-image",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.717321+00:00",
    "updated_at": "2025-07-19T19:56:13.717321+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 126,
    "benchmark_id": "mmlu",
    "model_id": "gpt-3.5-turbo-0125",
    "score": 0.698,
    "normalized_score": 0.698,
    "is_self_reported": false,
    "self_reported_source_link": "https://example.com/benchmark-image",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.331664+00:00",
    "updated_at": "2025-07-19T19:56:11.331664+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 597,
    "benchmark_id": "mmmu",
    "model_id": "gpt-3.5-turbo-0125",
    "score": 0.0,
    "normalized_score": 0.0,
    "is_self_reported": false,
    "self_reported_source_link": "https://example.com/benchmark-image",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.230222+00:00",
    "updated_at": "2025-07-19T19:56:12.230222+00:00",
    "benchmark_name": "MMMU"
  }
]

================================================
FILE: data/organizations/openai/models/gpt-3.5-turbo-0125/model.json
================================================
{
  "model_id": "gpt-3.5-turbo-0125",
  "name": "GPT-3.5 Turbo",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "The latest GPT-3.5 Turbo model with higher accuracy at responding in requested formats and a fix for a bug which caused a text encoding issue for non-English language function calls.",
  "release_date": "2023-03-21",
  "announcement_date": "2023-03-21",
  "license_id": "proprietary",
  "multimodal": false,
  "knowledge_cutoff": "2021-09-30",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/models/gpt-3-5-turbo",
  "source_playground": "https://platform.openai.com/playground",
  "source_paper": null,
  "source_scorecard_blog_link": "https://openai.com/blog/new-models-and-developer-products-announced-at-devday",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.858492+00:00",
  "updated_at": "2025-07-19T19:49:05.858492+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/gpt-4-0613/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1917,
    "benchmark_id": "ai2-reasoning-challenge-(arc)",
    "model_id": "gpt-4-0613",
    "score": 0.963,
    "normalized_score": 0.963,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/research/gpt-4",
    "verified_by_llmstats": false,
    "analysis_method": "25-shot, Grade-school multiple choice science questions (Challenge-set)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.421959+00:00",
    "updated_at": "2025-07-19T19:56:15.421959+00:00",
    "benchmark_name": "AI2 Reasoning Challenge (ARC)"
  },
  {
    "model_benchmark_id": 965,
    "benchmark_id": "drop",
    "model_id": "gpt-4-0613",
    "score": 0.809,
    "normalized_score": 0.809,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/research/gpt-4",
    "verified_by_llmstats": false,
    "analysis_method": "3-shot, Reading comprehension & arithmetic (f1 score)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.028099+00:00",
    "updated_at": "2025-07-19T19:56:13.028099+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 362,
    "benchmark_id": "gpqa",
    "model_id": "gpt-4-0613",
    "score": 0.357,
    "normalized_score": 0.357,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot, Commonsense reasoning",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.775863+00:00",
    "updated_at": "2025-07-19T19:56:11.775863+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 55,
    "benchmark_id": "hellaswag",
    "model_id": "gpt-4-0613",
    "score": 0.953,
    "normalized_score": 0.953,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/research/gpt-4",
    "verified_by_llmstats": false,
    "analysis_method": "10-shot, Commonsense reasoning around everyday events",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.199031+00:00",
    "updated_at": "2025-07-19T19:56:11.199031+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 817,
    "benchmark_id": "humaneval",
    "model_id": "gpt-4-0613",
    "score": 0.67,
    "normalized_score": 0.67,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/research/gpt-4",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot, Python coding tasks",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.702020+00:00",
    "updated_at": "2025-07-19T19:56:12.702020+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1915,
    "benchmark_id": "lsat",
    "model_id": "gpt-4-0613",
    "score": 0.88,
    "normalized_score": 0.88,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/research/gpt-4",
    "verified_by_llmstats": false,
    "analysis_method": "Percentile score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.413295+00:00",
    "updated_at": "2025-07-19T19:56:15.413295+00:00",
    "benchmark_name": "LSAT"
  },
  {
    "model_benchmark_id": 432,
    "benchmark_id": "math",
    "model_id": "gpt-4-0613",
    "score": 0.42,
    "normalized_score": 0.42,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/",
    "verified_by_llmstats": false,
    "analysis_method": "Mathematics problem-solving",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.913379+00:00",
    "updated_at": "2025-07-19T19:56:11.913379+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1302,
    "benchmark_id": "mgsm",
    "model_id": "gpt-4-0613",
    "score": 0.745,
    "normalized_score": 0.745,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/",
    "verified_by_llmstats": false,
    "analysis_method": "Mathematics problem-solving",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.721873+00:00",
    "updated_at": "2025-07-19T19:56:13.721873+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 129,
    "benchmark_id": "mmlu",
    "model_id": "gpt-4-0613",
    "score": 0.864,
    "normalized_score": 0.864,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/research/gpt-4",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot, Multiple-choice questions in 57 subjects (professional & academic)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.336601+00:00",
    "updated_at": "2025-07-19T19:56:11.336601+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1916,
    "benchmark_id": "sat-math",
    "model_id": "gpt-4-0613",
    "score": 0.89,
    "normalized_score": 0.89,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/research/gpt-4",
    "verified_by_llmstats": false,
    "analysis_method": "Estimated from reported score of 710 out of 800",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.417889+00:00",
    "updated_at": "2025-07-19T19:56:15.417889+00:00",
    "benchmark_name": "SAT Math"
  },
  {
    "model_benchmark_id": 1914,
    "benchmark_id": "uniform-bar-exam",
    "model_id": "gpt-4-0613",
    "score": 0.9,
    "normalized_score": 0.9,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/research/gpt-4",
    "verified_by_llmstats": false,
    "analysis_method": "Percentage score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.408427+00:00",
    "updated_at": "2025-07-19T19:56:15.408427+00:00",
    "benchmark_name": "Uniform Bar Exam"
  },
  {
    "model_benchmark_id": 156,
    "benchmark_id": "winogrande",
    "model_id": "gpt-4-0613",
    "score": 0.875,
    "normalized_score": 0.875,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/research/gpt-4",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot, Commonsense reasoning around pronoun resolution",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.396099+00:00",
    "updated_at": "2025-07-19T19:56:11.396099+00:00",
    "benchmark_name": "Winogrande"
  }
]

================================================
FILE: data/organizations/openai/models/gpt-4-0613/model.json
================================================
{
  "model_id": "gpt-4-0613",
  "name": "GPT-4",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "GPT-4 is a large multimodal model capable of processing both image and text inputs and generating human-like text outputs. It demonstrates human-level performance on various professional and academic benchmarks.",
  "release_date": "2023-06-13",
  "announcement_date": "2023-06-13",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2022-12-31",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/api-reference/chat",
  "source_playground": "https://platform.openai.com/playground",
  "source_paper": "https://arxiv.org/abs/2303.08774",
  "source_scorecard_blog_link": "https://openai.com/research/gpt-4",
  "source_repo_link": "https://github.com/openai/gpt-4",
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.869531+00:00",
  "updated_at": "2025-07-19T19:49:05.869531+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/gpt-4-turbo-2024-04-09/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 966,
    "benchmark_id": "drop",
    "model_id": "gpt-4-turbo-2024-04-09",
    "score": 0.86,
    "normalized_score": 0.86,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/",
    "verified_by_llmstats": false,
    "analysis_method": "Reading comprehension & arithmetic (f1 score)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.030041+00:00",
    "updated_at": "2025-07-19T19:56:13.030041+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 363,
    "benchmark_id": "gpqa",
    "model_id": "gpt-4-turbo-2024-04-09",
    "score": 0.48,
    "normalized_score": 0.48,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/",
    "verified_by_llmstats": false,
    "analysis_method": "General-Purpose Question Answering",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.777899+00:00",
    "updated_at": "2025-07-19T19:56:11.777899+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 818,
    "benchmark_id": "humaneval",
    "model_id": "gpt-4-turbo-2024-04-09",
    "score": 0.871,
    "normalized_score": 0.871,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/",
    "verified_by_llmstats": false,
    "analysis_method": "Python coding tasks",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.703615+00:00",
    "updated_at": "2025-07-19T19:56:12.703615+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 433,
    "benchmark_id": "math",
    "model_id": "gpt-4-turbo-2024-04-09",
    "score": 0.726,
    "normalized_score": 0.726,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/",
    "verified_by_llmstats": false,
    "analysis_method": "Mathematics problem-solving",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.916360+00:00",
    "updated_at": "2025-07-19T19:56:11.916360+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1303,
    "benchmark_id": "mgsm",
    "model_id": "gpt-4-turbo-2024-04-09",
    "score": 0.885,
    "normalized_score": 0.885,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/",
    "verified_by_llmstats": false,
    "analysis_method": "Grade School Math Word Problems",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.723556+00:00",
    "updated_at": "2025-07-19T19:56:13.723556+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 130,
    "benchmark_id": "mmlu",
    "model_id": "gpt-4-turbo-2024-04-09",
    "score": 0.865,
    "normalized_score": 0.865,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/",
    "verified_by_llmstats": false,
    "analysis_method": "Multiple-choice questions in 57 subjects (professional & academic)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.337995+00:00",
    "updated_at": "2025-07-19T19:56:11.337995+00:00",
    "benchmark_name": "MMLU"
  }
]

================================================
FILE: data/organizations/openai/models/gpt-4-turbo-2024-04-09/model.json
================================================
{
  "model_id": "gpt-4-turbo-2024-04-09",
  "name": "GPT-4 Turbo",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "The latest GPT-4 model with improved performance, updated knowledge, and enhanced capabilities. It offers faster response times and more affordable pricing compared to previous versions.",
  "release_date": "2024-04-09",
  "announcement_date": "2024-04-09",
  "license_id": "proprietary",
  "multimodal": false,
  "knowledge_cutoff": "2023-12-31",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4",
  "source_playground": "https://platform.openai.com/playground",
  "source_paper": null,
  "source_scorecard_blog_link": "https://openai.com/index/new-models-and-developer-products-announced-at-devday/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.872559+00:00",
  "updated_at": "2025-07-19T19:49:05.872559+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/gpt-4.1-2025-04-14/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 671,
    "benchmark_id": "aider-polyglot",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.516,
    "normalized_score": 0.516,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.389292+00:00",
    "updated_at": "2025-07-19T19:56:12.389292+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 1335,
    "benchmark_id": "aider-polyglot-edit",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.529,
    "normalized_score": 0.529,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.808732+00:00",
    "updated_at": "2025-07-19T19:56:13.808732+00:00",
    "benchmark_name": "Aider-Polyglot Edit"
  },
  {
    "model_benchmark_id": 486,
    "benchmark_id": "aime-2024",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.481,
    "normalized_score": 0.481,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.019979+00:00",
    "updated_at": "2025-07-19T19:56:12.019979+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 1889,
    "benchmark_id": "charxiv-d",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.879,
    "normalized_score": 0.879,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.330689+00:00",
    "updated_at": "2025-07-19T19:56:15.330689+00:00",
    "benchmark_name": "CharXiv-D"
  },
  {
    "model_benchmark_id": 1837,
    "benchmark_id": "charxiv-r",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.567,
    "normalized_score": 0.567,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.201588+00:00",
    "updated_at": "2025-07-19T19:56:15.201588+00:00",
    "benchmark_name": "CharXiv-R"
  },
  {
    "model_benchmark_id": 1860,
    "benchmark_id": "collie",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.658,
    "normalized_score": 0.658,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.261360+00:00",
    "updated_at": "2025-07-19T19:56:15.261360+00:00",
    "benchmark_name": "COLLIE"
  },
  {
    "model_benchmark_id": 1895,
    "benchmark_id": "complexfuncbench",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.655,
    "normalized_score": 0.655,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.348011+00:00",
    "updated_at": "2025-07-19T19:56:15.348011+00:00",
    "benchmark_name": "ComplexFuncBench"
  },
  {
    "model_benchmark_id": 353,
    "benchmark_id": "gpqa",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.663,
    "normalized_score": 0.663,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.761405+00:00",
    "updated_at": "2025-07-19T19:56:11.761405+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1874,
    "benchmark_id": "graphwalks-bfs-<128k",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.617,
    "normalized_score": 0.617,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.294683+00:00",
    "updated_at": "2025-07-19T19:56:15.294683+00:00",
    "benchmark_name": "Graphwalks BFS <128k"
  },
  {
    "model_benchmark_id": 1877,
    "benchmark_id": "graphwalks-bfs->128k",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.19,
    "normalized_score": 0.19,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.302353+00:00",
    "updated_at": "2025-07-19T19:56:15.302353+00:00",
    "benchmark_name": "Graphwalks BFS >128k"
  },
  {
    "model_benchmark_id": 1881,
    "benchmark_id": "graphwalks-parents-<128k",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.58,
    "normalized_score": 0.58,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.312231+00:00",
    "updated_at": "2025-07-19T19:56:15.312231+00:00",
    "benchmark_name": "Graphwalks parents <128k"
  },
  {
    "model_benchmark_id": 1886,
    "benchmark_id": "graphwalks-parents->128k",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.25,
    "normalized_score": 0.25,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.324002+00:00",
    "updated_at": "2025-07-19T19:56:15.324002+00:00",
    "benchmark_name": "Graphwalks parents >128k"
  },
  {
    "model_benchmark_id": 635,
    "benchmark_id": "ifeval",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.874,
    "normalized_score": 0.874,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.304284+00:00",
    "updated_at": "2025-07-19T19:56:12.304284+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1848,
    "benchmark_id": "internal-api-instruction-following-(hard)",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.491,
    "normalized_score": 0.491,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.230360+00:00",
    "updated_at": "2025-07-19T19:56:15.230360+00:00",
    "benchmark_name": "Internal API instruction following (hard)"
  },
  {
    "model_benchmark_id": 543,
    "benchmark_id": "mathvista",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.722,
    "normalized_score": 0.722,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.121168+00:00",
    "updated_at": "2025-07-19T19:56:12.121168+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 121,
    "benchmark_id": "mmlu",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.902,
    "normalized_score": 0.902,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.323612+00:00",
    "updated_at": "2025-07-19T19:56:11.323612+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1483,
    "benchmark_id": "mmmlu",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.873,
    "normalized_score": 0.873,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.161058+00:00",
    "updated_at": "2025-07-19T19:56:14.161058+00:00",
    "benchmark_name": "MMMLU"
  },
  {
    "model_benchmark_id": 593,
    "benchmark_id": "mmmu",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.748,
    "normalized_score": 0.748,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.222754+00:00",
    "updated_at": "2025-07-19T19:56:12.222754+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 743,
    "benchmark_id": "multichallenge",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.383,
    "normalized_score": 0.383,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark (GPT-4o grader)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.561934+00:00",
    "updated_at": "2025-07-19T19:56:12.561934+00:00",
    "benchmark_name": "MultiChallenge"
  },
  {
    "model_benchmark_id": 1854,
    "benchmark_id": "multichallenge-(o3-mini-grader)",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.462,
    "normalized_score": 0.462,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark (o3-mini grader, see footnote [3])",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.244951+00:00",
    "updated_at": "2025-07-19T19:56:15.244951+00:00",
    "benchmark_name": "MultiChallenge (o3-mini grader)"
  },
  {
    "model_benchmark_id": 1653,
    "benchmark_id": "multi-if",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.708,
    "normalized_score": 0.708,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.648170+00:00",
    "updated_at": "2025-07-19T19:56:14.648170+00:00",
    "benchmark_name": "Multi-IF"
  },
  {
    "model_benchmark_id": 1866,
    "benchmark_id": "openai-mrcr:-2-needle-128k",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.572,
    "normalized_score": 0.572,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.275855+00:00",
    "updated_at": "2025-07-19T19:56:15.275855+00:00",
    "benchmark_name": "OpenAI-MRCR: 2 needle 128k"
  },
  {
    "model_benchmark_id": 1871,
    "benchmark_id": "openai-mrcr:-2-needle-1m",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.463,
    "normalized_score": 0.463,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.286394+00:00",
    "updated_at": "2025-07-19T19:56:15.286394+00:00",
    "benchmark_name": "OpenAI-MRCR: 2 needle 1M"
  },
  {
    "model_benchmark_id": 1358,
    "benchmark_id": "swe-bench-verified",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.546,
    "normalized_score": 0.546,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal methodology, see source footnote [2]",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.858938+00:00",
    "updated_at": "2025-07-19T19:56:13.858938+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1780,
    "benchmark_id": "tau-bench-airline",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.494,
    "normalized_score": 0.494,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg 5 runs, no custom tools/prompting (footnote [4])",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.015514+00:00",
    "updated_at": "2025-07-19T19:56:15.015514+00:00",
    "benchmark_name": "TAU-bench Airline"
  },
  {
    "model_benchmark_id": 1766,
    "benchmark_id": "tau-bench-retail",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.68,
    "normalized_score": 0.68,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg 5 runs, no custom tools/prompting (footnote [4], GPT-4o user model)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.986496+00:00",
    "updated_at": "2025-07-19T19:56:14.986496+00:00",
    "benchmark_name": "TAU-bench Retail"
  },
  {
    "model_benchmark_id": 1907,
    "benchmark_id": "video-mme-(long,-no-subtitles)",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.72,
    "normalized_score": 0.72,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.377204+00:00",
    "updated_at": "2025-07-19T19:56:15.377204+00:00",
    "benchmark_name": "Video-MME (long, no subtitles)"
  },
  {
    "model_benchmark_id": 10011,
    "benchmark_id": "aime-2025",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.464,
    "normalized_score": 0.464,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4.1 with no tools - Competition mathematics (AIME 2025).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 10012,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.054,
    "normalized_score": 0.054,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4.1 with no tools - Expert-level questions across subjects.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 10013,
    "benchmark_id": "hmmt-2025",
    "model_id": "gpt-4.1-2025-04-14",
    "score": 0.289,
    "normalized_score": 0.289,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4.1 with no tools - Harvard-MIT Mathematics Tournament.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "HMMT 2025"
  }
]


================================================
FILE: data/organizations/openai/models/gpt-4.1-2025-04-14/model.json
================================================
{
  "model_id": "gpt-4.1-2025-04-14",
  "name": "GPT-4.1",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "GPT-4.1 is OpenAI's latest and most advanced flagship model, significantly improving upon GPT-4 Turbo in performance across benchmarks, speed, and cost-effectiveness.",
  "release_date": "2025-04-14",
  "announcement_date": "2025-04-14",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-06-01",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/models/gpt-4.1",
  "source_playground": "https://platform.openai.com/playground?mode=chat&model=gpt-4.1",
  "source_paper": null,
  "source_scorecard_blog_link": "https://openai.com/index/gpt-4-1/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.841143+00:00",
  "updated_at": "2025-07-19T19:49:05.841143+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/gpt-4.1-mini-2025-04-14/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 667,
    "benchmark_id": "aider-polyglot",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.347,
    "normalized_score": 0.347,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.382631+00:00",
    "updated_at": "2025-07-19T19:56:12.382631+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 1331,
    "benchmark_id": "aider-polyglot-edit",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.316,
    "normalized_score": 0.316,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.801113+00:00",
    "updated_at": "2025-07-19T19:56:13.801113+00:00",
    "benchmark_name": "Aider-Polyglot Edit"
  },
  {
    "model_benchmark_id": 482,
    "benchmark_id": "aime-2024",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.496,
    "normalized_score": 0.496,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.013761+00:00",
    "updated_at": "2025-07-19T19:56:12.013761+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 1887,
    "benchmark_id": "charxiv-d",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.884,
    "normalized_score": 0.884,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.327509+00:00",
    "updated_at": "2025-07-19T19:56:15.327509+00:00",
    "benchmark_name": "CharXiv-D"
  },
  {
    "model_benchmark_id": 1834,
    "benchmark_id": "charxiv-r",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.568,
    "normalized_score": 0.568,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.195563+00:00",
    "updated_at": "2025-07-19T19:56:15.195563+00:00",
    "benchmark_name": "CharXiv-R"
  },
  {
    "model_benchmark_id": 1857,
    "benchmark_id": "collie",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.546,
    "normalized_score": 0.546,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.255006+00:00",
    "updated_at": "2025-07-19T19:56:15.255006+00:00",
    "benchmark_name": "COLLIE"
  },
  {
    "model_benchmark_id": 1892,
    "benchmark_id": "complexfuncbench",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.493,
    "normalized_score": 0.493,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.339307+00:00",
    "updated_at": "2025-07-19T19:56:15.339307+00:00",
    "benchmark_name": "ComplexFuncBench"
  },
  {
    "model_benchmark_id": 348,
    "benchmark_id": "gpqa",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.65,
    "normalized_score": 0.65,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.752534+00:00",
    "updated_at": "2025-07-19T19:56:11.752534+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1872,
    "benchmark_id": "graphwalks-bfs-<128k",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.617,
    "normalized_score": 0.617,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.289789+00:00",
    "updated_at": "2025-07-19T19:56:15.289789+00:00",
    "benchmark_name": "Graphwalks BFS <128k"
  },
  {
    "model_benchmark_id": 1875,
    "benchmark_id": "graphwalks-bfs->128k",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.15,
    "normalized_score": 0.15,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.298708+00:00",
    "updated_at": "2025-07-19T19:56:15.298708+00:00",
    "benchmark_name": "Graphwalks BFS >128k"
  },
  {
    "model_benchmark_id": 1878,
    "benchmark_id": "graphwalks-parents-<128k",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.605,
    "normalized_score": 0.605,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.306151+00:00",
    "updated_at": "2025-07-19T19:56:15.306151+00:00",
    "benchmark_name": "Graphwalks parents <128k"
  },
  {
    "model_benchmark_id": 1884,
    "benchmark_id": "graphwalks-parents->128k",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.11,
    "normalized_score": 0.11,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.319823+00:00",
    "updated_at": "2025-07-19T19:56:15.319823+00:00",
    "benchmark_name": "Graphwalks parents >128k"
  },
  {
    "model_benchmark_id": 632,
    "benchmark_id": "ifeval",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.841,
    "normalized_score": 0.841,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.299050+00:00",
    "updated_at": "2025-07-19T19:56:12.299050+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1845,
    "benchmark_id": "internal-api-instruction-following-(hard)",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.451,
    "normalized_score": 0.451,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.225405+00:00",
    "updated_at": "2025-07-19T19:56:15.225405+00:00",
    "benchmark_name": "Internal API instruction following (hard)"
  },
  {
    "model_benchmark_id": 539,
    "benchmark_id": "mathvista",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.731,
    "normalized_score": 0.731,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.114367+00:00",
    "updated_at": "2025-07-19T19:56:12.114367+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 117,
    "benchmark_id": "mmlu",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.875,
    "normalized_score": 0.875,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.317652+00:00",
    "updated_at": "2025-07-19T19:56:11.317652+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1481,
    "benchmark_id": "mmmlu",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.785,
    "normalized_score": 0.785,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.157799+00:00",
    "updated_at": "2025-07-19T19:56:14.157799+00:00",
    "benchmark_name": "MMMLU"
  },
  {
    "model_benchmark_id": 590,
    "benchmark_id": "mmmu",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.727,
    "normalized_score": 0.727,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.217019+00:00",
    "updated_at": "2025-07-19T19:56:12.217019+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 740,
    "benchmark_id": "multichallenge",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.358,
    "normalized_score": 0.358,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark (GPT-4o grader)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.555824+00:00",
    "updated_at": "2025-07-19T19:56:12.555824+00:00",
    "benchmark_name": "MultiChallenge"
  },
  {
    "model_benchmark_id": 1851,
    "benchmark_id": "multichallenge-(o3-mini-grader)",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.422,
    "normalized_score": 0.422,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark (o3-mini grader, see footnote [3])",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.239021+00:00",
    "updated_at": "2025-07-19T19:56:15.239021+00:00",
    "benchmark_name": "MultiChallenge (o3-mini grader)"
  },
  {
    "model_benchmark_id": 1650,
    "benchmark_id": "multi-if",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.67,
    "normalized_score": 0.67,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.643303+00:00",
    "updated_at": "2025-07-19T19:56:14.643303+00:00",
    "benchmark_name": "Multi-IF"
  },
  {
    "model_benchmark_id": 1863,
    "benchmark_id": "openai-mrcr:-2-needle-128k",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.472,
    "normalized_score": 0.472,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.270008+00:00",
    "updated_at": "2025-07-19T19:56:15.270008+00:00",
    "benchmark_name": "OpenAI-MRCR: 2 needle 128k"
  },
  {
    "model_benchmark_id": 1869,
    "benchmark_id": "openai-mrcr:-2-needle-1m",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.333,
    "normalized_score": 0.333,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.282718+00:00",
    "updated_at": "2025-07-19T19:56:15.282718+00:00",
    "benchmark_name": "OpenAI-MRCR: 2 needle 1M"
  },
  {
    "model_benchmark_id": 1355,
    "benchmark_id": "swe-bench-verified",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.236,
    "normalized_score": 0.236,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal methodology, see source footnote [2]",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.852737+00:00",
    "updated_at": "2025-07-19T19:56:13.852737+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1776,
    "benchmark_id": "tau-bench-airline",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.36,
    "normalized_score": 0.36,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg 5 runs, no custom tools/prompting (footnote [4])",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.007636+00:00",
    "updated_at": "2025-07-19T19:56:15.007636+00:00",
    "benchmark_name": "TAU-bench Airline"
  },
  {
    "model_benchmark_id": 1762,
    "benchmark_id": "tau-bench-retail",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.558,
    "normalized_score": 0.558,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg 5 runs, no custom tools/prompting (footnote [4], GPT-4o user model)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.978528+00:00",
    "updated_at": "2025-07-19T19:56:14.978528+00:00",
    "benchmark_name": "TAU-bench Retail"
  },
  {
    "model_benchmark_id": 10014,
    "benchmark_id": "aime-2025",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.402,
    "normalized_score": 0.402,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4.1 mini with no tools - Competition mathematics (AIME 2025).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 10015,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.037,
    "normalized_score": 0.037,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4.1 mini with no tools - Expert-level questions across subjects.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 10016,
    "benchmark_id": "hmmt-2025",
    "model_id": "gpt-4.1-mini-2025-04-14",
    "score": 0.35,
    "normalized_score": 0.35,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4.1 mini with no tools - Harvard-MIT Mathematics Tournament.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "HMMT 2025"
  }
]


================================================
FILE: data/organizations/openai/models/gpt-4.1-mini-2025-04-14/model.json
================================================
{
  "model_id": "gpt-4.1-mini-2025-04-14",
  "name": "GPT-4.1 mini",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "GPT-4.1 mini provides a balance between intelligence, speed, and cost. It's a significant leap in small model performance, even beating GPT-4o in many benchmarks while reducing latency and cost.",
  "release_date": "2025-04-14",
  "announcement_date": "2025-04-14",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-05-31",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/models/gpt-4.1-mini",
  "source_playground": "https://platform.openai.com/playground?mode=chat&model=gpt-4.1-mini",
  "source_paper": null,
  "source_scorecard_blog_link": "https://openai.com/index/gpt-4-1/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.821382+00:00",
  "updated_at": "2025-07-19T19:49:05.821382+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/gpt-4.1-nano-2025-04-14/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 669,
    "benchmark_id": "aider-polyglot",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.098,
    "normalized_score": 0.098,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.385924+00:00",
    "updated_at": "2025-07-19T19:56:12.385924+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 1333,
    "benchmark_id": "aider-polyglot-edit",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.062,
    "normalized_score": 0.062,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.804864+00:00",
    "updated_at": "2025-07-19T19:56:13.804864+00:00",
    "benchmark_name": "Aider-Polyglot Edit"
  },
  {
    "model_benchmark_id": 484,
    "benchmark_id": "aime-2024",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.294,
    "normalized_score": 0.294,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.016856+00:00",
    "updated_at": "2025-07-19T19:56:12.016856+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 1888,
    "benchmark_id": "charxiv-d",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.739,
    "normalized_score": 0.739,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.329021+00:00",
    "updated_at": "2025-07-19T19:56:15.329021+00:00",
    "benchmark_name": "CharXiv-D"
  },
  {
    "model_benchmark_id": 1836,
    "benchmark_id": "charxiv-r",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.405,
    "normalized_score": 0.405,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.199274+00:00",
    "updated_at": "2025-07-19T19:56:15.199274+00:00",
    "benchmark_name": "CharXiv-R"
  },
  {
    "model_benchmark_id": 1858,
    "benchmark_id": "collie",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.425,
    "normalized_score": 0.425,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.257208+00:00",
    "updated_at": "2025-07-19T19:56:15.257208+00:00",
    "benchmark_name": "COLLIE"
  },
  {
    "model_benchmark_id": 1893,
    "benchmark_id": "complexfuncbench",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.057,
    "normalized_score": 0.057,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.341699+00:00",
    "updated_at": "2025-07-19T19:56:15.341699+00:00",
    "benchmark_name": "ComplexFuncBench"
  },
  {
    "model_benchmark_id": 350,
    "benchmark_id": "gpqa",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.503,
    "normalized_score": 0.503,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.756178+00:00",
    "updated_at": "2025-07-19T19:56:11.756178+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1873,
    "benchmark_id": "graphwalks-bfs-<128k",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.25,
    "normalized_score": 0.25,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.291775+00:00",
    "updated_at": "2025-07-19T19:56:15.291775+00:00",
    "benchmark_name": "Graphwalks BFS <128k"
  },
  {
    "model_benchmark_id": 1876,
    "benchmark_id": "graphwalks-bfs->128k",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.029,
    "normalized_score": 0.029,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.300453+00:00",
    "updated_at": "2025-07-19T19:56:15.300453+00:00",
    "benchmark_name": "Graphwalks BFS >128k"
  },
  {
    "model_benchmark_id": 1879,
    "benchmark_id": "graphwalks-parents-<128k",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.094,
    "normalized_score": 0.094,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.308330+00:00",
    "updated_at": "2025-07-19T19:56:15.308330+00:00",
    "benchmark_name": "Graphwalks parents <128k"
  },
  {
    "model_benchmark_id": 1885,
    "benchmark_id": "graphwalks-parents->128k",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.056,
    "normalized_score": 0.056,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.322097+00:00",
    "updated_at": "2025-07-19T19:56:15.322097+00:00",
    "benchmark_name": "Graphwalks parents >128k"
  },
  {
    "model_benchmark_id": 633,
    "benchmark_id": "ifeval",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.745,
    "normalized_score": 0.745,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.300562+00:00",
    "updated_at": "2025-07-19T19:56:12.300562+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1846,
    "benchmark_id": "internal-api-instruction-following-(hard)",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.316,
    "normalized_score": 0.316,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.227248+00:00",
    "updated_at": "2025-07-19T19:56:15.227248+00:00",
    "benchmark_name": "Internal API instruction following (hard)"
  },
  {
    "model_benchmark_id": 541,
    "benchmark_id": "mathvista",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.562,
    "normalized_score": 0.562,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.117553+00:00",
    "updated_at": "2025-07-19T19:56:12.117553+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 118,
    "benchmark_id": "mmlu",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.801,
    "normalized_score": 0.801,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.319012+00:00",
    "updated_at": "2025-07-19T19:56:11.319012+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1482,
    "benchmark_id": "mmmlu",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.669,
    "normalized_score": 0.669,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.159419+00:00",
    "updated_at": "2025-07-19T19:56:14.159419+00:00",
    "benchmark_name": "MMMLU"
  },
  {
    "model_benchmark_id": 592,
    "benchmark_id": "mmmu",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.554,
    "normalized_score": 0.554,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.220951+00:00",
    "updated_at": "2025-07-19T19:56:12.220951+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 741,
    "benchmark_id": "multichallenge",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.15,
    "normalized_score": 0.15,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark (GPT-4o grader)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.557571+00:00",
    "updated_at": "2025-07-19T19:56:12.557571+00:00",
    "benchmark_name": "MultiChallenge"
  },
  {
    "model_benchmark_id": 1852,
    "benchmark_id": "multichallenge-(o3-mini-grader)",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.311,
    "normalized_score": 0.311,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark (o3-mini grader, see footnote [3])",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.241054+00:00",
    "updated_at": "2025-07-19T19:56:15.241054+00:00",
    "benchmark_name": "MultiChallenge (o3-mini grader)"
  },
  {
    "model_benchmark_id": 1651,
    "benchmark_id": "multi-if",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.572,
    "normalized_score": 0.572,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.645047+00:00",
    "updated_at": "2025-07-19T19:56:14.645047+00:00",
    "benchmark_name": "Multi-IF"
  },
  {
    "model_benchmark_id": 1864,
    "benchmark_id": "openai-mrcr:-2-needle-128k",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.366,
    "normalized_score": 0.366,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.272341+00:00",
    "updated_at": "2025-07-19T19:56:15.272341+00:00",
    "benchmark_name": "OpenAI-MRCR: 2 needle 128k"
  },
  {
    "model_benchmark_id": 1870,
    "benchmark_id": "openai-mrcr:-2-needle-1m",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.12,
    "normalized_score": 0.12,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Internal benchmark",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.284545+00:00",
    "updated_at": "2025-07-19T19:56:15.284545+00:00",
    "benchmark_name": "OpenAI-MRCR: 2 needle 1M"
  },
  {
    "model_benchmark_id": 1778,
    "benchmark_id": "tau-bench-airline",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.14,
    "normalized_score": 0.14,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg 5 runs, no custom tools/prompting (footnote [4])",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.011934+00:00",
    "updated_at": "2025-07-19T19:56:15.011934+00:00",
    "benchmark_name": "TAU-bench Airline"
  },
  {
    "model_benchmark_id": 1764,
    "benchmark_id": "tau-bench-retail",
    "model_id": "gpt-4.1-nano-2025-04-14",
    "score": 0.226,
    "normalized_score": 0.226,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Avg 5 runs, no custom tools/prompting (footnote [4], GPT-4o user model)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.982239+00:00",
    "updated_at": "2025-07-19T19:56:14.982239+00:00",
    "benchmark_name": "TAU-bench Retail"
  }
]

================================================
FILE: data/organizations/openai/models/gpt-4.1-nano-2025-04-14/model.json
================================================
{
  "model_id": "gpt-4.1-nano-2025-04-14",
  "name": "GPT-4.1 nano",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "GPT-4.1 nano is OpenAI's fastest and cheapest model available in the GPT-4.1 family. It delivers exceptional performance at a small size with its 1 million token context window. Ideal for tasks like classification or autocompletion.",
  "release_date": "2025-04-14",
  "announcement_date": "2025-04-14",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-05-31",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/models/gpt-4.1-nano",
  "source_playground": "https://platform.openai.com/playground?mode=chat&model=gpt-4.1-nano",
  "source_paper": null,
  "source_scorecard_blog_link": "https://openai.com/index/gpt-4-1/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.827978+00:00",
  "updated_at": "2025-07-19T19:49:05.827978+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/gpt-4.5/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1337,
    "benchmark_id": "aider-polyglot-edit",
    "model_id": "gpt-4.5",
    "score": 0.449,
    "normalized_score": 0.449,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.811839+00:00",
    "updated_at": "2025-07-19T19:56:13.811839+00:00",
    "benchmark_name": "Aider-Polyglot Edit"
  },
  {
    "model_benchmark_id": 489,
    "benchmark_id": "aime-2024",
    "model_id": "gpt-4.5",
    "score": 0.367,
    "normalized_score": 0.367,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.024273+00:00",
    "updated_at": "2025-07-19T19:56:12.024273+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 1891,
    "benchmark_id": "charxiv-d",
    "model_id": "gpt-4.5",
    "score": 0.9,
    "normalized_score": 0.9,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.335527+00:00",
    "updated_at": "2025-07-19T19:56:15.335527+00:00",
    "benchmark_name": "CharXiv-D"
  },
  {
    "model_benchmark_id": 1839,
    "benchmark_id": "charxiv-r",
    "model_id": "gpt-4.5",
    "score": 0.554,
    "normalized_score": 0.554,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.204875+00:00",
    "updated_at": "2025-07-19T19:56:15.204875+00:00",
    "benchmark_name": "CharXiv-R"
  },
  {
    "model_benchmark_id": 1862,
    "benchmark_id": "collie",
    "model_id": "gpt-4.5",
    "score": 0.723,
    "normalized_score": 0.723,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.265565+00:00",
    "updated_at": "2025-07-19T19:56:15.265565+00:00",
    "benchmark_name": "COLLIE"
  },
  {
    "model_benchmark_id": 1897,
    "benchmark_id": "complexfuncbench",
    "model_id": "gpt-4.5",
    "score": 0.63,
    "normalized_score": 0.63,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.351430+00:00",
    "updated_at": "2025-07-19T19:56:15.351430+00:00",
    "benchmark_name": "ComplexFuncBench"
  },
  {
    "model_benchmark_id": 357,
    "benchmark_id": "gpqa",
    "model_id": "gpt-4.5",
    "score": 0.695,
    "normalized_score": 0.695,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy (Diamond)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.767414+00:00",
    "updated_at": "2025-07-19T19:56:11.767414+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1906,
    "benchmark_id": "graphwalks-bfs-<128k",
    "model_id": "gpt-4.5",
    "score": 0.723,
    "normalized_score": 0.723,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.372855+00:00",
    "updated_at": "2025-07-19T19:56:15.372855+00:00",
    "benchmark_name": "Graphwalks BFS <128k"
  },
  {
    "model_benchmark_id": 1883,
    "benchmark_id": "graphwalks-parents-<128k",
    "model_id": "gpt-4.5",
    "score": 0.726,
    "normalized_score": 0.726,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.315697+00:00",
    "updated_at": "2025-07-19T19:56:15.315697+00:00",
    "benchmark_name": "Graphwalks parents <128k"
  },
  {
    "model_benchmark_id": 1015,
    "benchmark_id": "gsm8k",
    "model_id": "gpt-4.5",
    "score": 0.97,
    "normalized_score": 0.97,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-4-5/",
    "verified_by_llmstats": false,
    "analysis_method": "Answer accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.114869+00:00",
    "updated_at": "2025-07-19T19:56:13.114869+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 813,
    "benchmark_id": "humaneval",
    "model_id": "gpt-4.5",
    "score": 0.88,
    "normalized_score": 0.88,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-4-5/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.694244+00:00",
    "updated_at": "2025-07-19T19:56:12.694244+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 637,
    "benchmark_id": "ifeval",
    "model_id": "gpt-4.5",
    "score": 0.882,
    "normalized_score": 0.882,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.307682+00:00",
    "updated_at": "2025-07-19T19:56:12.307682+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1850,
    "benchmark_id": "internal-api-instruction-following-(hard)",
    "model_id": "gpt-4.5",
    "score": 0.54,
    "normalized_score": 0.54,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.234022+00:00",
    "updated_at": "2025-07-19T19:56:15.234022+00:00",
    "benchmark_name": "Internal API instruction following (hard)"
  },
  {
    "model_benchmark_id": 545,
    "benchmark_id": "mathvista",
    "model_id": "gpt-4.5",
    "score": 0.723,
    "normalized_score": 0.723,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.124115+00:00",
    "updated_at": "2025-07-19T19:56:12.124115+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 124,
    "benchmark_id": "mmlu",
    "model_id": "gpt-4.5",
    "score": 0.908,
    "normalized_score": 0.908,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Multiple-choice accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.328688+00:00",
    "updated_at": "2025-07-19T19:56:11.328688+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1485,
    "benchmark_id": "mmmlu",
    "model_id": "gpt-4.5",
    "score": 0.851,
    "normalized_score": 0.851,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.164320+00:00",
    "updated_at": "2025-07-19T19:56:14.164320+00:00",
    "benchmark_name": "MMMLU"
  },
  {
    "model_benchmark_id": 595,
    "benchmark_id": "mmmu",
    "model_id": "gpt-4.5",
    "score": 0.752,
    "normalized_score": 0.752,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.226731+00:00",
    "updated_at": "2025-07-19T19:56:12.226731+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 744,
    "benchmark_id": "multichallenge",
    "model_id": "gpt-4.5",
    "score": 0.438,
    "normalized_score": 0.438,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.563438+00:00",
    "updated_at": "2025-07-19T19:56:12.563438+00:00",
    "benchmark_name": "MultiChallenge"
  },
  {
    "model_benchmark_id": 1856,
    "benchmark_id": "multichallenge-(o3-mini-grader)",
    "model_id": "gpt-4.5",
    "score": 0.501,
    "normalized_score": 0.501,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.249385+00:00",
    "updated_at": "2025-07-19T19:56:15.249385+00:00",
    "benchmark_name": "MultiChallenge (o3-mini grader)"
  },
  {
    "model_benchmark_id": 1655,
    "benchmark_id": "multi-if",
    "model_id": "gpt-4.5",
    "score": 0.708,
    "normalized_score": 0.708,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.652033+00:00",
    "updated_at": "2025-07-19T19:56:14.652033+00:00",
    "benchmark_name": "Multi-IF"
  },
  {
    "model_benchmark_id": 1868,
    "benchmark_id": "openai-mrcr:-2-needle-128k",
    "model_id": "gpt-4.5",
    "score": 0.385,
    "normalized_score": 0.385,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.279311+00:00",
    "updated_at": "2025-07-19T19:56:15.279311+00:00",
    "benchmark_name": "OpenAI-MRCR: 2 needle 128k"
  },
  {
    "model_benchmark_id": 240,
    "benchmark_id": "simpleqa",
    "model_id": "gpt-4.5",
    "score": 0.625,
    "normalized_score": 0.625,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-4-5/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.559622+00:00",
    "updated_at": "2025-07-19T19:56:11.559622+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 1360,
    "benchmark_id": "swe-bench-verified",
    "model_id": "gpt-4.5",
    "score": 0.38,
    "normalized_score": 0.38,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Success rate",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.863719+00:00",
    "updated_at": "2025-07-19T19:56:13.863719+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1900,
    "benchmark_id": "swe-lancer",
    "model_id": "gpt-4.5",
    "score": 0.373,
    "normalized_score": 0.373,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Success rate ($186K equivalent)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.358579+00:00",
    "updated_at": "2025-07-19T19:56:15.358579+00:00",
    "benchmark_name": "SWE-Lancer"
  },
  {
    "model_benchmark_id": 1903,
    "benchmark_id": "swe-lancer-(ic-diamond-subset)",
    "model_id": "gpt-4.5",
    "score": 0.174,
    "normalized_score": 0.174,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Success rate ($41K equivalent)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.365353+00:00",
    "updated_at": "2025-07-19T19:56:15.365353+00:00",
    "benchmark_name": "SWE-Lancer (IC-Diamond subset)"
  },
  {
    "model_benchmark_id": 1782,
    "benchmark_id": "tau-bench-airline",
    "model_id": "gpt-4.5",
    "score": 0.5,
    "normalized_score": 0.5,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.020093+00:00",
    "updated_at": "2025-07-19T19:56:15.020093+00:00",
    "benchmark_name": "TAU-bench Airline"
  },
  {
    "model_benchmark_id": 1768,
    "benchmark_id": "tau-bench-retail",
    "model_id": "gpt-4.5",
    "score": 0.684,
    "normalized_score": 0.684,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.989887+00:00",
    "updated_at": "2025-07-19T19:56:14.989887+00:00",
    "benchmark_name": "TAU-bench Retail"
  }
]

================================================
FILE: data/organizations/openai/models/gpt-4.5/model.json
================================================
{
  "model_id": "gpt-4.5",
  "name": "GPT-4.5",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "GPT-4.5 is OpenAI's most advanced model, offering improved reasoning, coding, and creative capabilities with faster performance and longer context handling than GPT-4. It features enhanced instruction following, reduced hallucinations, and better factual accuracy.",
  "release_date": "2025-02-27",
  "announcement_date": "2025-02-27",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/models/gpt-4-5#gpt-4-5",
  "source_playground": "https://platform.openai.com/playground",
  "source_paper": null,
  "source_scorecard_blog_link": "https://openai.com/index/introducing-gpt-4-5/",
  "source_repo_link": "https://github.com/openai",
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.852855+00:00",
  "updated_at": "2025-07-19T19:49:05.852855+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/gpt-4o-2024-05-13/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 962,
    "benchmark_id": "drop",
    "model_id": "gpt-4o-2024-05-13",
    "score": 0.834,
    "normalized_score": 0.834,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/blog/gpt-4o",
    "verified_by_llmstats": false,
    "analysis_method": "F1 Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.023727+00:00",
    "updated_at": "2025-07-19T19:56:13.023727+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 352,
    "benchmark_id": "gpqa",
    "model_id": "gpt-4o-2024-05-13",
    "score": 0.536,
    "normalized_score": 0.536,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/blog/gpt-4o",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.759539+00:00",
    "updated_at": "2025-07-19T19:56:11.759539+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 811,
    "benchmark_id": "humaneval",
    "model_id": "gpt-4o-2024-05-13",
    "score": 0.902,
    "normalized_score": 0.902,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/blog/gpt-4o",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.689969+00:00",
    "updated_at": "2025-07-19T19:56:12.689969+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 427,
    "benchmark_id": "math",
    "model_id": "gpt-4o-2024-05-13",
    "score": 0.766,
    "normalized_score": 0.766,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/blog/gpt-4o",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.903446+00:00",
    "updated_at": "2025-07-19T19:56:11.903446+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 542,
    "benchmark_id": "mathvista",
    "model_id": "gpt-4o-2024-05-13",
    "score": 0.638,
    "normalized_score": 0.638,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/blog/gpt-4o",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.119289+00:00",
    "updated_at": "2025-07-19T19:56:12.119289+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1297,
    "benchmark_id": "mgsm",
    "model_id": "gpt-4o-2024-05-13",
    "score": 0.905,
    "normalized_score": 0.905,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/blog/gpt-4o",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.714155+00:00",
    "updated_at": "2025-07-19T19:56:13.714155+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 120,
    "benchmark_id": "mmlu",
    "model_id": "gpt-4o-2024-05-13",
    "score": 0.887,
    "normalized_score": 0.887,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/blog/gpt-4o",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.322163+00:00",
    "updated_at": "2025-07-19T19:56:11.322163+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 219,
    "benchmark_id": "mmlu-pro",
    "model_id": "gpt-4o-2024-05-13",
    "score": 0.726,
    "normalized_score": 0.726,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/blog/gpt-4o",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.515262+00:00",
    "updated_at": "2025-07-19T19:56:11.515262+00:00",
    "benchmark_name": "MMLU-Pro"
  }
]

================================================
FILE: data/organizations/openai/models/gpt-4o-2024-05-13/model.json
================================================
{
  "model_id": "gpt-4o-2024-05-13",
  "name": "GPT-4o",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "GPT-4o ('o' for 'omni') is a multimodal AI model that accepts text, audio, image, and video inputs, and generates text, audio, and image outputs. It matches GPT-4 Turbo performance on text and code, with improvements in non-English languages, vision, and audio understanding.",
  "release_date": "2024-05-13",
  "announcement_date": "2024-05-13",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/api-reference",
  "source_playground": "https://chat.openai.com/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://openai.com/index/hello-gpt-4o/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.838358+00:00",
  "updated_at": "2025-07-19T19:49:05.838358+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/gpt-4o-2024-08-06/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1908,
    "benchmark_id": "activitynet",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.619,
    "normalized_score": 0.619,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/",
    "verified_by_llmstats": false,
    "analysis_method": "test set evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.381219+00:00",
    "updated_at": "2025-07-19T19:56:15.381219+00:00",
    "benchmark_name": "ActivityNet"
  },
  {
    "model_benchmark_id": 1262,
    "benchmark_id": "ai2d",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.942,
    "normalized_score": 0.942,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/",
    "verified_by_llmstats": false,
    "analysis_method": "test set evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.646808+00:00",
    "updated_at": "2025-07-19T19:56:13.646808+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 672,
    "benchmark_id": "aider-polyglot",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.307,
    "normalized_score": 0.307,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.391433+00:00",
    "updated_at": "2025-07-19T19:56:12.391433+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 1336,
    "benchmark_id": "aider-polyglot-edit",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.182,
    "normalized_score": 0.182,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.810263+00:00",
    "updated_at": "2025-07-19T19:56:13.810263+00:00",
    "benchmark_name": "Aider-Polyglot Edit"
  },
  {
    "model_benchmark_id": 488,
    "benchmark_id": "aime-2024",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.131,
    "normalized_score": 0.131,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.022775+00:00",
    "updated_at": "2025-07-19T19:56:12.022775+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 875,
    "benchmark_id": "chartqa",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.857,
    "normalized_score": 0.857,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/",
    "verified_by_llmstats": false,
    "analysis_method": "test set evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.824155+00:00",
    "updated_at": "2025-07-19T19:56:12.824155+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 1890,
    "benchmark_id": "charxiv-d",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.853,
    "normalized_score": 0.853,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.333294+00:00",
    "updated_at": "2025-07-19T19:56:15.333294+00:00",
    "benchmark_name": "CharXiv-D"
  },
  {
    "model_benchmark_id": 1838,
    "benchmark_id": "charxiv-r",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.588,
    "normalized_score": 0.588,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4o without thinking mode - Scientific figure reasoning and interpretation.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.203285+00:00",
    "updated_at": "2025-07-19T19:56:15.203285+00:00",
    "benchmark_name": "CharXiv-R"
  },
  {
    "model_benchmark_id": 1861,
    "benchmark_id": "collie",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.61,
    "normalized_score": 0.61,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4o without thinking mode - Instruction-following in freeform writing.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.262884+00:00",
    "updated_at": "2025-07-19T19:56:15.262884+00:00",
    "benchmark_name": "COLLIE"
  },
  {
    "model_benchmark_id": 1867,
    "benchmark_id": "tau2-airline",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.455,
    "normalized_score": 0.455,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4o without thinking mode - Function calling benchmark (airline domain).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Tau2 airline"
  },
  {
    "model_benchmark_id": 1868,
    "benchmark_id": "tau2-retail",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.634,
    "normalized_score": 0.634,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4o without thinking mode - Function calling benchmark (retail domain).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Tau2 retail"
  },
  {
    "model_benchmark_id": 1869,
    "benchmark_id": "tau2-telecom",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.235,
    "normalized_score": 0.235,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4o without thinking mode - Function calling benchmark (telecom domain).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Tau2 telecom"
  },
  {
    "model_benchmark_id": 1870,
    "benchmark_id": "mmmu-pro",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.599,
    "normalized_score": 0.599,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4o without thinking mode - Graduate-level visual problem-solving with advanced multimodal reasoning.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "MMMU-Pro"
  },
  {
    "model_benchmark_id": 1871,
    "benchmark_id": "videommmu",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.612,
    "normalized_score": 0.612,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4o without thinking mode - Video-based multimodal reasoning (max frame 256).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "VideoMMMU"
  },
  {
    "model_benchmark_id": 1872,
    "benchmark_id": "erqa",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.352,
    "normalized_score": 0.352,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4o without thinking mode - Multimodal spatial reasoning.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "ERQA"
  },
  {
    "model_benchmark_id": 1896,
    "benchmark_id": "complexfuncbench",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.665,
    "normalized_score": 0.665,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.349679+00:00",
    "updated_at": "2025-07-19T19:56:15.349679+00:00",
    "benchmark_name": "ComplexFuncBench"
  },
  {
    "model_benchmark_id": 900,
    "benchmark_id": "docvqa",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.928,
    "normalized_score": 0.928,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/",
    "verified_by_llmstats": false,
    "analysis_method": "test set evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.873722+00:00",
    "updated_at": "2025-07-19T19:56:12.873722+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 926,
    "benchmark_id": "egoschema",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.722,
    "normalized_score": 0.722,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/hello-gpt-4o/",
    "verified_by_llmstats": false,
    "analysis_method": "test set evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.935728+00:00",
    "updated_at": "2025-07-19T19:56:12.935728+00:00",
    "benchmark_name": "EgoSchema"
  },
  {
    "model_benchmark_id": 355,
    "benchmark_id": "gpqa",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.701,
    "normalized_score": 0.701,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4o - Diamond no thinking no tools",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.764329+00:00",
    "updated_at": "2025-07-19T19:56:11.764329+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1905,
    "benchmark_id": "graphwalks-bfs-<128k",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.417,
    "normalized_score": 0.417,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.370259+00:00",
    "updated_at": "2025-07-19T19:56:15.370259+00:00",
    "benchmark_name": "Graphwalks BFS <128k"
  },
  {
    "model_benchmark_id": 1882,
    "benchmark_id": "graphwalks-parents-<128k",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.354,
    "normalized_score": 0.354,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.314044+00:00",
    "updated_at": "2025-07-19T19:56:15.314044+00:00",
    "benchmark_name": "Graphwalks parents <128k"
  },
  {
    "model_benchmark_id": 636,
    "benchmark_id": "ifeval",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.81,
    "normalized_score": 0.81,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.306083+00:00",
    "updated_at": "2025-07-19T19:56:12.306083+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1849,
    "benchmark_id": "internal-api-instruction-following-(hard)",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.292,
    "normalized_score": 0.292,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.232334+00:00",
    "updated_at": "2025-07-19T19:56:15.232334+00:00",
    "benchmark_name": "Internal API instruction following (hard)"
  },
  {
    "model_benchmark_id": 544,
    "benchmark_id": "mathvista",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.614,
    "normalized_score": 0.614,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.122558+00:00",
    "updated_at": "2025-07-19T19:56:12.122558+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 122,
    "benchmark_id": "mmlu",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.857,
    "normalized_score": 0.857,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.325082+00:00",
    "updated_at": "2025-07-19T19:56:11.325082+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 220,
    "benchmark_id": "mmlu-pro",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.747,
    "normalized_score": 0.747,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot CoT",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.517058+00:00",
    "updated_at": "2025-07-19T19:56:11.517058+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1484,
    "benchmark_id": "mmmlu",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.814,
    "normalized_score": 0.814,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.162717+00:00",
    "updated_at": "2025-07-19T19:56:14.162717+00:00",
    "benchmark_name": "MMMLU"
  },
  {
    "model_benchmark_id": 594,
    "benchmark_id": "mmmu",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.722,
    "normalized_score": 0.722,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4o without thinking mode - College-level visual problem-solving with multimodal reasoning.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.224513+00:00",
    "updated_at": "2025-07-19T19:56:12.224513+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1855,
    "benchmark_id": "multichallenge-(o3-mini-grader)",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.399,
    "normalized_score": 0.399,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.246431+00:00",
    "updated_at": "2025-07-19T19:56:15.246431+00:00",
    "benchmark_name": "MultiChallenge (o3-mini grader)"
  },
  {
    "model_benchmark_id": 1654,
    "benchmark_id": "multi-if",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.609,
    "normalized_score": 0.609,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.650416+00:00",
    "updated_at": "2025-07-19T19:56:14.650416+00:00",
    "benchmark_name": "Multi-IF"
  },
  {
    "model_benchmark_id": 1867,
    "benchmark_id": "openai-mrcr:-2-needle-128k",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.319,
    "normalized_score": 0.319,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.277538+00:00",
    "updated_at": "2025-07-19T19:56:15.277538+00:00",
    "benchmark_name": "OpenAI-MRCR: 2 needle 128k"
  },
  {
    "model_benchmark_id": 239,
    "benchmark_id": "simpleqa",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.382,
    "normalized_score": 0.382,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-4-5/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.557852+00:00",
    "updated_at": "2025-07-19T19:56:11.557852+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 1359,
    "benchmark_id": "swe-bench-verified",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.332,
    "normalized_score": 0.332,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.861280+00:00",
    "updated_at": "2025-07-19T19:56:13.861280+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1899,
    "benchmark_id": "swe-lancer",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.326,
    "normalized_score": 0.326,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "percentage score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.356738+00:00",
    "updated_at": "2025-07-19T19:56:15.356738+00:00",
    "benchmark_name": "SWE-Lancer"
  },
  {
    "model_benchmark_id": 1902,
    "benchmark_id": "swe-lancer-(ic-diamond-subset)",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.124,
    "normalized_score": 0.124,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "percentage score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.363614+00:00",
    "updated_at": "2025-07-19T19:56:15.363614+00:00",
    "benchmark_name": "SWE-Lancer (IC-Diamond subset)"
  },
  {
    "model_benchmark_id": 1781,
    "benchmark_id": "tau-bench-airline",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.428,
    "normalized_score": 0.428,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.017725+00:00",
    "updated_at": "2025-07-19T19:56:15.017725+00:00",
    "benchmark_name": "TAU-bench Airline"
  },
  {
    "model_benchmark_id": 1767,
    "benchmark_id": "tau-bench-retail",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.603,
    "normalized_score": 0.603,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.988086+00:00",
    "updated_at": "2025-07-19T19:56:14.988086+00:00",
    "benchmark_name": "TAU-bench Retail"
  },
  {
    "model_benchmark_id": 2003,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.053,
    "normalized_score": 0.053,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4o without thinking mode (no tools) - Full set of expert-level questions across subjects.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 2005,
    "benchmark_id": "scale-multichallenge",
    "model_id": "gpt-4o-2024-08-06",
    "score": 0.403,
    "normalized_score": 0.403,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4o without thinking mode - Multi-turn instruction following benchmark.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Scale MultiChallenge"
  }
]


================================================
FILE: data/organizations/openai/models/gpt-4o-2024-08-06/model.json
================================================
{
  "model_id": "gpt-4o-2024-08-06",
  "name": "GPT-4o",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "GPT-4o ('o' for 'omni') is a multimodal AI model that accepts text, audio, image, and video inputs, and generates text, audio, and image outputs. It matches GPT-4 Turbo performance on text and code, with improvements in non-English languages, vision, and audio understanding.",
  "release_date": "2024-08-06",
  "announcement_date": "2024-08-06",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/api-reference",
  "source_playground": "https://chat.openai.com/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://openai.com/index/hello-gpt-4o/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.847621+00:00",
  "updated_at": "2025-07-19T19:49:05.847621+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/gpt-4o-mini-2024-07-18/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 964,
    "benchmark_id": "drop",
    "model_id": "gpt-4o-mini-2024-07-18",
    "score": 0.797,
    "normalized_score": 0.797,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/blog/gpt-4o-mini-announcement",
    "verified_by_llmstats": false,
    "analysis_method": "F1 Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.026741+00:00",
    "updated_at": "2025-07-19T19:56:13.026741+00:00",
    "benchmark_name": "DROP"
  },
  {
    "model_benchmark_id": 361,
    "benchmark_id": "gpqa",
    "model_id": "gpt-4o-mini-2024-07-18",
    "score": 0.402,
    "normalized_score": 0.402,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/blog/gpt-4o-mini-announcement",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.774361+00:00",
    "updated_at": "2025-07-19T19:56:11.774361+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 816,
    "benchmark_id": "humaneval",
    "model_id": "gpt-4o-mini-2024-07-18",
    "score": 0.872,
    "normalized_score": 0.872,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/blog/gpt-4o-mini-announcement",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.700095+00:00",
    "updated_at": "2025-07-19T19:56:12.700095+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 431,
    "benchmark_id": "math",
    "model_id": "gpt-4o-mini-2024-07-18",
    "score": 0.702,
    "normalized_score": 0.702,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/blog/gpt-4o-mini-announcement",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.911917+00:00",
    "updated_at": "2025-07-19T19:56:11.911917+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 548,
    "benchmark_id": "mathvista",
    "model_id": "gpt-4o-mini-2024-07-18",
    "score": 0.567,
    "normalized_score": 0.567,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/blog/gpt-4o-mini-announcement",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.128984+00:00",
    "updated_at": "2025-07-19T19:56:12.128984+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1301,
    "benchmark_id": "mgsm",
    "model_id": "gpt-4o-mini-2024-07-18",
    "score": 0.87,
    "normalized_score": 0.87,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/blog/gpt-4o-mini-announcement",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.720445+00:00",
    "updated_at": "2025-07-19T19:56:13.720445+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 128,
    "benchmark_id": "mmlu",
    "model_id": "gpt-4o-mini-2024-07-18",
    "score": 0.82,
    "normalized_score": 0.82,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/blog/gpt-4o-mini-announcement",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.335061+00:00",
    "updated_at": "2025-07-19T19:56:11.335061+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 598,
    "benchmark_id": "mmmu",
    "model_id": "gpt-4o-mini-2024-07-18",
    "score": 0.594,
    "normalized_score": 0.594,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/blog/gpt-4o-mini-announcement",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.232157+00:00",
    "updated_at": "2025-07-19T19:56:12.232157+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1363,
    "benchmark_id": "swe-bench-verified",
    "model_id": "gpt-4o-mini-2024-07-18",
    "score": 0.087,
    "normalized_score": 0.087,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass Rate",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.870038+00:00",
    "updated_at": "2025-07-19T19:56:13.870038+00:00",
    "benchmark_name": "SWE-Bench Verified"
  }
]

================================================
FILE: data/organizations/openai/models/gpt-4o-mini-2024-07-18/model.json
================================================
{
  "model_id": "gpt-4o-mini-2024-07-18",
  "name": "GPT-4o mini",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "GPT-4o mini is OpenAI's latest cost-efficient small model, designed to make AI intelligence more accessible and affordable. It excels in textual intelligence and multimodal reasoning, outperforming previous models like GPT-3.5 Turbo. With a context window of 128K tokens and support for text and vision, it offers low-cost, real-time applications such as customer support chatbots. Priced at 15 cents per million input tokens and 60 cents per million output tokens, it is significantly cheaper than its predecessors. Safety is prioritized with built-in measures and improved resistance to security threats.",
  "release_date": "2024-07-18",
  "announcement_date": "2024-07-18",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2023-10-01",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/api-reference",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.866393+00:00",
  "updated_at": "2025-07-19T19:49:05.866393+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/gpt-5-2025-08-07/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 9002,
    "benchmark_id": "swe-bench-verified",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.749,
    "normalized_score": 0.749,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "Thinking mode enabled (up to 128K tokens) with enhanced reasoning capabilities and iterative problem-solving approach.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 9004,
    "benchmark_id": "aider-polyglot",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.88,
    "normalized_score": 0.88,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "Thinking mode enabled (up to 128K tokens) with step-by-step reasoning and multi-language code understanding.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 10027,
    "benchmark_id": "swe-lancer-(ic-diamond-subset)",
    "model_id": "gpt-5-2025-08-07",
    "score": 1.0,
    "normalized_score": 1.0,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 - IC SWE Diamond Freelance Coding Tasks (earnings-based evaluation).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "SWE-Lancer (IC-Diamond subset)"
  },
  {
    "model_benchmark_id": 9020,
    "benchmark_id": "aime-2025",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.946,
    "normalized_score": 0.946,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 standard with thinking mode enabled (no tools) - competition mathematics.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 9009,
    "benchmark_id": "mmmu",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.842,
    "normalized_score": 0.842,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 with thinking mode - College-level visual problem-solving with multimodal reasoning.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 9006,
    "benchmark_id": "mmlu",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.925,
    "normalized_score": 0.925,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "Standard benchmark across multiple academic subjects with comprehensive knowledge evaluation.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 9007,
    "benchmark_id": "humaneval",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.934,
    "normalized_score": 0.934,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "Code generation benchmark with function completion tasks in Python.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 9008,
    "benchmark_id": "math",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.847,
    "normalized_score": 0.847,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "Thinking mode enabled with step-by-step mathematical problem solving and verification.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 9013,
    "benchmark_id": "healthbench-hard",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.016,
    "normalized_score": 0.016,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "Thinking mode enabled for medical hallucination detection. Measured inaccuracies on challenging healthcare conversations.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "HealthBench Hard"
  },
  {
    "model_benchmark_id": 9024,
    "benchmark_id": "frontiermath",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.263,
    "normalized_score": 0.263,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 standard with thinking mode enabled (with python tool only) - FrontierMath Tier 1-3 expert-level mathematics.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "FrontierMath"
  },
  {
    "model_benchmark_id": 9028,
    "benchmark_id": "hmmt-2025",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.933,
    "normalized_score": 0.933,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 standard with thinking mode enabled (no tools) - Harvard-MIT Mathematics Tournament.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "HMMT 2025"
  },
  {
    "model_benchmark_id": 9032,
    "benchmark_id": "gpqa",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.857,
    "normalized_score": 0.857,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 - Diamond thinking no tools",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 9037,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.248,
    "normalized_score": 0.248,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 standard with thinking mode (no tools) - Full set of expert-level questions across subjects.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 9041,
    "benchmark_id": "scale-multichallenge",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.696,
    "normalized_score": 0.696,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 with thinking mode enabled - Multi-turn instruction following benchmark.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Scale MultiChallenge"
  },
  {
    "model_benchmark_id": 9043,
    "benchmark_id": "browsecomp",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.549,
    "normalized_score": 0.549,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 with thinking mode enabled - Agentic search & browsing benchmark.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "BrowseComp"
  },
  {
    "model_benchmark_id": 9045,
    "benchmark_id": "collie",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.99,
    "normalized_score": 0.99,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 with thinking mode enabled - Instruction-following in freeform writing.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "COLLIE"
  },
  {
    "model_benchmark_id": 10034,
    "benchmark_id": "multichallenge-(o3-mini-grader)",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.696,
    "normalized_score": 0.696,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 with o3-mini grader - Multi-turn instruction following benchmark with improved grading accuracy.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "MultiChallenge (o3-mini grader)"
  },
  {
    "model_benchmark_id": 10035,
    "benchmark_id": "internal-api-instruction-following-(hard)",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.64,
    "normalized_score": 0.64,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 - Internal API instruction following evaluation (hard difficulty).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Internal API instruction following (hard)"
  },
  {
    "model_benchmark_id": 9047,
    "benchmark_id": "tau2-airline",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.626,
    "normalized_score": 0.626,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 - Function calling benchmark (airline domain).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Tau2 airline"
  },
  {
    "model_benchmark_id": 9049,
    "benchmark_id": "tau2-retail",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.811,
    "normalized_score": 0.811,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 with thinking mode - Function calling benchmark (retail domain).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Tau2 retail"
  },
  {
    "model_benchmark_id": 9051,
    "benchmark_id": "tau2-telecom",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.967,
    "normalized_score": 0.967,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 with thinking mode - Function calling benchmark (telecom domain).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Tau2 telecom"
  },
  {
    "model_benchmark_id": 9053,
    "benchmark_id": "mmmu-pro",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.784,
    "normalized_score": 0.784,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 with thinking mode - Graduate-level visual problem-solving with advanced multimodal reasoning.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "MMMU-Pro"
  },
  {
    "model_benchmark_id": 9055,
    "benchmark_id": "videommmu",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.846,
    "normalized_score": 0.846,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 with thinking mode - Video-based multimodal reasoning (max frame 256).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "VideoMMMU"
  },
  {
    "model_benchmark_id": 9057,
    "benchmark_id": "charxiv-r",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.811,
    "normalized_score": 0.811,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 with thinking mode - Scientific figure reasoning and interpretation.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "CharXiv-R"
  },
  {
    "model_benchmark_id": 9059,
    "benchmark_id": "erqa",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.657,
    "normalized_score": 0.657,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 with thinking mode - Multimodal spatial reasoning.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "ERQA"
  },
  {
    "model_benchmark_id": 10048,
    "benchmark_id": "openai-mrcr:-2-needle-128k",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.952,
    "normalized_score": 0.952,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "OpenAI-MRCR 2-needle retrieval at 128k tokens.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "OpenAI-MRCR: 2 needle 128k"
  },
  {
    "model_benchmark_id": 10049,
    "benchmark_id": "openai-mrcr:-2-needle-256k",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.868,
    "normalized_score": 0.868,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "OpenAI-MRCR 2-needle retrieval at 256k tokens.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "OpenAI-MRCR: 2 needle 256k"
  },
  {
    "model_benchmark_id": 10050,
    "benchmark_id": "graphwalks-bfs-<128k",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.783,
    "normalized_score": 0.783,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Graphwalks BFS (<128k) long-context reasoning.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Graphwalks BFS <128k"
  },
  {
    "model_benchmark_id": 10051,
    "benchmark_id": "graphwalks-parents-<128k",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.733,
    "normalized_score": 0.733,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Graphwalks parents (<128k) long-context reasoning.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Graphwalks parents <128k"
  },
  {
    "model_benchmark_id": 10052,
    "benchmark_id": "browsecomp-long-128k",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.9,
    "normalized_score": 0.9,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "BrowseComp long-context 128k variant.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "BrowseComp Long Context 128k"
  },
  {
    "model_benchmark_id": 10053,
    "benchmark_id": "browsecomp-long-256k",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.888,
    "normalized_score": 0.888,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "BrowseComp long-context 256k variant.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "BrowseComp Long Context 256k"
  },
  {
    "model_benchmark_id": 10054,
    "benchmark_id": "videomme-w-sub.",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.867,
    "normalized_score": 0.867,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "VideoMME (long) with subtitles category.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "VideoMME w sub."
  },
  {
    "model_benchmark_id": 10069,
    "benchmark_id": "longfact-concepts",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.007,
    "normalized_score": 0.007,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Thinking mode enabled for hallucination detection. Measured on open-source prompts for concept-based factual queries.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "LongFact-Concepts"
  },
  {
    "model_benchmark_id": 10070,
    "benchmark_id": "longfact-objects",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.008,
    "normalized_score": 0.008,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Thinking mode enabled for hallucination detection. Measured on open-source prompts for object-based factual queries.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "LongFact-Objects"
  },
  {
    "model_benchmark_id": 10071,
    "benchmark_id": "factscore",
    "model_id": "gpt-5-2025-08-07",
    "score": 0.01,
    "normalized_score": 0.01,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-5-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "Thinking mode enabled for factual accuracy assessment. Measured hallucination rate on open-source prompts.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "FactScore"
  }
]


================================================
FILE: data/organizations/openai/models/gpt-5-2025-08-07/model.json
================================================
{
  "model_id": "gpt-5-2025-08-07",
  "name": "GPT-5",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "GPT-5 is our flagship model for coding, reasoning, and agentic tasks across domains. The best model for coding and agentic tasks with higher reasoning capabilities and medium speed.",
  "release_date": "2025-08-07",
  "announcement_date": "2025-08-07",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-09-30",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/models/gpt-5",
  "source_playground": "https://platform.openai.com/playground?mode=chat&model=gpt-5",
  "source_paper": "https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf",
  "source_scorecard_blog_link": "https://openai.com/index/gpt-5/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-24T12:00:00.000000+00:00",
  "updated_at": "2025-07-24T12:00:00.000000+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/openai/models/gpt-5-codex-2025-09-15/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 10100,
    "benchmark_id": "swe-bench-verified",
    "model_id": "gpt-5-codex-2025-09-15",
    "score": 0.745,
    "normalized_score": 0.745,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-upgrades-to-codex/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 Codex specialized for code review and critical flaw detection with enhanced agentic coding capabilities.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-09-18T00:00:00.000000+00:00",
    "updated_at": "2025-09-18T00:00:00.000000+00:00",
    "benchmark_name": "SWE-Bench Verified"
  }
]

================================================
FILE: data/organizations/openai/models/gpt-5-codex-2025-09-15/model.json
================================================
{
  "model_id": "gpt-5-codex-2025-09-15",
  "name": "GPT-5 Codex",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "GPT-5 Codex has been trained specifically for conducting code reviews and finding critical flaws. When reviewing, it navigates your codebase and analyzes code patterns to identify potential security vulnerabilities, performance issues, and bugs.",
  "release_date": "2025-09-15",
  "announcement_date": "2025-09-15",
  "license_id": "proprietary",
  "multimodal": false,
  "knowledge_cutoff": "2024-09-30",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": false,
  "source_api_ref": "https://platform.openai.com/docs/models/gpt-5-codex",
  "source_playground": "https://platform.openai.com/playground?mode=chat&model=gpt-5-codex",
  "source_paper": null,
  "source_scorecard_blog_link": "https://openai.com/index/introducing-upgrades-to-codex/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-09-18T00:00:00.000000+00:00",
  "updated_at": "2025-09-18T00:00:00.000000+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/gpt-5-mini-2025-08-07/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 9021,
    "benchmark_id": "aime-2025",
    "model_id": "gpt-5-mini-2025-08-07",
    "score": 0.911,
    "normalized_score": 0.911,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 mini with thinking mode enabled (no tools) - competition mathematics.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 9025,
    "benchmark_id": "frontiermath",
    "model_id": "gpt-5-mini-2025-08-07",
    "score": 0.221,
    "normalized_score": 0.221,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 mini with thinking mode enabled (with python tool only) - FrontierMath Tier 1-3 expert-level mathematics.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "FrontierMath"
  },
  {
    "model_benchmark_id": 9033,
    "benchmark_id": "gpqa",
    "model_id": "gpt-5-mini-2025-08-07",
    "score": 0.823,
    "normalized_score": 0.823,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 mini - Diamond thinking no tools",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 9038,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "gpt-5-mini-2025-08-07",
    "score": 0.167,
    "normalized_score": 0.167,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 mini with thinking mode (no tools) - Full set of expert-level questions across subjects.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 9029,
    "benchmark_id": "hmmt-2025",
    "model_id": "gpt-5-mini-2025-08-07",
    "score": 0.878,
    "normalized_score": 0.878,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 mini with thinking mode enabled (no tools) - Harvard-MIT Mathematics Tournament.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "HMMT 2025"
  }
]


================================================
FILE: data/organizations/openai/models/gpt-5-mini-2025-08-07/model.json
================================================
{
  "model_id": "gpt-5-mini-2025-08-07",
  "name": "GPT-5 mini",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "A faster, more cost-efficient version of GPT-5 for well-defined tasks. Great for well-defined tasks and precise prompts with high reasoning capabilities at reduced cost.",
  "release_date": "2025-08-07",
  "announcement_date": "2025-08-07",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-05-30",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/models/gpt-5-mini",
  "source_playground": "https://platform.openai.com/playground?mode=chat&model=gpt-5-mini",
  "source_paper": null,
  "source_scorecard_blog_link": "https://openai.com/index/gpt-5/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-24T12:00:00.000000+00:00",
  "updated_at": "2025-07-24T12:00:00.000000+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/openai/models/gpt-5-nano-2025-08-07/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 9022,
    "benchmark_id": "aime-2025",
    "model_id": "gpt-5-nano-2025-08-07",
    "score": 0.852,
    "normalized_score": 0.852,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 nano with thinking mode enabled (no tools) - competition mathematics.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 9026,
    "benchmark_id": "frontiermath",
    "model_id": "gpt-5-nano-2025-08-07",
    "score": 0.096,
    "normalized_score": 0.096,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 nano with thinking mode enabled (with python tool only) - FrontierMath Tier 1-3 expert-level mathematics.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "FrontierMath"
  },
  {
    "model_benchmark_id": 9034,
    "benchmark_id": "gpqa",
    "model_id": "gpt-5-nano-2025-08-07",
    "score": 0.712,
    "normalized_score": 0.712,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 nano - Diamond thinking no tools",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 9039,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "gpt-5-nano-2025-08-07",
    "score": 0.087,
    "normalized_score": 0.087,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 nano with thinking mode (no tools) - Full set of expert-level questions across subjects.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 9030,
    "benchmark_id": "hmmt-2025",
    "model_id": "gpt-5-nano-2025-08-07",
    "score": 0.756,
    "normalized_score": 0.756,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-5 nano with thinking mode enabled (no tools) - Harvard-MIT Mathematics Tournament.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "HMMT 2025"
  }
]


================================================
FILE: data/organizations/openai/models/gpt-5-nano-2025-08-07/model.json
================================================
{
  "model_id": "gpt-5-nano-2025-08-07",
  "name": "GPT-5 nano",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "GPT-5 nano is our fastest, cheapest version of GPT-5. It's great for summarization and classification tasks with average reasoning capabilities and very fast speed.",
  "release_date": "2025-08-07",
  "announcement_date": "2025-08-07",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-05-30",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/models/gpt-5-nano",
  "source_playground": "https://platform.openai.com/playground?mode=chat&model=gpt-5-nano",
  "source_paper": null,
  "source_scorecard_blog_link": "https://openai.com/index/gpt-5/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-24T12:00:00.000000+00:00",
  "updated_at": "2025-07-24T12:00:00.000000+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/openai/models/gpt-oss-120b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 224,
    "benchmark_id": "codeforces",
    "model_id": "gpt-oss-120b",
    "score": 0.874,
    "normalized_score": 0.874,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Elo (with tools)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "Codeforces Competition code"
  },
  {
    "model_benchmark_id": 224,
    "benchmark_id": "codeforces",
    "model_id": "gpt-oss-120b",
    "score": 0.821,
    "normalized_score": 0.821,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Elo (without tools)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "Codeforces Competition code"
  },
  {
    "model_benchmark_id": 224,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "gpt-oss-120b",
    "score": 0.19,
    "normalized_score": 0.19,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy (with tools)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 224,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "gpt-oss-120b",
    "score": 0.149,
    "normalized_score": 0.149,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy (without tools)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 224,
    "benchmark_id": "healthbench",
    "model_id": "gpt-oss-120b",
    "score": 0.576,
    "normalized_score": 0.576,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "HealthBench - Realistic health conversations"
  },
  {
    "model_benchmark_id": 225,
    "benchmark_id": "healthbench-hard",
    "model_id": "gpt-oss-120b",
    "score": 0.3,
    "normalized_score": 0.3,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "HealthBench Hard - Challenging health conversations"
  },
  {
    "model_benchmark_id": 2226,
    "benchmark_id": "gpqa",
    "model_id": "gpt-oss-120b",
    "score": 0.801,
    "normalized_score": 0.801,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Without tools",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 22226,
    "benchmark_id": "mmlu",
    "model_id": "gpt-oss-120b",
    "score": 0.9,
    "normalized_score": 0.9,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Without tools",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "MMLU benchmark"
  },
  {
    "model_benchmark_id": 22226,
    "benchmark_id": "tau-bench-retail",
    "model_id": "gpt-oss-120b",
    "score": 0.678,
    "normalized_score": 0.678,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Function calling",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "TAU-bench Retail benchmark"
  }
]

================================================
FILE: data/organizations/openai/models/gpt-oss-120b/model.json
================================================
{
  "model_id": "gpt-oss-120b",
  "name": "GPT OSS 120B",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "GPT-OSS-120B is an open-weight, 116.8B-parameter Mixture-of-Experts (MoE) language model from OpenAI designed for high-reasoning, agentic, and general-purpose production use cases. It activates 5.1B parameters per forward pass and is optimized to run on a single H100 GPU with native MXFP4 quantization. The model supports configurable reasoning depth, full chain-of-thought access, and native tool use, including function calling, browsing, and structured output generation. It achieves near-parity with OpenAI o4-mini on core reasoning benchmarks. Note: While referred to as '120b' for simplicity, it technically has 116.8B parameters.",
  "release_date": "2025-08-05",
  "announcement_date": "2025-08-05",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 116800000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://gpt-oss.com/",
  "source_paper": "https://cdn.openai.com/pdf/419b6906-9da6-406c-a19d-1bb078ac7637/oai_gpt-oss_model_card.pdf",
  "source_scorecard_blog_link": "https://openai.com/index/gpt-oss-model-card/",
  "source_repo_link": "https://github.com/openai/gpt-oss",
  "source_weights_link": "https://huggingface.co/openai/gpt-oss-120b",
  "created_at": "2025-08-05T19:49:05.852855+00:00",
  "updated_at": "2025-08-05T19:49:05.852855+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/openai/models/gpt-oss-20b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 224,
    "benchmark_id": "codeforces",
    "model_id": "gpt-oss-20b",
    "score": 0.8387,
    "normalized_score": 0.8387,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Elo (with tools)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "Codeforces Competition code"
  },
  {
    "model_benchmark_id": 224,
    "benchmark_id": "codeforces",
    "model_id": "gpt-oss-20b",
    "score": 0.7433,
    "normalized_score": 0.7433,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Elo (without tools)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "Codeforces Competition code"
  },
  {
    "model_benchmark_id": 224,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "gpt-oss-20b",
    "score": 0.173,
    "normalized_score": 0.173,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy (with tools)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 224,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "gpt-oss-20b",
    "score": 0.109,
    "normalized_score": 0.109,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy (without tools)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 224,
    "benchmark_id": "healthbench",
    "model_id": "gpt-oss-20b",
    "score": 0.425,
    "normalized_score": 0.425,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "HealthBench - Realistic health conversations"
  },
  {
    "model_benchmark_id": 225,
    "benchmark_id": "healthbench-hard",
    "model_id": "gpt-oss-20b",
    "score": 0.108,
    "normalized_score": 0.108,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "HealthBench Hard - Challenging health conversations"
  },
  {
    "model_benchmark_id": 2226,
    "benchmark_id": "gpqa",
    "model_id": "gpt-oss-20b",
    "score": 0.715,
    "normalized_score": 0.715,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond (without tools)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 22226,
    "benchmark_id": "mmlu",
    "model_id": "gpt-oss-20b",
    "score": 0.853,
    "normalized_score": 0.853,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Without tools",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "MMLU benchmark"
  },
  {
    "model_benchmark_id": 22226,
    "benchmark_id": "tau-bench-retail",
    "model_id": "gpt-oss-20b",
    "score": 0.548,
    "normalized_score": 0.548,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-oss/",
    "verified_by_llmstats": false,
    "analysis_method": "Function calling",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-05T19:49:05.852855+00:00",
    "updated_at": "2025-08-05T19:49:05.852855+00:00",
    "benchmark_name": "TAU-bench Retail benchmark"
  }
]

================================================
FILE: data/organizations/openai/models/gpt-oss-20b/model.json
================================================
{
  "model_id": "gpt-oss-20b",
  "name": "GPT OSS 20B",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "The gpt-oss-20b model (technically 20.9B parameters) delivers similar results to OpenAI o3‑mini on common benchmarks and can run on edge devices with just 16 GB of memory, making it ideal for on-device use cases, local inference, or rapid iteration without costly infrastructure. Its larger sibling, gpt-oss-120b, achieves near-parity with OpenAI o4-mini on core reasoning benchmarks while running efficiently on a single 80 GB GPU. Both models also perform strongly on tool use, few-shot function calling, CoT reasoning (as seen in results on the Tau-Bench agentic evaluation suite) and HealthBench (even outperforming proprietary models like OpenAI o1 and GPT‑4o). Note: While referred to as '20b' for simplicity, it technically has 20.9B parameters.",
  "release_date": "2025-08-05",
  "announcement_date": "2025-08-05",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 20900000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://gpt-oss.com/",
  "source_paper": "https://cdn.openai.com/pdf/419b6906-9da6-406c-a19d-1bb078ac7637/oai_gpt-oss_model_card.pdf",
  "source_scorecard_blog_link": "https://openai.com/index/gpt-oss-model-card/",
  "source_repo_link": "https://github.com/openai/gpt-oss",
  "source_weights_link": "https://huggingface.co/openai/gpt-oss-20b",
  "created_at": "2025-08-05T19:49:05.852855+00:00",
  "updated_at": "2025-08-05T19:49:05.852855+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/openai/models/o1-2024-12-17/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 490,
    "benchmark_id": "aime-2024",
    "model_id": "o1-2024-12-17",
    "score": 0.743,
    "normalized_score": 0.743,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.025628+00:00",
    "updated_at": "2025-07-19T19:56:12.025628+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 1831,
    "benchmark_id": "frontiermath",
    "model_id": "o1-2024-12-17",
    "score": 0.055,
    "normalized_score": 0.055,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/o1-and-new-tools-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.186673+00:00",
    "updated_at": "2025-07-19T19:56:15.186673+00:00",
    "benchmark_name": "FrontierMath"
  },
  {
    "model_benchmark_id": 358,
    "benchmark_id": "gpqa",
    "model_id": "o1-2024-12-17",
    "score": 0.78,
    "normalized_score": 0.78,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/openai-o3-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.768954+00:00",
    "updated_at": "2025-07-19T19:56:11.768954+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1911,
    "benchmark_id": "gpqa-biology",
    "model_id": "o1-2024-12-17",
    "score": 0.692,
    "normalized_score": 0.692,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.394088+00:00",
    "updated_at": "2025-07-19T19:56:15.394088+00:00",
    "benchmark_name": "GPQA Biology"
  },
  {
    "model_benchmark_id": 1912,
    "benchmark_id": "gpqa-chemistry",
    "model_id": "o1-2024-12-17",
    "score": 0.647,
    "normalized_score": 0.647,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.399030+00:00",
    "updated_at": "2025-07-19T19:56:15.399030+00:00",
    "benchmark_name": "GPQA Chemistry"
  },
  {
    "model_benchmark_id": 1913,
    "benchmark_id": "gpqa-physics",
    "model_id": "o1-2024-12-17",
    "score": 0.928,
    "normalized_score": 0.928,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.403790+00:00",
    "updated_at": "2025-07-19T19:56:15.403790+00:00",
    "benchmark_name": "GPQA Physics"
  },
  {
    "model_benchmark_id": 1016,
    "benchmark_id": "gsm8k",
    "model_id": "o1-2024-12-17",
    "score": 0.971,
    "normalized_score": 0.971,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.116437+00:00",
    "updated_at": "2025-07-19T19:56:13.116437+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 814,
    "benchmark_id": "humaneval",
    "model_id": "o1-2024-12-17",
    "score": 0.881,
    "normalized_score": 0.881,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.696047+00:00",
    "updated_at": "2025-07-19T19:56:12.696047+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 755,
    "benchmark_id": "livebench",
    "model_id": "o1-2024-12-17",
    "score": 0.67,
    "normalized_score": 0.67,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/openai-o3-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "coding",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.587814+00:00",
    "updated_at": "2025-07-19T19:56:12.587814+00:00",
    "benchmark_name": "LiveBench"
  },
  {
    "model_benchmark_id": 428,
    "benchmark_id": "math",
    "model_id": "o1-2024-12-17",
    "score": 0.964,
    "normalized_score": 0.964,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.905279+00:00",
    "updated_at": "2025-07-19T19:56:11.905279+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 546,
    "benchmark_id": "mathvista",
    "model_id": "o1-2024-12-17",
    "score": 0.718,
    "normalized_score": 0.718,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.126058+00:00",
    "updated_at": "2025-07-19T19:56:12.126058+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1298,
    "benchmark_id": "mgsm",
    "model_id": "o1-2024-12-17",
    "score": 0.893,
    "normalized_score": 0.893,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/o1-and-new-tools-for-developers/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.715686+00:00",
    "updated_at": "2025-07-19T19:56:13.715686+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 125,
    "benchmark_id": "mmlu",
    "model_id": "o1-2024-12-17",
    "score": 0.918,
    "normalized_score": 0.918,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.330211+00:00",
    "updated_at": "2025-07-19T19:56:11.330211+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1486,
    "benchmark_id": "mmmlu",
    "model_id": "o1-2024-12-17",
    "score": 0.877,
    "normalized_score": 0.877,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.165932+00:00",
    "updated_at": "2025-07-19T19:56:14.165932+00:00",
    "benchmark_name": "MMMLU"
  },
  {
    "model_benchmark_id": 596,
    "benchmark_id": "mmmu",
    "model_id": "o1-2024-12-17",
    "score": 0.776,
    "normalized_score": 0.776,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.228467+00:00",
    "updated_at": "2025-07-19T19:56:12.228467+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 241,
    "benchmark_id": "simpleqa",
    "model_id": "o1-2024-12-17",
    "score": 0.47,
    "normalized_score": 0.47,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-4-5/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.561209+00:00",
    "updated_at": "2025-07-19T19:56:11.561209+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 1361,
    "benchmark_id": "swe-bench-verified",
    "model_id": "o1-2024-12-17",
    "score": 0.41,
    "normalized_score": 0.41,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "verified",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.865799+00:00",
    "updated_at": "2025-07-19T19:56:13.865799+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1783,
    "benchmark_id": "tau-bench-airline",
    "model_id": "o1-2024-12-17",
    "score": 0.5,
    "normalized_score": 0.5,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "agents",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.021642+00:00",
    "updated_at": "2025-07-19T19:56:15.021642+00:00",
    "benchmark_name": "TAU-bench Airline"
  },
  {
    "model_benchmark_id": 1769,
    "benchmark_id": "tau-bench-retail",
    "model_id": "o1-2024-12-17",
    "score": 0.708,
    "normalized_score": 0.708,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "agents",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.992114+00:00",
    "updated_at": "2025-07-19T19:56:14.992114+00:00",
    "benchmark_name": "TAU-bench Retail"
  }
]

================================================
FILE: data/organizations/openai/models/o1-2024-12-17/model.json
================================================
{
  "model_id": "o1-2024-12-17",
  "name": "o1",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "A reasoning model focused on mathematical and logical reasoning capabilities, demonstrating improved performance on tasks requiring step-by-step reasoning, mathematical problem-solving, and code generation. The model shows enhanced capabilities in formal reasoning while maintaining strong general capabilities.",
  "release_date": "2024-12-17",
  "announcement_date": "2024-12-17",
  "license_id": "proprietary",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/models",
  "source_playground": null,
  "source_paper": "https://cdn.openai.com/o1-system-card-20240917.pdf",
  "source_scorecard_blog_link": "https://openai.com/index/learning-to-reason-with-llms",
  "source_repo_link": "https://openai.com/index/o1-and-new-tools-for-developers/",
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.855348+00:00",
  "updated_at": "2025-07-19T19:49:05.855348+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/o1-mini/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1910,
    "benchmark_id": "cybersecurity-ctfs",
    "model_id": "o1-mini",
    "score": 0.287,
    "normalized_score": 0.287,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@12 accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.390045+00:00",
    "updated_at": "2025-07-19T19:56:15.390045+00:00",
    "benchmark_name": "Cybersecurity CTFs"
  },
  {
    "model_benchmark_id": 356,
    "benchmark_id": "gpqa",
    "model_id": "o1-mini",
    "score": 0.6,
    "normalized_score": 0.6,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond, 0-shot Chain of Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.765864+00:00",
    "updated_at": "2025-07-19T19:56:11.765864+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 812,
    "benchmark_id": "humaneval",
    "model_id": "o1-mini",
    "score": 0.924,
    "normalized_score": 0.924,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1 accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.692107+00:00",
    "updated_at": "2025-07-19T19:56:12.692107+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 513,
    "benchmark_id": "math-500",
    "model_id": "o1-mini",
    "score": 0.9,
    "normalized_score": 0.9,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Chain of Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.065288+00:00",
    "updated_at": "2025-07-19T19:56:12.065288+00:00",
    "benchmark_name": "MATH-500"
  },
  {
    "model_benchmark_id": 123,
    "benchmark_id": "mmlu",
    "model_id": "o1-mini",
    "score": 0.852,
    "normalized_score": 0.852,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot Chain of Thought",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.327239+00:00",
    "updated_at": "2025-07-19T19:56:11.327239+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1909,
    "benchmark_id": "superglue",
    "model_id": "o1-mini",
    "score": 0.75,
    "normalized_score": 0.75,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
    "verified_by_llmstats": false,
    "analysis_method": "Evaluation on validation set",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.385801+00:00",
    "updated_at": "2025-07-19T19:56:15.385801+00:00",
    "benchmark_name": "SuperGLUE"
  }
]

================================================
FILE: data/organizations/openai/models/o1-mini/model.json
================================================
{
  "model_id": "o1-mini",
  "name": "o1-mini",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "o1-mini is a cost-efficient language model developed by OpenAI, designed for advanced reasoning tasks while minimizing computational resources.",
  "release_date": "2024-09-12",
  "announcement_date": "2024-09-12",
  "license_id": "proprietary",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://openai.com/api/o1-mini",
  "source_playground": "https://platform.openai.com/playground",
  "source_paper": "https://cdn.openai.com/o1-system-card-20240917.pdf",
  "source_scorecard_blog_link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.850010+00:00",
  "updated_at": "2025-07-19T19:49:05.850010+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/o1-preview/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 491,
    "benchmark_id": "aime-2024",
    "model_id": "o1-preview",
    "score": 0.42,
    "normalized_score": 0.42,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.027037+00:00",
    "updated_at": "2025-07-19T19:56:12.027037+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 360,
    "benchmark_id": "gpqa",
    "model_id": "o1-preview",
    "score": 0.733,
    "normalized_score": 0.733,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.772534+00:00",
    "updated_at": "2025-07-19T19:56:11.772534+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 756,
    "benchmark_id": "livebench",
    "model_id": "o1-preview",
    "score": 0.523,
    "normalized_score": 0.523,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/",
    "verified_by_llmstats": false,
    "analysis_method": "Coding",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.589687+00:00",
    "updated_at": "2025-07-19T19:56:12.589687+00:00",
    "benchmark_name": "LiveBench"
  },
  {
    "model_benchmark_id": 430,
    "benchmark_id": "math",
    "model_id": "o1-preview",
    "score": 0.855,
    "normalized_score": 0.855,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.910412+00:00",
    "updated_at": "2025-07-19T19:56:11.910412+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1300,
    "benchmark_id": "mgsm",
    "model_id": "o1-preview",
    "score": 0.908,
    "normalized_score": 0.908,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.718867+00:00",
    "updated_at": "2025-07-19T19:56:13.718867+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 127,
    "benchmark_id": "mmlu",
    "model_id": "o1-preview",
    "score": 0.908,
    "normalized_score": 0.908,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.333269+00:00",
    "updated_at": "2025-07-19T19:56:11.333269+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 242,
    "benchmark_id": "simpleqa",
    "model_id": "o1-preview",
    "score": 0.424,
    "normalized_score": 0.424,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/",
    "verified_by_llmstats": false,
    "analysis_method": "Factuality",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.562695+00:00",
    "updated_at": "2025-07-19T19:56:11.562695+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 1362,
    "benchmark_id": "swe-bench-verified",
    "model_id": "o1-preview",
    "score": 0.413,
    "normalized_score": 0.413,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/learning-to-reason-with-llms/",
    "verified_by_llmstats": false,
    "analysis_method": "Verified",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.867753+00:00",
    "updated_at": "2025-07-19T19:56:13.867753+00:00",
    "benchmark_name": "SWE-Bench Verified"
  }
]

================================================
FILE: data/organizations/openai/models/o1-preview/model.json
================================================
{
  "model_id": "o1-preview",
  "name": "o1-preview",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "A research preview model focused on mathematical and logical reasoning capabilities, demonstrating improved performance on tasks requiring step-by-step reasoning, mathematical problem-solving, and code generation. The model shows enhanced capabilities in formal reasoning while maintaining strong general capabilities.",
  "release_date": "2024-09-12",
  "announcement_date": "2024-09-12",
  "license_id": "proprietary",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/models",
  "source_playground": null,
  "source_paper": "https://cdn.openai.com/o1-system-card-20240917.pdf",
  "source_scorecard_blog_link": "https://openai.com/index/learning-to-reason-with-llms",
  "source_repo_link": "https://github.com/openai",
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.862671+00:00",
  "updated_at": "2025-07-19T19:49:05.862671+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/o1-pro/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 487,
    "benchmark_id": "aime-2024",
    "model_id": "o1-pro",
    "score": 0.86,
    "normalized_score": 0.86,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-chatgpt-pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1 accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.021363+00:00",
    "updated_at": "2025-07-19T19:56:12.021363+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 354,
    "benchmark_id": "gpqa",
    "model_id": "o1-pro",
    "score": 0.79,
    "normalized_score": 0.79,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-chatgpt-pro/",
    "verified_by_llmstats": false,
    "analysis_method": "Diamond, Pass@1 accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.762804+00:00",
    "updated_at": "2025-07-19T19:56:11.762804+00:00",
    "benchmark_name": "GPQA"
  }
]

================================================
FILE: data/organizations/openai/models/o1-pro/model.json
================================================
{
  "model_id": "o1-pro",
  "name": "o1-pro",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "o1-pro is OpenAI's advanced language model optimized for complex reasoning and specialized professional tasks, offering enhanced capabilities while maintaining high efficiency.",
  "release_date": "2024-12-17",
  "announcement_date": "2024-12-17",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2023-09-30",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://openai.com/api",
  "source_playground": "https://platform.openai.com/playground",
  "source_paper": "https://cdn.openai.com/o1-system-card-20240917.pdf",
  "source_scorecard_blog_link": "https://openai.com/index/introducing-chatgpt-pro/",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.844613+00:00",
  "updated_at": "2025-07-19T19:49:05.844613+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/o3-2025-04-16/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 666,
    "benchmark_id": "aider-polyglot",
    "model_id": "o3-2025-04-16",
    "score": 0.813,
    "normalized_score": 0.813,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy (whole)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.380617+00:00",
    "updated_at": "2025-07-19T19:56:12.380617+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 481,
    "benchmark_id": "aime-2024",
    "model_id": "o3-2025-04-16",
    "score": 0.916,
    "normalized_score": 0.916,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy (no tools)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.012342+00:00",
    "updated_at": "2025-07-19T19:56:12.012342+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 705,
    "benchmark_id": "aime-2025",
    "model_id": "o3-2025-04-16",
    "score": 0.864,
    "normalized_score": 0.864,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1 (no tools)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.475926+00:00",
    "updated_at": "2025-07-19T19:56:12.475926+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1832,
    "benchmark_id": "arc-agi",
    "model_id": "o3-2025-04-16",
    "score": 0.88,
    "normalized_score": 0.88,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.youtube.com/live/SKBG1sqdyIU?si=lWccKHt8bnttuYta",
    "verified_by_llmstats": false,
    "analysis_method": "test set evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.190370+00:00",
    "updated_at": "2025-07-19T19:56:15.190370+00:00",
    "benchmark_name": "ARC-AGI"
  },
  {
    "model_benchmark_id": 1389,
    "benchmark_id": "arc-agi-v2",
    "model_id": "o3-2025-04-16",
    "score": 0.065,
    "normalized_score": 0.065,
    "is_self_reported": false,
    "self_reported_source_link": "https://x.com/xai/status/1943158495588815072",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.925569+00:00",
    "updated_at": "2025-07-19T19:56:13.925569+00:00",
    "benchmark_name": "ARC-AGI v2"
  },
  {
    "model_benchmark_id": 1842,
    "benchmark_id": "browsecomp",
    "model_id": "o3-2025-04-16",
    "score": 0.497,
    "normalized_score": 0.497,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy (with python + browsing)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.215315+00:00",
    "updated_at": "2025-07-19T19:56:15.215315+00:00",
    "benchmark_name": "BrowseComp"
  },
  {
    "model_benchmark_id": 1833,
    "benchmark_id": "charxiv-r",
    "model_id": "o3-2025-04-16",
    "score": 0.786,
    "normalized_score": 0.786,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "OpenAI o3 with thinking mode - Scientific figure reasoning and interpretation.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.193874+00:00",
    "updated_at": "2025-07-19T19:56:15.193874+00:00",
    "benchmark_name": "CharXiv-R"
  },
  {
    "model_benchmark_id": 1829,
    "benchmark_id": "frontiermath",
    "model_id": "o3-2025-04-16",
    "score": 0.158,
    "normalized_score": 0.158,
    "is_self_reported": true,
    "self_reported_source_link": "https://www.youtube.com/live/SKBG1sqdyIU?si=lWccKHt8bnttuYta",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.181554+00:00",
    "updated_at": "2025-07-19T19:56:15.181554+00:00",
    "benchmark_name": "FrontierMath"
  },
  {
    "model_benchmark_id": 347,
    "benchmark_id": "gpqa",
    "model_id": "o3-2025-04-16",
    "score": 0.833,
    "normalized_score": 0.833,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "OpenAI o3 - Diamond thinking no tools",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.750986+00:00",
    "updated_at": "2025-07-19T19:56:11.750986+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 725,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "o3-2025-04-16",
    "score": 0.202,
    "normalized_score": 0.202,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy (no tools)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.526631+00:00",
    "updated_at": "2025-07-19T19:56:12.526631+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 2001,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "o3-2025-04-16",
    "score": 0.243,
    "normalized_score": 0.243,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "OpenAI o3 with thinking mode enabled (Python + browser tools) - Full set of expert-level questions across subjects.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 2002,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "o3-2025-04-16",
    "score": 0.147,
    "normalized_score": 0.147,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "OpenAI o3 with thinking mode enabled (no tools) - Full set of expert-level questions across subjects.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 538,
    "benchmark_id": "mathvista",
    "model_id": "o3-2025-04-16",
    "score": 0.868,
    "normalized_score": 0.868,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.112692+00:00",
    "updated_at": "2025-07-19T19:56:12.112692+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 589,
    "benchmark_id": "mmmu",
    "model_id": "o3-2025-04-16",
    "score": 0.829,
    "normalized_score": 0.829,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "OpenAI o3 with thinking mode - College-level visual problem-solving with multimodal reasoning.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.211231+00:00",
    "updated_at": "2025-07-19T19:56:12.211231+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1840,
    "benchmark_id": "scale-multichallenge",
    "model_id": "o3-2025-04-16",
    "score": 0.565,
    "normalized_score": 0.565,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.208929+00:00",
    "updated_at": "2025-07-19T19:56:15.208929+00:00",
    "benchmark_name": "Scale MultiChallenge"
  },
  {
    "model_benchmark_id": 2004,
    "benchmark_id": "scale-multichallenge",
    "model_id": "o3-2025-04-16",
    "score": 0.604,
    "normalized_score": 0.604,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "OpenAI o3 with thinking mode enabled - Multi-turn instruction following benchmark.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Scale MultiChallenge"
  },
  {
    "model_benchmark_id": 2006,
    "benchmark_id": "collie",
    "model_id": "o3-2025-04-16",
    "score": 0.984,
    "normalized_score": 0.984,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "OpenAI o3 with thinking mode enabled - Instruction-following in freeform writing.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "COLLIE"
  },
  {
    "model_benchmark_id": 2007,
    "benchmark_id": "tau2-airline",
    "model_id": "o3-2025-04-16",
    "score": 0.648,
    "normalized_score": 0.648,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "OpenAI o3 with thinking mode - Function calling benchmark (airline domain).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Tau2 airline"
  },
  {
    "model_benchmark_id": 2008,
    "benchmark_id": "tau2-retail",
    "model_id": "o3-2025-04-16",
    "score": 0.802,
    "normalized_score": 0.802,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "OpenAI o3 with thinking mode - Function calling benchmark (retail domain).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Tau2 retail"
  },
  {
    "model_benchmark_id": 2009,
    "benchmark_id": "tau2-telecom",
    "model_id": "o3-2025-04-16",
    "score": 0.582,
    "normalized_score": 0.582,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "OpenAI o3 with thinking mode - Function calling benchmark (telecom domain).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "Tau2 telecom"
  },
  {
    "model_benchmark_id": 2010,
    "benchmark_id": "mmmu-pro",
    "model_id": "o3-2025-04-16",
    "score": 0.764,
    "normalized_score": 0.764,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "OpenAI o3 with thinking mode - Graduate-level visual problem-solving with advanced multimodal reasoning.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "MMMU-Pro"
  },
  {
    "model_benchmark_id": 2011,
    "benchmark_id": "videommmu",
    "model_id": "o3-2025-04-16",
    "score": 0.833,
    "normalized_score": 0.833,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "OpenAI o3 with thinking mode - Video-based multimodal reasoning (max frame 256).",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "VideoMMMU"
  },
  {
    "model_benchmark_id": 2012,
    "benchmark_id": "erqa",
    "model_id": "o3-2025-04-16",
    "score": 0.64,
    "normalized_score": 0.64,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-5/",
    "verified_by_llmstats": false,
    "analysis_method": "OpenAI o3 with thinking mode - Multimodal spatial reasoning.",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "benchmark_name": "ERQA"
  },
  {
    "model_benchmark_id": 1354,
    "benchmark_id": "swe-bench-verified",
    "model_id": "o3-2025-04-16",
    "score": 0.691,
    "normalized_score": 0.691,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.851256+00:00",
    "updated_at": "2025-07-19T19:56:13.851256+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1844,
    "benchmark_id": "tau-bench",
    "model_id": "o3-2025-04-16",
    "score": 0.63,
    "normalized_score": 0.63,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy (avg Airline/Retail)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.221470+00:00",
    "updated_at": "2025-07-19T19:56:15.221470+00:00",
    "benchmark_name": "Tau-bench"
  }
]


================================================
FILE: data/organizations/openai/models/o3-2025-04-16/model.json
================================================
{
  "model_id": "o3-2025-04-16",
  "name": "o3",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "OpenAI's most powerful reasoning model. o3 is a well-rounded and powerful model across domains. It sets a new standard for math, science, coding, and visual reasoning tasks. It also excels at technical writing and instruction-following. Use it to think through multi-step problems that involve analysis across text, code, and images.",
  "release_date": "2025-04-16",
  "announcement_date": "2025-04-16",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-05-31",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/models/o3",
  "source_playground": null,
  "source_paper": "https://cdn.openai.com/pdf/2221c875-02dc-4789-800b-e7758f3722c1/o3-and-o4-mini-system-card.pdf",
  "source_scorecard_blog_link": null,
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.818000+00:00",
  "updated_at": "2025-07-19T19:49:05.818000+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/o3-mini/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 670,
    "benchmark_id": "aider-polyglot",
    "model_id": "o3-mini",
    "score": 0.667,
    "normalized_score": 0.667,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "benchmark score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.387419+00:00",
    "updated_at": "2025-07-19T19:56:12.387419+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 1334,
    "benchmark_id": "aider-polyglot-edit",
    "model_id": "o3-mini",
    "score": 0.604,
    "normalized_score": 0.604,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "benchmark score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.806560+00:00",
    "updated_at": "2025-07-19T19:56:13.806560+00:00",
    "benchmark_name": "Aider-Polyglot Edit"
  },
  {
    "model_benchmark_id": 485,
    "benchmark_id": "aime-2024",
    "model_id": "o3-mini",
    "score": 0.873,
    "normalized_score": 0.873,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/openai-o3-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "test set evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.018382+00:00",
    "updated_at": "2025-07-19T19:56:12.018382+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 1859,
    "benchmark_id": "collie",
    "model_id": "o3-mini",
    "score": 0.987,
    "normalized_score": 0.987,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "benchmark score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.259314+00:00",
    "updated_at": "2025-07-19T19:56:15.259314+00:00",
    "benchmark_name": "COLLIE"
  },
  {
    "model_benchmark_id": 1894,
    "benchmark_id": "complexfuncbench",
    "model_id": "o3-mini",
    "score": 0.176,
    "normalized_score": 0.176,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "benchmark score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.344047+00:00",
    "updated_at": "2025-07-19T19:56:15.344047+00:00",
    "benchmark_name": "ComplexFuncBench"
  },
  {
    "model_benchmark_id": 1830,
    "benchmark_id": "frontiermath",
    "model_id": "o3-mini",
    "score": 0.092,
    "normalized_score": 0.092,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/openai-o3-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "pass @ 1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.183728+00:00",
    "updated_at": "2025-07-19T19:56:15.183728+00:00",
    "benchmark_name": "FrontierMath"
  },
  {
    "model_benchmark_id": 351,
    "benchmark_id": "gpqa",
    "model_id": "o3-mini",
    "score": 0.772,
    "normalized_score": 0.772,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "diamond",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.758026+00:00",
    "updated_at": "2025-07-19T19:56:11.758026+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1904,
    "benchmark_id": "graphwalks-bfs-<128k",
    "model_id": "o3-mini",
    "score": 0.51,
    "normalized_score": 0.51,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "benchmark score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.368369+00:00",
    "updated_at": "2025-07-19T19:56:15.368369+00:00",
    "benchmark_name": "Graphwalks BFS <128k"
  },
  {
    "model_benchmark_id": 1880,
    "benchmark_id": "graphwalks-parents-<128k",
    "model_id": "o3-mini",
    "score": 0.583,
    "normalized_score": 0.583,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "benchmark score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.310391+00:00",
    "updated_at": "2025-07-19T19:56:15.310391+00:00",
    "benchmark_name": "Graphwalks parents <128k"
  },
  {
    "model_benchmark_id": 634,
    "benchmark_id": "ifeval",
    "model_id": "o3-mini",
    "score": 0.939,
    "normalized_score": 0.939,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "benchmark score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.302770+00:00",
    "updated_at": "2025-07-19T19:56:12.302770+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 1847,
    "benchmark_id": "internal-api-instruction-following-(hard)",
    "model_id": "o3-mini",
    "score": 0.5,
    "normalized_score": 0.5,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "benchmark score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.228737+00:00",
    "updated_at": "2025-07-19T19:56:15.228737+00:00",
    "benchmark_name": "Internal API instruction following (hard)"
  },
  {
    "model_benchmark_id": 754,
    "benchmark_id": "livebench",
    "model_id": "o3-mini",
    "score": 0.846,
    "normalized_score": 0.846,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/openai-o3-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "o3-mini high",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.585789+00:00",
    "updated_at": "2025-07-19T19:56:12.585789+00:00",
    "benchmark_name": "LiveBench"
  },
  {
    "model_benchmark_id": 426,
    "benchmark_id": "math",
    "model_id": "o3-mini",
    "score": 0.979,
    "normalized_score": 0.979,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/openai-o3-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "o3-mini high",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.901889+00:00",
    "updated_at": "2025-07-19T19:56:11.901889+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1296,
    "benchmark_id": "mgsm",
    "model_id": "o3-mini",
    "score": 0.92,
    "normalized_score": 0.92,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/openai-o3-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "o3-mini high",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.712633+00:00",
    "updated_at": "2025-07-19T19:56:13.712633+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 119,
    "benchmark_id": "mmlu",
    "model_id": "o3-mini",
    "score": 0.869,
    "normalized_score": 0.869,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/openai-o3-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "o3-mini high",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.320589+00:00",
    "updated_at": "2025-07-19T19:56:11.320589+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 742,
    "benchmark_id": "multichallenge",
    "model_id": "o3-mini",
    "score": 0.399,
    "normalized_score": 0.399,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "benchmark score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.560158+00:00",
    "updated_at": "2025-07-19T19:56:12.560158+00:00",
    "benchmark_name": "MultiChallenge"
  },
  {
    "model_benchmark_id": 1853,
    "benchmark_id": "multichallenge-(o3-mini-grader)",
    "model_id": "o3-mini",
    "score": 0.502,
    "normalized_score": 0.502,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "benchmark score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.243415+00:00",
    "updated_at": "2025-07-19T19:56:15.243415+00:00",
    "benchmark_name": "MultiChallenge (o3-mini grader)"
  },
  {
    "model_benchmark_id": 1652,
    "benchmark_id": "multi-if",
    "model_id": "o3-mini",
    "score": 0.795,
    "normalized_score": 0.795,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "benchmark score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.646496+00:00",
    "updated_at": "2025-07-19T19:56:14.646496+00:00",
    "benchmark_name": "Multi-IF"
  },
  {
    "model_benchmark_id": 1474,
    "benchmark_id": "multilingual-mmlu",
    "model_id": "o3-mini",
    "score": 0.807,
    "normalized_score": 0.807,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "benchmark score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.143822+00:00",
    "updated_at": "2025-07-19T19:56:14.143822+00:00",
    "benchmark_name": "Multilingual MMLU"
  },
  {
    "model_benchmark_id": 1865,
    "benchmark_id": "openai-mrcr:-2-needle-128k",
    "model_id": "o3-mini",
    "score": 0.187,
    "normalized_score": 0.187,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "benchmark score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.274261+00:00",
    "updated_at": "2025-07-19T19:56:15.274261+00:00",
    "benchmark_name": "OpenAI-MRCR: 2 needle 128k"
  },
  {
    "model_benchmark_id": 238,
    "benchmark_id": "simpleqa",
    "model_id": "o3-mini",
    "score": 0.15,
    "normalized_score": 0.15,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-gpt-4-5/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.554563+00:00",
    "updated_at": "2025-07-19T19:56:11.554563+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 1357,
    "benchmark_id": "swe-bench-verified",
    "model_id": "o3-mini",
    "score": 0.493,
    "normalized_score": 0.493,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/openai-o3-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "verified",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.856039+00:00",
    "updated_at": "2025-07-19T19:56:13.856039+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1898,
    "benchmark_id": "swe-lancer",
    "model_id": "o3-mini",
    "score": 0.18,
    "normalized_score": 0.18,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "percentage score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.355089+00:00",
    "updated_at": "2025-07-19T19:56:15.355089+00:00",
    "benchmark_name": "SWE-Lancer"
  },
  {
    "model_benchmark_id": 1901,
    "benchmark_id": "swe-lancer-(ic-diamond-subset)",
    "model_id": "o3-mini",
    "score": 0.074,
    "normalized_score": 0.074,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "percentage score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.362026+00:00",
    "updated_at": "2025-07-19T19:56:15.362026+00:00",
    "benchmark_name": "SWE-Lancer (IC-Diamond subset)"
  },
  {
    "model_benchmark_id": 1779,
    "benchmark_id": "tau-bench-airline",
    "model_id": "o3-mini",
    "score": 0.324,
    "normalized_score": 0.324,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "benchmark score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.013372+00:00",
    "updated_at": "2025-07-19T19:56:15.013372+00:00",
    "benchmark_name": "TAU-bench Airline"
  },
  {
    "model_benchmark_id": 1765,
    "benchmark_id": "tau-bench-retail",
    "model_id": "o3-mini",
    "score": 0.576,
    "normalized_score": 0.576,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/gpt-4-1/",
    "verified_by_llmstats": false,
    "analysis_method": "benchmark score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.984653+00:00",
    "updated_at": "2025-07-19T19:56:14.984653+00:00",
    "benchmark_name": "TAU-bench Retail"
  }
]

================================================
FILE: data/organizations/openai/models/o3-mini/model.json
================================================
{
  "model_id": "o3-mini",
  "name": "o3-mini",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "A smaller variant of o3, expected to offer enhanced multimodal capabilities, improved reasoning, and more efficient resource utilization compared to previous models while maintaining strong performance on core tasks.",
  "release_date": "2025-01-30",
  "announcement_date": "2025-01-30",
  "license_id": "proprietary",
  "multimodal": false,
  "knowledge_cutoff": "2023-09-30",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/models",
  "source_playground": null,
  "source_paper": "https://cdn.openai.com/o3-mini-system-card.pdf",
  "source_scorecard_blog_link": "https://openai.com/index/openai-o3-mini/",
  "source_repo_link": "https://github.com/openai",
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.835007+00:00",
  "updated_at": "2025-07-19T19:49:05.835007+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/o3-pro-2025-06-10/model.json
================================================
{
  "model_id": "o3-pro-2025-06-10",
  "name": "o3-pro",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "Version of o3 with more compute for better responses. The o3-pro model uses more compute to think harder and provide consistently better answers. Designed to tackle tough problems with advanced reasoning capabilities.",
  "release_date": "2025-06-10",
  "announcement_date": "2025-06-10",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-05-31",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/models/o3-pro",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": null,
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.832229+00:00",
  "updated_at": "2025-07-19T19:49:05.832229+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/models/o4-mini/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 668,
    "benchmark_id": "aider-polyglot",
    "model_id": "o4-mini",
    "score": 0.689,
    "normalized_score": 0.689,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy (whole, o4-mini-high)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.384371+00:00",
    "updated_at": "2025-07-19T19:56:12.384371+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 1332,
    "benchmark_id": "aider-polyglot-edit",
    "model_id": "o4-mini",
    "score": 0.582,
    "normalized_score": 0.582,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy (diff, o4-mini-high)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.803065+00:00",
    "updated_at": "2025-07-19T19:56:13.803065+00:00",
    "benchmark_name": "Aider-Polyglot Edit"
  },
  {
    "model_benchmark_id": 483,
    "benchmark_id": "aime-2024",
    "model_id": "o4-mini",
    "score": 0.934,
    "normalized_score": 0.934,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy (no tools)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.015345+00:00",
    "updated_at": "2025-07-19T19:56:12.015345+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 706,
    "benchmark_id": "aime-2025",
    "model_id": "o4-mini",
    "score": 0.927,
    "normalized_score": 0.927,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy (no tools)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.477657+00:00",
    "updated_at": "2025-07-19T19:56:12.477657+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1843,
    "benchmark_id": "browsecomp",
    "model_id": "o4-mini",
    "score": 0.515,
    "normalized_score": 0.515,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy (with python + browsing)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.217475+00:00",
    "updated_at": "2025-07-19T19:56:15.217475+00:00",
    "benchmark_name": "BrowseComp"
  },
  {
    "model_benchmark_id": 1835,
    "benchmark_id": "charxiv-r",
    "model_id": "o4-mini",
    "score": 0.72,
    "normalized_score": 0.72,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.197036+00:00",
    "updated_at": "2025-07-19T19:56:15.197036+00:00",
    "benchmark_name": "CharXiv-R"
  },
  {
    "model_benchmark_id": 349,
    "benchmark_id": "gpqa",
    "model_id": "o4-mini",
    "score": 0.814,
    "normalized_score": 0.814,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "diamond accuracy (no tools)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.754610+00:00",
    "updated_at": "2025-07-19T19:56:11.754610+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 726,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "o4-mini",
    "score": 0.147,
    "normalized_score": 0.147,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy (no tools)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.528160+00:00",
    "updated_at": "2025-07-19T19:56:12.528160+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 540,
    "benchmark_id": "mathvista",
    "model_id": "o4-mini",
    "score": 0.843,
    "normalized_score": 0.843,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.115868+00:00",
    "updated_at": "2025-07-19T19:56:12.115868+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 591,
    "benchmark_id": "mmmu",
    "model_id": "o4-mini",
    "score": 0.816,
    "normalized_score": 0.816,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.218993+00:00",
    "updated_at": "2025-07-19T19:56:12.218993+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1841,
    "benchmark_id": "scale-multichallenge",
    "model_id": "o4-mini",
    "score": 0.43,
    "normalized_score": 0.43,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.211372+00:00",
    "updated_at": "2025-07-19T19:56:15.211372+00:00",
    "benchmark_name": "Scale MultiChallenge"
  },
  {
    "model_benchmark_id": 1356,
    "benchmark_id": "swe-bench-verified",
    "model_id": "o4-mini",
    "score": 0.681,
    "normalized_score": 0.681,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.854236+00:00",
    "updated_at": "2025-07-19T19:56:13.854236+00:00",
    "benchmark_name": "SWE-Bench Verified"
  },
  {
    "model_benchmark_id": 1777,
    "benchmark_id": "tau-bench-airline",
    "model_id": "o4-mini",
    "score": 0.492,
    "normalized_score": 0.492,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy (o4-mini-high)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.009611+00:00",
    "updated_at": "2025-07-19T19:56:15.009611+00:00",
    "benchmark_name": "TAU-bench Airline"
  },
  {
    "model_benchmark_id": 1763,
    "benchmark_id": "tau-bench-retail",
    "model_id": "o4-mini",
    "score": 0.718,
    "normalized_score": 0.718,
    "is_self_reported": true,
    "self_reported_source_link": "https://openai.com/index/introducing-o3-and-o4-mini/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy (o4-mini-high)",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.980200+00:00",
    "updated_at": "2025-07-19T19:56:14.980200+00:00",
    "benchmark_name": "TAU-bench Retail"
  }
]


================================================
FILE: data/organizations/openai/models/o4-mini/model.json
================================================
{
  "model_id": "o4-mini",
  "name": "o4-mini",
  "organization_id": "openai",
  "fine_tuned_from_model_id": null,
  "description": "o4-mini is OpenAI's latest small o-series model, optimized for fast, effective reasoning with exceptionally efficient performance in coding and visual tasks. It is faster and more affordable than o3.",
  "release_date": "2025-04-16",
  "announcement_date": "2025-04-16",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-05-31",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://platform.openai.com/docs/models/o4-mini",
  "source_playground": null,
  "source_paper": "https://cdn.openai.com/pdf/2221c875-02dc-4789-800b-e7758f3722c1/o3-and-o4-mini-system-card.pdf",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/openai",
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.824485+00:00",
  "updated_at": "2025-07-19T19:49:05.824485+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/openai/organization.json
================================================
{
  "organization_id": "openai",
  "name": "OpenAI",
  "website": "https://openai.com",
  "description": "Leading AI research company",
  "country": "US",
  "created_at": "2025-07-19T19:49:05.815252+00:00",
  "updated_at": "2025-07-19T19:49:05.815252+00:00"
}


================================================
FILE: data/organizations/qwen/models/qvq-72b-preview/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1675,
    "benchmark_id": "mathvision",
    "model_id": "qvq-72b-preview",
    "score": 0.359,
    "normalized_score": 0.359,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/QVQ-72B-Preview",
    "verified_by_llmstats": false,
    "analysis_method": "full",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.700746+00:00",
    "updated_at": "2025-07-19T19:56:14.700746+00:00",
    "benchmark_name": "MathVision"
  },
  {
    "model_benchmark_id": 526,
    "benchmark_id": "mathvista",
    "model_id": "qvq-72b-preview",
    "score": 0.714,
    "normalized_score": 0.714,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/QVQ-72B-Preview",
    "verified_by_llmstats": false,
    "analysis_method": "mini",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.092107+00:00",
    "updated_at": "2025-07-19T19:56:12.092107+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 570,
    "benchmark_id": "mmmu",
    "model_id": "qvq-72b-preview",
    "score": 0.703,
    "normalized_score": 0.703,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/QVQ-72B-Preview",
    "verified_by_llmstats": false,
    "analysis_method": "val",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.173084+00:00",
    "updated_at": "2025-07-19T19:56:12.173084+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1716,
    "benchmark_id": "olympiadbench",
    "model_id": "qvq-72b-preview",
    "score": 0.204,
    "normalized_score": 0.204,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/QVQ-72B-Preview",
    "verified_by_llmstats": false,
    "analysis_method": "full",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.824642+00:00",
    "updated_at": "2025-07-19T19:56:14.824642+00:00",
    "benchmark_name": "OlympiadBench"
  }
]

================================================
FILE: data/organizations/qwen/models/qvq-72b-preview/model.json
================================================
{
  "model_id": "qvq-72b-preview",
  "name": "QVQ-72B-Preview",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": "qwen2-vl-72b",
  "description": "An experimental research model focusing on advanced visual reasoning and step-by-step cognitive capabilities. Achieves strong performance on multi-modal science and mathematics tasks, though exhibits some limitations such as potential language mixing and recursive reasoning loops.",
  "release_date": "2024-12-25",
  "announcement_date": "2024-12-25",
  "license_id": "qwen",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 73400000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/Qwen/QVQ-72B-Preview",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qvq-72b-preview/",
  "source_repo_link": "https://github.com/QwenLM/Qwen2",
  "source_weights_link": "https://huggingface.co/Qwen/QVQ-72B-Preview",
  "created_at": "2025-07-19T19:49:05.895366+00:00",
  "updated_at": "2025-07-19T19:49:05.895366+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen-2.5-14b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 21,
    "benchmark_id": "arc-c",
    "model_id": "qwen-2.5-14b-instruct",
    "score": 0.673,
    "normalized_score": 0.673,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "ARC-C benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.127541+00:00",
    "updated_at": "2025-07-19T19:56:11.127541+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 971,
    "benchmark_id": "bbh",
    "model_id": "qwen-2.5-14b-instruct",
    "score": 0.782,
    "normalized_score": 0.782,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "BBH benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.042167+00:00",
    "updated_at": "2025-07-19T19:56:13.042167+00:00",
    "benchmark_name": "BBH"
  },
  {
    "model_benchmark_id": 301,
    "benchmark_id": "gpqa",
    "model_id": "qwen-2.5-14b-instruct",
    "score": 0.455,
    "normalized_score": 0.455,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "GPQA benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.677954+00:00",
    "updated_at": "2025-07-19T19:56:11.677954+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 994,
    "benchmark_id": "gsm8k",
    "model_id": "qwen-2.5-14b-instruct",
    "score": 0.948,
    "normalized_score": 0.948,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "GSM8K benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.082212+00:00",
    "updated_at": "2025-07-19T19:56:13.082212+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 786,
    "benchmark_id": "humaneval",
    "model_id": "qwen-2.5-14b-instruct",
    "score": 0.835,
    "normalized_score": 0.835,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "HumanEval benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.646500+00:00",
    "updated_at": "2025-07-19T19:56:12.646500+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1441,
    "benchmark_id": "humaneval+",
    "model_id": "qwen-2.5-14b-instruct",
    "score": 0.512,
    "normalized_score": 0.512,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "HumanEval+ benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.071967+00:00",
    "updated_at": "2025-07-19T19:56:14.071967+00:00",
    "benchmark_name": "HumanEval+"
  },
  {
    "model_benchmark_id": 404,
    "benchmark_id": "math",
    "model_id": "qwen-2.5-14b-instruct",
    "score": 0.8,
    "normalized_score": 0.8,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MATH benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.862254+00:00",
    "updated_at": "2025-07-19T19:56:11.862254+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1185,
    "benchmark_id": "mbpp",
    "model_id": "qwen-2.5-14b-instruct",
    "score": 0.82,
    "normalized_score": 0.82,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MBPP benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.497488+00:00",
    "updated_at": "2025-07-19T19:56:13.497488+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1602,
    "benchmark_id": "mbpp+",
    "model_id": "qwen-2.5-14b-instruct",
    "score": 0.632,
    "normalized_score": 0.632,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MBPP+ benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.507421+00:00",
    "updated_at": "2025-07-19T19:56:14.507421+00:00",
    "benchmark_name": "MBPP+"
  },
  {
    "model_benchmark_id": 89,
    "benchmark_id": "mmlu",
    "model_id": "qwen-2.5-14b-instruct",
    "score": 0.797,
    "normalized_score": 0.797,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MMLU benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.269091+00:00",
    "updated_at": "2025-07-19T19:56:11.269091+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 194,
    "benchmark_id": "mmlu-pro",
    "model_id": "qwen-2.5-14b-instruct",
    "score": 0.637,
    "normalized_score": 0.637,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MMLU-Pro benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.471047+00:00",
    "updated_at": "2025-07-19T19:56:11.471047+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 731,
    "benchmark_id": "mmlu-redux",
    "model_id": "qwen-2.5-14b-instruct",
    "score": 0.8,
    "normalized_score": 0.8,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MMLU-redux benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.538944+00:00",
    "updated_at": "2025-07-19T19:56:12.538944+00:00",
    "benchmark_name": "MMLU-Redux"
  },
  {
    "model_benchmark_id": 1600,
    "benchmark_id": "mmlu-stem",
    "model_id": "qwen-2.5-14b-instruct",
    "score": 0.764,
    "normalized_score": 0.764,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MMLU-STEM benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.500528+00:00",
    "updated_at": "2025-07-19T19:56:14.500528+00:00",
    "benchmark_name": "MMLU-STEM"
  },
  {
    "model_benchmark_id": 642,
    "benchmark_id": "multipl-e",
    "model_id": "qwen-2.5-14b-instruct",
    "score": 0.728,
    "normalized_score": 0.728,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MultiPL-E benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.319213+00:00",
    "updated_at": "2025-07-19T19:56:12.319213+00:00",
    "benchmark_name": "MultiPL-E"
  },
  {
    "model_benchmark_id": 1597,
    "benchmark_id": "theoremqa",
    "model_id": "qwen-2.5-14b-instruct",
    "score": 0.43,
    "normalized_score": 0.43,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "TheoremQA benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.492163+00:00",
    "updated_at": "2025-07-19T19:56:14.492163+00:00",
    "benchmark_name": "TheoremQA"
  },
  {
    "model_benchmark_id": 138,
    "benchmark_id": "truthfulqa",
    "model_id": "qwen-2.5-14b-instruct",
    "score": 0.584,
    "normalized_score": 0.584,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "TruthfulQA benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.355004+00:00",
    "updated_at": "2025-07-19T19:56:11.355004+00:00",
    "benchmark_name": "TruthfulQA"
  }
]

================================================
FILE: data/organizations/qwen/models/qwen-2.5-14b-instruct/model.json
================================================
{
  "model_id": "qwen-2.5-14b-instruct",
  "name": "Qwen2.5 14B Instruct",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": null,
  "description": "Qwen2.5-14B-Instruct is an instruction-tuned 14.7B parameter language model, part of the Qwen2.5 series. It features significant improvements in instruction following, long text generation (8K+ tokens), structured data understanding, and JSON output generation. The model supports a 128K token context length and multilingual capabilities across 29+ languages including Chinese, English, French, Spanish, and more.",
  "release_date": "2024-09-19",
  "announcement_date": "2024-09-19",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 14700000000,
  "training_tokens": 18000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.alibabacloud.com/help/en/model-studio/developer-reference/use-qwen-by-calling-api",
  "source_playground": null,
  "source_paper": "https://arxiv.org/abs/2407.10671",
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
  "source_repo_link": "https://github.com/QwenLM/Qwen2.5",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-14B-Instruct",
  "created_at": "2025-07-19T19:49:05.615575+00:00",
  "updated_at": "2025-07-19T19:49:05.615575+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen-2.5-32b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 18,
    "benchmark_id": "arc-c",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.704,
    "normalized_score": 0.704,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "ARC-C benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.121747+00:00",
    "updated_at": "2025-07-19T19:56:11.121747+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 970,
    "benchmark_id": "bbh",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.845,
    "normalized_score": 0.845,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "BBH benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.040428+00:00",
    "updated_at": "2025-07-19T19:56:13.040428+00:00",
    "benchmark_name": "BBH"
  },
  {
    "model_benchmark_id": 297,
    "benchmark_id": "gpqa",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.495,
    "normalized_score": 0.495,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "GPQA benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.671178+00:00",
    "updated_at": "2025-07-19T19:56:11.671178+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 990,
    "benchmark_id": "gsm8k",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.959,
    "normalized_score": 0.959,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "GSM8K benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.074870+00:00",
    "updated_at": "2025-07-19T19:56:13.074870+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 45,
    "benchmark_id": "hellaswag",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.852,
    "normalized_score": 0.852,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "HellaSwag benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.178158+00:00",
    "updated_at": "2025-07-19T19:56:11.178158+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 782,
    "benchmark_id": "humaneval",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.884,
    "normalized_score": 0.884,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "HumanEval benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.639922+00:00",
    "updated_at": "2025-07-19T19:56:12.639922+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1440,
    "benchmark_id": "humaneval+",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.524,
    "normalized_score": 0.524,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "HumanEval+ benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.070409+00:00",
    "updated_at": "2025-07-19T19:56:14.070409+00:00",
    "benchmark_name": "HumanEval+"
  },
  {
    "model_benchmark_id": 400,
    "benchmark_id": "math",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.831,
    "normalized_score": 0.831,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MATH benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.856115+00:00",
    "updated_at": "2025-07-19T19:56:11.856115+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1181,
    "benchmark_id": "mbpp",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.84,
    "normalized_score": 0.84,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MBPP benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.489427+00:00",
    "updated_at": "2025-07-19T19:56:13.489427+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1601,
    "benchmark_id": "mbpp+",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.672,
    "normalized_score": 0.672,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MBPP+ benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.504915+00:00",
    "updated_at": "2025-07-19T19:56:14.504915+00:00",
    "benchmark_name": "MBPP+"
  },
  {
    "model_benchmark_id": 85,
    "benchmark_id": "mmlu",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.833,
    "normalized_score": 0.833,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MMLU benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.261705+00:00",
    "updated_at": "2025-07-19T19:56:11.261705+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 190,
    "benchmark_id": "mmlu-pro",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.69,
    "normalized_score": 0.69,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MMLU-Pro benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.465052+00:00",
    "updated_at": "2025-07-19T19:56:11.465052+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 728,
    "benchmark_id": "mmlu-redux",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.839,
    "normalized_score": 0.839,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MMLU-redux benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.533630+00:00",
    "updated_at": "2025-07-19T19:56:12.533630+00:00",
    "benchmark_name": "MMLU-Redux"
  },
  {
    "model_benchmark_id": 1599,
    "benchmark_id": "mmlu-stem",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.809,
    "normalized_score": 0.809,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MMLU-STEM benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.498255+00:00",
    "updated_at": "2025-07-19T19:56:14.498255+00:00",
    "benchmark_name": "MMLU-STEM"
  },
  {
    "model_benchmark_id": 640,
    "benchmark_id": "multipl-e",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.754,
    "normalized_score": 0.754,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MultiPL-E benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.316384+00:00",
    "updated_at": "2025-07-19T19:56:12.316384+00:00",
    "benchmark_name": "MultiPL-E"
  },
  {
    "model_benchmark_id": 1593,
    "benchmark_id": "theoremqa",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.441,
    "normalized_score": 0.441,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "TheoremQA benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.482526+00:00",
    "updated_at": "2025-07-19T19:56:14.482526+00:00",
    "benchmark_name": "TheoremQA"
  },
  {
    "model_benchmark_id": 135,
    "benchmark_id": "truthfulqa",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.578,
    "normalized_score": 0.578,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "TruthfulQA benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.349397+00:00",
    "updated_at": "2025-07-19T19:56:11.349397+00:00",
    "benchmark_name": "TruthfulQA"
  },
  {
    "model_benchmark_id": 150,
    "benchmark_id": "winogrande",
    "model_id": "qwen-2.5-32b-instruct",
    "score": 0.82,
    "normalized_score": 0.82,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "Winogrande benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.384431+00:00",
    "updated_at": "2025-07-19T19:56:11.384431+00:00",
    "benchmark_name": "Winogrande"
  }
]

================================================
FILE: data/organizations/qwen/models/qwen-2.5-32b-instruct/model.json
================================================
{
  "model_id": "qwen-2.5-32b-instruct",
  "name": "Qwen2.5 32B Instruct",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": null,
  "description": "Qwen2.5-32B-Instruct is an instruction-tuned 32 billion parameter language model, part of the Qwen2.5 series. It is designed to follow instructions, generate long texts (over 8K tokens), understand structured data (e.g., tables), and generate structured outputs, especially JSON. The model supports multilingual capabilities across over 29 languages.",
  "release_date": "2024-09-19",
  "announcement_date": "2024-09-19",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 32500000000,
  "training_tokens": 18000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.alibabacloud.com/help/en/model-studio/developer-reference/use-qwen-by-calling-api",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5/",
  "source_repo_link": "https://github.com/QwenLM/Qwen2.5",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-32B-Instruct",
  "created_at": "2025-07-19T19:49:05.606261+00:00",
  "updated_at": "2025-07-19T19:49:05.606261+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen-2.5-72b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1617,
    "benchmark_id": "alignbench",
    "model_id": "qwen-2.5-72b-instruct",
    "score": 0.816,
    "normalized_score": 0.816,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/",
    "verified_by_llmstats": false,
    "analysis_method": "AlignBench v1.1 benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.546122+00:00",
    "updated_at": "2025-07-19T19:56:14.546122+00:00",
    "benchmark_name": "AlignBench"
  },
  {
    "model_benchmark_id": 1453,
    "benchmark_id": "arena-hard",
    "model_id": "qwen-2.5-72b-instruct",
    "score": 0.812,
    "normalized_score": 0.812,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/",
    "verified_by_llmstats": false,
    "analysis_method": "Arena Hard benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.097075+00:00",
    "updated_at": "2025-07-19T19:56:14.097075+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 303,
    "benchmark_id": "gpqa",
    "model_id": "qwen-2.5-72b-instruct",
    "score": 0.49,
    "normalized_score": 0.49,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/",
    "verified_by_llmstats": false,
    "analysis_method": "GPQA benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.681073+00:00",
    "updated_at": "2025-07-19T19:56:11.681073+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 996,
    "benchmark_id": "gsm8k",
    "model_id": "qwen-2.5-72b-instruct",
    "score": 0.958,
    "normalized_score": 0.958,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/",
    "verified_by_llmstats": false,
    "analysis_method": "GSM8K benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.085236+00:00",
    "updated_at": "2025-07-19T19:56:13.085236+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 787,
    "benchmark_id": "humaneval",
    "model_id": "qwen-2.5-72b-instruct",
    "score": 0.866,
    "normalized_score": 0.866,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/",
    "verified_by_llmstats": false,
    "analysis_method": "HumanEval benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.648406+00:00",
    "updated_at": "2025-07-19T19:56:12.648406+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 620,
    "benchmark_id": "ifeval",
    "model_id": "qwen-2.5-72b-instruct",
    "score": 0.841,
    "normalized_score": 0.841,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/",
    "verified_by_llmstats": false,
    "analysis_method": "IFEval strict-prompt benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.277303+00:00",
    "updated_at": "2025-07-19T19:56:12.277303+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 750,
    "benchmark_id": "livebench",
    "model_id": "qwen-2.5-72b-instruct",
    "score": 0.523,
    "normalized_score": 0.523,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/",
    "verified_by_llmstats": false,
    "analysis_method": "LiveBench benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.577555+00:00",
    "updated_at": "2025-07-19T19:56:12.577555+00:00",
    "benchmark_name": "LiveBench"
  },
  {
    "model_benchmark_id": 1124,
    "benchmark_id": "livecodebench",
    "model_id": "qwen-2.5-72b-instruct",
    "score": 0.555,
    "normalized_score": 0.555,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/",
    "verified_by_llmstats": false,
    "analysis_method": "LiveCodeBench benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.346315+00:00",
    "updated_at": "2025-07-19T19:56:13.346315+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 406,
    "benchmark_id": "math",
    "model_id": "qwen-2.5-72b-instruct",
    "score": 0.831,
    "normalized_score": 0.831,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/",
    "verified_by_llmstats": false,
    "analysis_method": "MATH benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.865721+00:00",
    "updated_at": "2025-07-19T19:56:11.865721+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1187,
    "benchmark_id": "mbpp",
    "model_id": "qwen-2.5-72b-instruct",
    "score": 0.882,
    "normalized_score": 0.882,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/",
    "verified_by_llmstats": false,
    "analysis_method": "MBPP benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.503069+00:00",
    "updated_at": "2025-07-19T19:56:13.503069+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 196,
    "benchmark_id": "mmlu-pro",
    "model_id": "qwen-2.5-72b-instruct",
    "score": 0.711,
    "normalized_score": 0.711,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MMLU-Pro benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.475182+00:00",
    "updated_at": "2025-07-19T19:56:11.475182+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 733,
    "benchmark_id": "mmlu-redux",
    "model_id": "qwen-2.5-72b-instruct",
    "score": 0.868,
    "normalized_score": 0.868,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/",
    "verified_by_llmstats": false,
    "analysis_method": "MMLU-redux benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.542364+00:00",
    "updated_at": "2025-07-19T19:56:12.542364+00:00",
    "benchmark_name": "MMLU-Redux"
  },
  {
    "model_benchmark_id": 1606,
    "benchmark_id": "mt-bench",
    "model_id": "qwen-2.5-72b-instruct",
    "score": 0.935,
    "normalized_score": 0.935,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/",
    "verified_by_llmstats": false,
    "analysis_method": "MT-bench benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.521232+00:00",
    "updated_at": "2025-07-19T19:56:14.521232+00:00",
    "benchmark_name": "MT-Bench"
  },
  {
    "model_benchmark_id": 644,
    "benchmark_id": "multipl-e",
    "model_id": "qwen-2.5-72b-instruct",
    "score": 0.751,
    "normalized_score": 0.751,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5/",
    "verified_by_llmstats": false,
    "analysis_method": "MultiPL-E benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.322800+00:00",
    "updated_at": "2025-07-19T19:56:12.322800+00:00",
    "benchmark_name": "MultiPL-E"
  }
]

================================================
FILE: data/organizations/qwen/models/qwen-2.5-72b-instruct/model.json
================================================
{
  "model_id": "qwen-2.5-72b-instruct",
  "name": "Qwen2.5 72B Instruct",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": null,
  "description": "Qwen2.5-72B-Instruct is an instruction-tuned 72 billion parameter language model, part of the Qwen2.5 series. It is designed to follow instructions, generate long texts (over 8K tokens), understand structured data (e.g., tables), and generate structured outputs, especially JSON. The model supports multilingual capabilities across over 29 languages.",
  "release_date": "2024-09-19",
  "announcement_date": "2024-09-19",
  "license_id": "qwen",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 72700000000,
  "training_tokens": 18000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.alibabacloud.com/help/en/model-studio/developer-reference/use-qwen-by-calling-api",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5/",
  "source_repo_link": "https://github.com/QwenLM/Qwen2.5",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
  "created_at": "2025-07-19T19:49:05.627855+00:00",
  "updated_at": "2025-07-19T19:49:05.627855+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen-2.5-7b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1618,
    "benchmark_id": "alignbench",
    "model_id": "qwen-2.5-7b-instruct",
    "score": 0.733,
    "normalized_score": 0.733,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "AlignBench v1.1 benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.548680+00:00",
    "updated_at": "2025-07-19T19:56:14.548680+00:00",
    "benchmark_name": "AlignBench"
  },
  {
    "model_benchmark_id": 1455,
    "benchmark_id": "arena-hard",
    "model_id": "qwen-2.5-7b-instruct",
    "score": 0.52,
    "normalized_score": 0.52,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "Arena Hard benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.100766+00:00",
    "updated_at": "2025-07-19T19:56:14.100766+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 306,
    "benchmark_id": "gpqa",
    "model_id": "qwen-2.5-7b-instruct",
    "score": 0.364,
    "normalized_score": 0.364,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "GPQA benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.685965+00:00",
    "updated_at": "2025-07-19T19:56:11.685965+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 998,
    "benchmark_id": "gsm8k",
    "model_id": "qwen-2.5-7b-instruct",
    "score": 0.916,
    "normalized_score": 0.916,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "GSM8K benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.088027+00:00",
    "updated_at": "2025-07-19T19:56:13.088027+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 789,
    "benchmark_id": "humaneval",
    "model_id": "qwen-2.5-7b-instruct",
    "score": 0.848,
    "normalized_score": 0.848,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "HumanEval benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.651744+00:00",
    "updated_at": "2025-07-19T19:56:12.651744+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 621,
    "benchmark_id": "ifeval",
    "model_id": "qwen-2.5-7b-instruct",
    "score": 0.712,
    "normalized_score": 0.712,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "IFEval strict-prompt benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.278867+00:00",
    "updated_at": "2025-07-19T19:56:12.278867+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 753,
    "benchmark_id": "livebench",
    "model_id": "qwen-2.5-7b-instruct",
    "score": 0.359,
    "normalized_score": 0.359,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "LiveBench 0831 benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.584018+00:00",
    "updated_at": "2025-07-19T19:56:12.584018+00:00",
    "benchmark_name": "LiveBench"
  },
  {
    "model_benchmark_id": 1126,
    "benchmark_id": "livecodebench",
    "model_id": "qwen-2.5-7b-instruct",
    "score": 0.287,
    "normalized_score": 0.287,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "LiveCodeBench 2305-2409 benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.352497+00:00",
    "updated_at": "2025-07-19T19:56:13.352497+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 408,
    "benchmark_id": "math",
    "model_id": "qwen-2.5-7b-instruct",
    "score": 0.755,
    "normalized_score": 0.755,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MATH benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.869960+00:00",
    "updated_at": "2025-07-19T19:56:11.869960+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1189,
    "benchmark_id": "mbpp",
    "model_id": "qwen-2.5-7b-instruct",
    "score": 0.792,
    "normalized_score": 0.792,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MBPP benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.506947+00:00",
    "updated_at": "2025-07-19T19:56:13.506947+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 198,
    "benchmark_id": "mmlu-pro",
    "model_id": "qwen-2.5-7b-instruct",
    "score": 0.563,
    "normalized_score": 0.563,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MMLU-Pro benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.479104+00:00",
    "updated_at": "2025-07-19T19:56:11.479104+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 735,
    "benchmark_id": "mmlu-redux",
    "model_id": "qwen-2.5-7b-instruct",
    "score": 0.754,
    "normalized_score": 0.754,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MMLU-redux benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.545338+00:00",
    "updated_at": "2025-07-19T19:56:12.545338+00:00",
    "benchmark_name": "MMLU-Redux"
  },
  {
    "model_benchmark_id": 1607,
    "benchmark_id": "mt-bench",
    "model_id": "qwen-2.5-7b-instruct",
    "score": 0.875,
    "normalized_score": 0.875,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MT-bench benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.523567+00:00",
    "updated_at": "2025-07-19T19:56:14.523567+00:00",
    "benchmark_name": "MT-Bench"
  },
  {
    "model_benchmark_id": 646,
    "benchmark_id": "multipl-e",
    "model_id": "qwen-2.5-7b-instruct",
    "score": 0.704,
    "normalized_score": 0.704,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
    "verified_by_llmstats": false,
    "analysis_method": "MultiPL-E benchmark evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.325846+00:00",
    "updated_at": "2025-07-19T19:56:12.325846+00:00",
    "benchmark_name": "MultiPL-E"
  }
]

================================================
FILE: data/organizations/qwen/models/qwen-2.5-7b-instruct/model.json
================================================
{
  "model_id": "qwen-2.5-7b-instruct",
  "name": "Qwen2.5 7B Instruct",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": null,
  "description": "Qwen2.5-7B-Instruct is an instruction-tuned 7B parameter language model that excels at following instructions, generating long texts (over 8K tokens), understanding structured data, and generating structured outputs like JSON. The model features enhanced capabilities in mathematics, coding, and multilingual support across 29+ languages including Chinese, English, French, Spanish, and more.",
  "release_date": "2024-09-19",
  "announcement_date": "2024-09-19",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 7610000000,
  "training_tokens": 18000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.alibabacloud.com/help/en/model-studio/developer-reference/use-qwen-by-calling-api",
  "source_playground": null,
  "source_paper": "https://arxiv.org/abs/2407.10671",
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5-llm/",
  "source_repo_link": "https://github.com/QwenLM/Qwen2.5",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct",
  "created_at": "2025-07-19T19:49:05.642960+00:00",
  "updated_at": "2025-07-19T19:49:05.642960+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen-2.5-coder-32b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 19,
    "benchmark_id": "arc-c",
    "model_id": "qwen-2.5-coder-32b-instruct",
    "score": 0.705,
    "normalized_score": 0.705,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.123905+00:00",
    "updated_at": "2025-07-19T19:56:11.123905+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1603,
    "benchmark_id": "bigcodebench-full",
    "model_id": "qwen-2.5-coder-32b-instruct",
    "score": 0.496,
    "normalized_score": 0.496,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.511653+00:00",
    "updated_at": "2025-07-19T19:56:14.511653+00:00",
    "benchmark_name": "BigCodeBench-Full"
  },
  {
    "model_benchmark_id": 1604,
    "benchmark_id": "bigcodebench-hard",
    "model_id": "qwen-2.5-coder-32b-instruct",
    "score": 0.27,
    "normalized_score": 0.27,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.515099+00:00",
    "updated_at": "2025-07-19T19:56:14.515099+00:00",
    "benchmark_name": "BigCodeBench-Hard"
  },
  {
    "model_benchmark_id": 991,
    "benchmark_id": "gsm8k",
    "model_id": "qwen-2.5-coder-32b-instruct",
    "score": 0.911,
    "normalized_score": 0.911,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.076453+00:00",
    "updated_at": "2025-07-19T19:56:13.076453+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 46,
    "benchmark_id": "hellaswag",
    "model_id": "qwen-2.5-coder-32b-instruct",
    "score": 0.83,
    "normalized_score": 0.83,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.180700+00:00",
    "updated_at": "2025-07-19T19:56:11.180700+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 783,
    "benchmark_id": "humaneval",
    "model_id": "qwen-2.5-coder-32b-instruct",
    "score": 0.927,
    "normalized_score": 0.927,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.641672+00:00",
    "updated_at": "2025-07-19T19:56:12.641672+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1117,
    "benchmark_id": "livecodebench",
    "model_id": "qwen-2.5-coder-32b-instruct",
    "score": 0.314,
    "normalized_score": 0.314,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.329968+00:00",
    "updated_at": "2025-07-19T19:56:13.329968+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 401,
    "benchmark_id": "math",
    "model_id": "qwen-2.5-coder-32b-instruct",
    "score": 0.572,
    "normalized_score": 0.572,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.857514+00:00",
    "updated_at": "2025-07-19T19:56:11.857514+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1182,
    "benchmark_id": "mbpp",
    "model_id": "qwen-2.5-coder-32b-instruct",
    "score": 0.902,
    "normalized_score": 0.902,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.491369+00:00",
    "updated_at": "2025-07-19T19:56:13.491369+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 86,
    "benchmark_id": "mmlu",
    "model_id": "qwen-2.5-coder-32b-instruct",
    "score": 0.751,
    "normalized_score": 0.751,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.263438+00:00",
    "updated_at": "2025-07-19T19:56:11.263438+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 191,
    "benchmark_id": "mmlu-pro",
    "model_id": "qwen-2.5-coder-32b-instruct",
    "score": 0.504,
    "normalized_score": 0.504,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.466410+00:00",
    "updated_at": "2025-07-19T19:56:11.466410+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 729,
    "benchmark_id": "mmlu-redux",
    "model_id": "qwen-2.5-coder-32b-instruct",
    "score": 0.775,
    "normalized_score": 0.775,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.535302+00:00",
    "updated_at": "2025-07-19T19:56:12.535302+00:00",
    "benchmark_name": "MMLU-Redux"
  },
  {
    "model_benchmark_id": 1594,
    "benchmark_id": "theoremqa",
    "model_id": "qwen-2.5-coder-32b-instruct",
    "score": 0.431,
    "normalized_score": 0.431,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.485084+00:00",
    "updated_at": "2025-07-19T19:56:14.485084+00:00",
    "benchmark_name": "TheoremQA"
  },
  {
    "model_benchmark_id": 136,
    "benchmark_id": "truthfulqa",
    "model_id": "qwen-2.5-coder-32b-instruct",
    "score": 0.542,
    "normalized_score": 0.542,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.351250+00:00",
    "updated_at": "2025-07-19T19:56:11.351250+00:00",
    "benchmark_name": "TruthfulQA"
  },
  {
    "model_benchmark_id": 1064,
    "benchmark_id": "winogrande",
    "model_id": "qwen-2.5-coder-32b-instruct",
    "score": 0.808,
    "normalized_score": 0.808,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.219435+00:00",
    "updated_at": "2025-07-19T19:56:13.219435+00:00",
    "benchmark_name": "Winogrande"
  }
]

================================================
FILE: data/organizations/qwen/models/qwen-2.5-coder-32b-instruct/model.json
================================================
{
  "model_id": "qwen-2.5-coder-32b-instruct",
  "name": "Qwen2.5-Coder 32B Instruct",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": "qwen-2.5-32b-instruct",
  "description": "Qwen2.5-Coder is a specialized coding model trained on 5.5 trillion tokens of code data, supporting 92 programming languages with a 128K context window. It excels in code generation, completion, repair, and multi-programming tasks while maintaining strong performance in mathematics and general capabilities.",
  "release_date": "2024-09-19",
  "announcement_date": "2024-09-19",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 32000000000,
  "training_tokens": 5500000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.alibabacloud.com/help/en/model-studio/developer-reference/use-qwen-by-calling-api",
  "source_playground": null,
  "source_paper": "https://arxiv.org/abs/2409.12186",
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5-coder/",
  "source_repo_link": "https://github.com/QwenLM/Qwen2.5-Coder",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct",
  "created_at": "2025-07-19T19:49:05.882455+00:00",
  "updated_at": "2025-07-19T19:49:05.882455+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen-2.5-coder-7b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1624,
    "benchmark_id": "aider",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.556,
    "normalized_score": 0.556,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.569369+00:00",
    "updated_at": "2025-07-19T19:56:14.569369+00:00",
    "benchmark_name": "Aider"
  },
  {
    "model_benchmark_id": 20,
    "benchmark_id": "arc-c",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.609,
    "normalized_score": 0.609,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.126002+00:00",
    "updated_at": "2025-07-19T19:56:11.126002+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 1434,
    "benchmark_id": "bigcodebench",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.41,
    "normalized_score": 0.41,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.052666+00:00",
    "updated_at": "2025-07-19T19:56:14.052666+00:00",
    "benchmark_name": "BigCodeBench"
  },
  {
    "model_benchmark_id": 1620,
    "benchmark_id": "cruxeval-input-cot",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.565,
    "normalized_score": 0.565,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.554528+00:00",
    "updated_at": "2025-07-19T19:56:14.554528+00:00",
    "benchmark_name": "CRUXEval-Input-CoT"
  },
  {
    "model_benchmark_id": 1621,
    "benchmark_id": "cruxeval-output-cot",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.56,
    "normalized_score": 0.56,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.558251+00:00",
    "updated_at": "2025-07-19T19:56:14.558251+00:00",
    "benchmark_name": "CRUXEval-Output-CoT"
  },
  {
    "model_benchmark_id": 993,
    "benchmark_id": "gsm8k",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.839,
    "normalized_score": 0.839,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.080381+00:00",
    "updated_at": "2025-07-19T19:56:13.080381+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 47,
    "benchmark_id": "hellaswag",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.768,
    "normalized_score": 0.768,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.182466+00:00",
    "updated_at": "2025-07-19T19:56:11.182466+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 785,
    "benchmark_id": "humaneval",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.884,
    "normalized_score": 0.884,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.644936+00:00",
    "updated_at": "2025-07-19T19:56:12.644936+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1121,
    "benchmark_id": "livecodebench",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.182,
    "normalized_score": 0.182,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.340042+00:00",
    "updated_at": "2025-07-19T19:56:13.340042+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 403,
    "benchmark_id": "math",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.466,
    "normalized_score": 0.466,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.860821+00:00",
    "updated_at": "2025-07-19T19:56:11.860821+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1184,
    "benchmark_id": "mbpp",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.835,
    "normalized_score": 0.835,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.495284+00:00",
    "updated_at": "2025-07-19T19:56:13.495284+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 88,
    "benchmark_id": "mmlu",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.676,
    "normalized_score": 0.676,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.267319+00:00",
    "updated_at": "2025-07-19T19:56:11.267319+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 1623,
    "benchmark_id": "mmlu-base",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.68,
    "normalized_score": 0.68,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.565292+00:00",
    "updated_at": "2025-07-19T19:56:14.565292+00:00",
    "benchmark_name": "MMLU-Base"
  },
  {
    "model_benchmark_id": 193,
    "benchmark_id": "mmlu-pro",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.401,
    "normalized_score": 0.401,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.469384+00:00",
    "updated_at": "2025-07-19T19:56:11.469384+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 730,
    "benchmark_id": "mmlu-redux",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.666,
    "normalized_score": 0.666,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.537049+00:00",
    "updated_at": "2025-07-19T19:56:12.537049+00:00",
    "benchmark_name": "MMLU-Redux"
  },
  {
    "model_benchmark_id": 1622,
    "benchmark_id": "stem",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.34,
    "normalized_score": 0.34,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.561469+00:00",
    "updated_at": "2025-07-19T19:56:14.561469+00:00",
    "benchmark_name": "STEM"
  },
  {
    "model_benchmark_id": 1596,
    "benchmark_id": "theoremqa",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.34,
    "normalized_score": 0.34,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.489921+00:00",
    "updated_at": "2025-07-19T19:56:14.489921+00:00",
    "benchmark_name": "TheoremQA"
  },
  {
    "model_benchmark_id": 137,
    "benchmark_id": "truthfulqa",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.506,
    "normalized_score": 0.506,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.353301+00:00",
    "updated_at": "2025-07-19T19:56:11.353301+00:00",
    "benchmark_name": "TruthfulQA"
  },
  {
    "model_benchmark_id": 1065,
    "benchmark_id": "winogrande",
    "model_id": "qwen-2.5-coder-7b-instruct",
    "score": 0.729,
    "normalized_score": 0.729,
    "is_self_reported": true,
    "self_reported_source_link": "https://arxiv.org/abs/2409.12186",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.221874+00:00",
    "updated_at": "2025-07-19T19:56:13.221874+00:00",
    "benchmark_name": "Winogrande"
  }
]

================================================
FILE: data/organizations/qwen/models/qwen-2.5-coder-7b-instruct/model.json
================================================
{
  "model_id": "qwen-2.5-coder-7b-instruct",
  "name": "Qwen2.5-Coder 7B Instruct",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": "qwen-2.5-7b-instruct",
  "description": "Qwen2.5-Coder is a specialized coding model trained on 5.5 trillion tokens of code data, supporting 92 programming languages with a 128K context window. It excels in code generation, completion, and repair while maintaining strong performance in math and general tasks. The model demonstrates exceptional capabilities in multi-programming language tasks and code reasoning.",
  "release_date": "2024-09-19",
  "announcement_date": "2024-09-19",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 7000000000,
  "training_tokens": 5500000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://www.alibabacloud.com/help/en/model-studio/developer-reference/use-qwen-by-calling-api",
  "source_playground": null,
  "source_paper": "https://arxiv.org/abs/2409.12186",
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5-coder/",
  "source_repo_link": "https://github.com/QwenLM/Qwen2.5-Coder",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct",
  "created_at": "2025-07-19T19:49:05.890300+00:00",
  "updated_at": "2025-07-19T19:49:05.890300+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen2-72b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 22,
    "benchmark_id": "arc-c",
    "model_id": "qwen2-72b-instruct",
    "score": 0.689,
    "normalized_score": 0.689,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.129146+00:00",
    "updated_at": "2025-07-19T19:56:11.129146+00:00",
    "benchmark_name": "ARC-C"
  },
  {
    "model_benchmark_id": 973,
    "benchmark_id": "bbh",
    "model_id": "qwen2-72b-instruct",
    "score": 0.824,
    "normalized_score": 0.824,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.045120+00:00",
    "updated_at": "2025-07-19T19:56:13.045120+00:00",
    "benchmark_name": "BBH"
  },
  {
    "model_benchmark_id": 437,
    "benchmark_id": "c-eval",
    "model_id": "qwen2-72b-instruct",
    "score": 0.838,
    "normalized_score": 0.838,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.926225+00:00",
    "updated_at": "2025-07-19T19:56:11.926225+00:00",
    "benchmark_name": "C-Eval"
  },
  {
    "model_benchmark_id": 1749,
    "benchmark_id": "cmmlu",
    "model_id": "qwen2-72b-instruct",
    "score": 0.901,
    "normalized_score": 0.901,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.943893+00:00",
    "updated_at": "2025-07-19T19:56:14.943893+00:00",
    "benchmark_name": "CMMLU"
  },
  {
    "model_benchmark_id": 372,
    "benchmark_id": "evalplus",
    "model_id": "qwen2-72b-instruct",
    "score": 0.79,
    "normalized_score": 0.79,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.802955+00:00",
    "updated_at": "2025-07-19T19:56:11.802955+00:00",
    "benchmark_name": "EvalPlus"
  },
  {
    "model_benchmark_id": 307,
    "benchmark_id": "gpqa",
    "model_id": "qwen2-72b-instruct",
    "score": 0.424,
    "normalized_score": 0.424,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.687633+00:00",
    "updated_at": "2025-07-19T19:56:11.687633+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 999,
    "benchmark_id": "gsm8k",
    "model_id": "qwen2-72b-instruct",
    "score": 0.911,
    "normalized_score": 0.911,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.089706+00:00",
    "updated_at": "2025-07-19T19:56:13.089706+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 48,
    "benchmark_id": "hellaswag",
    "model_id": "qwen2-72b-instruct",
    "score": 0.876,
    "normalized_score": 0.876,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.184833+00:00",
    "updated_at": "2025-07-19T19:56:11.184833+00:00",
    "benchmark_name": "HellaSwag"
  },
  {
    "model_benchmark_id": 790,
    "benchmark_id": "humaneval",
    "model_id": "qwen2-72b-instruct",
    "score": 0.86,
    "normalized_score": 0.86,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.653267+00:00",
    "updated_at": "2025-07-19T19:56:12.653267+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 409,
    "benchmark_id": "math",
    "model_id": "qwen2-72b-instruct",
    "score": 0.597,
    "normalized_score": 0.597,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.871582+00:00",
    "updated_at": "2025-07-19T19:56:11.871582+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1190,
    "benchmark_id": "mbpp",
    "model_id": "qwen2-72b-instruct",
    "score": 0.802,
    "normalized_score": 0.802,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.508406+00:00",
    "updated_at": "2025-07-19T19:56:13.508406+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 91,
    "benchmark_id": "mmlu",
    "model_id": "qwen2-72b-instruct",
    "score": 0.823,
    "normalized_score": 0.823,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.272629+00:00",
    "updated_at": "2025-07-19T19:56:11.272629+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 199,
    "benchmark_id": "mmlu-pro",
    "model_id": "qwen2-72b-instruct",
    "score": 0.644,
    "normalized_score": 0.644,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.480879+00:00",
    "updated_at": "2025-07-19T19:56:11.480879+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 647,
    "benchmark_id": "multipl-e",
    "model_id": "qwen2-72b-instruct",
    "score": 0.692,
    "normalized_score": 0.692,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.327331+00:00",
    "updated_at": "2025-07-19T19:56:12.327331+00:00",
    "benchmark_name": "MultiPL-E"
  },
  {
    "model_benchmark_id": 1598,
    "benchmark_id": "theoremqa",
    "model_id": "qwen2-72b-instruct",
    "score": 0.444,
    "normalized_score": 0.444,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.494165+00:00",
    "updated_at": "2025-07-19T19:56:14.494165+00:00",
    "benchmark_name": "TheoremQA"
  },
  {
    "model_benchmark_id": 139,
    "benchmark_id": "truthfulqa",
    "model_id": "qwen2-72b-instruct",
    "score": 0.548,
    "normalized_score": 0.548,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.356602+00:00",
    "updated_at": "2025-07-19T19:56:11.356602+00:00",
    "benchmark_name": "TruthfulQA"
  },
  {
    "model_benchmark_id": 151,
    "benchmark_id": "winogrande",
    "model_id": "qwen2-72b-instruct",
    "score": 0.851,
    "normalized_score": 0.851,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-72B",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.386216+00:00",
    "updated_at": "2025-07-19T19:56:11.386216+00:00",
    "benchmark_name": "Winogrande"
  }
]

================================================
FILE: data/organizations/qwen/models/qwen2-72b-instruct/model.json
================================================
{
  "model_id": "qwen2-72b-instruct",
  "name": "Qwen2 72B Instruct",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": null,
  "description": "Qwen2-72B-Instruct is an instruction-tuned language model with 72 billion parameters, supporting a context length of up to 131,072 tokens. It's part of the new Qwen2 series, which has surpassed most open-source models and demonstrates competitiveness against proprietary models across various benchmarks.",
  "release_date": "2024-07-23",
  "announcement_date": "2024-07-23",
  "license_id": "tongyi_qianwen",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 72000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/Qwen/Qwen2-72B",
  "source_playground": "https://huggingface.co/Qwen/Qwen2-72B",
  "source_paper": "https://arxiv.org/abs/2309.00071",
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2/",
  "source_repo_link": "https://huggingface.co/Qwen/Qwen2-72B",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen2-72B",
  "created_at": "2025-07-19T19:49:05.650844+00:00",
  "updated_at": "2025-07-19T19:49:05.650844+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen2-7b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1616,
    "benchmark_id": "alignbench",
    "model_id": "qwen2-7b-instruct",
    "score": 0.721,
    "normalized_score": 0.721,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.544441+00:00",
    "updated_at": "2025-07-19T19:56:14.544441+00:00",
    "benchmark_name": "AlignBench"
  },
  {
    "model_benchmark_id": 436,
    "benchmark_id": "c-eval",
    "model_id": "qwen2-7b-instruct",
    "score": 0.772,
    "normalized_score": 0.772,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.924104+00:00",
    "updated_at": "2025-07-19T19:56:11.924104+00:00",
    "benchmark_name": "C-Eval"
  },
  {
    "model_benchmark_id": 370,
    "benchmark_id": "evalplus",
    "model_id": "qwen2-7b-instruct",
    "score": 0.703,
    "normalized_score": 0.703,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.799094+00:00",
    "updated_at": "2025-07-19T19:56:11.799094+00:00",
    "benchmark_name": "EvalPlus"
  },
  {
    "model_benchmark_id": 299,
    "benchmark_id": "gpqa",
    "model_id": "qwen2-7b-instruct",
    "score": 0.253,
    "normalized_score": 0.253,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.674412+00:00",
    "updated_at": "2025-07-19T19:56:11.674412+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 992,
    "benchmark_id": "gsm8k",
    "model_id": "qwen2-7b-instruct",
    "score": 0.823,
    "normalized_score": 0.823,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.078833+00:00",
    "updated_at": "2025-07-19T19:56:13.078833+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 784,
    "benchmark_id": "humaneval",
    "model_id": "qwen2-7b-instruct",
    "score": 0.799,
    "normalized_score": 0.799,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.643272+00:00",
    "updated_at": "2025-07-19T19:56:12.643272+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1119,
    "benchmark_id": "livecodebench",
    "model_id": "qwen2-7b-instruct",
    "score": 0.266,
    "normalized_score": 0.266,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.335377+00:00",
    "updated_at": "2025-07-19T19:56:13.335377+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 402,
    "benchmark_id": "math",
    "model_id": "qwen2-7b-instruct",
    "score": 0.496,
    "normalized_score": 0.496,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.859120+00:00",
    "updated_at": "2025-07-19T19:56:11.859120+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1183,
    "benchmark_id": "mbpp",
    "model_id": "qwen2-7b-instruct",
    "score": 0.672,
    "normalized_score": 0.672,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.493272+00:00",
    "updated_at": "2025-07-19T19:56:13.493272+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 87,
    "benchmark_id": "mmlu",
    "model_id": "qwen2-7b-instruct",
    "score": 0.705,
    "normalized_score": 0.705,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.265352+00:00",
    "updated_at": "2025-07-19T19:56:11.265352+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 192,
    "benchmark_id": "mmlu-pro",
    "model_id": "qwen2-7b-instruct",
    "score": 0.441,
    "normalized_score": 0.441,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.467957+00:00",
    "updated_at": "2025-07-19T19:56:11.467957+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 1605,
    "benchmark_id": "mt-bench",
    "model_id": "qwen2-7b-instruct",
    "score": 0.841,
    "normalized_score": 0.841,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.519120+00:00",
    "updated_at": "2025-07-19T19:56:14.519120+00:00",
    "benchmark_name": "MT-Bench"
  },
  {
    "model_benchmark_id": 641,
    "benchmark_id": "multipl-e",
    "model_id": "qwen2-7b-instruct",
    "score": 0.591,
    "normalized_score": 0.591,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.317803+00:00",
    "updated_at": "2025-07-19T19:56:12.317803+00:00",
    "benchmark_name": "MultiPL-E"
  },
  {
    "model_benchmark_id": 1595,
    "benchmark_id": "theoremqa",
    "model_id": "qwen2-7b-instruct",
    "score": 0.253,
    "normalized_score": 0.253,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.487702+00:00",
    "updated_at": "2025-07-19T19:56:14.487702+00:00",
    "benchmark_name": "TheoremQA"
  }
]

================================================
FILE: data/organizations/qwen/models/qwen2-7b-instruct/model.json
================================================
{
  "model_id": "qwen2-7b-instruct",
  "name": "Qwen2 7B Instruct",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": null,
  "description": "Qwen2-7B-Instruct is an instruction-tuned language model with 7 billion parameters, supporting a context length of up to 131,072 tokens.",
  "release_date": "2024-07-23",
  "announcement_date": "2024-07-23",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 7620000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
  "source_playground": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
  "source_paper": "https://arxiv.org/abs/2309.00071",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen2-7B-Instruct",
  "created_at": "2025-07-19T19:49:05.612662+00:00",
  "updated_at": "2025-07-19T19:49:05.612662+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen2-vl-72b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 864,
    "benchmark_id": "chartqa",
    "model_id": "qwen2-vl-72b",
    "score": 0.883,
    "normalized_score": 0.883,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2",
    "verified_by_llmstats": false,
    "analysis_method": "score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.806635+00:00",
    "updated_at": "2025-07-19T19:56:12.806635+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 1629,
    "benchmark_id": "docvqatest",
    "model_id": "qwen2-vl-72b",
    "score": 0.965,
    "normalized_score": 0.965,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2",
    "verified_by_llmstats": false,
    "analysis_method": "score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.582058+00:00",
    "updated_at": "2025-07-19T19:56:14.582058+00:00",
    "benchmark_name": "DocVQAtest"
  },
  {
    "model_benchmark_id": 923,
    "benchmark_id": "egoschema",
    "model_id": "qwen2-vl-72b",
    "score": 0.779,
    "normalized_score": 0.779,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2",
    "verified_by_llmstats": false,
    "analysis_method": "score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.928297+00:00",
    "updated_at": "2025-07-19T19:56:12.928297+00:00",
    "benchmark_name": "EgoSchema"
  },
  {
    "model_benchmark_id": 1630,
    "benchmark_id": "infovqatest",
    "model_id": "qwen2-vl-72b",
    "score": 0.845,
    "normalized_score": 0.845,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2",
    "verified_by_llmstats": false,
    "analysis_method": "score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.586477+00:00",
    "updated_at": "2025-07-19T19:56:14.586477+00:00",
    "benchmark_name": "InfoVQAtest"
  },
  {
    "model_benchmark_id": 1269,
    "benchmark_id": "mathvista-mini",
    "model_id": "qwen2-vl-72b",
    "score": 0.705,
    "normalized_score": 0.705,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2",
    "verified_by_llmstats": false,
    "analysis_method": "score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.662750+00:00",
    "updated_at": "2025-07-19T19:56:13.662750+00:00",
    "benchmark_name": "MathVista-Mini"
  },
  {
    "model_benchmark_id": 1639,
    "benchmark_id": "mmbench-test",
    "model_id": "qwen2-vl-72b",
    "score": 0.865,
    "normalized_score": 0.865,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2",
    "verified_by_llmstats": false,
    "analysis_method": "score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.610292+00:00",
    "updated_at": "2025-07-19T19:56:14.610292+00:00",
    "benchmark_name": "MMBench_test"
  },
  {
    "model_benchmark_id": 1532,
    "benchmark_id": "mmmu-pro",
    "model_id": "qwen2-vl-72b",
    "score": 0.462,
    "normalized_score": 0.462,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2",
    "verified_by_llmstats": false,
    "analysis_method": "score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.292395+00:00",
    "updated_at": "2025-07-19T19:56:14.292395+00:00",
    "benchmark_name": "MMMU-Pro"
  },
  {
    "model_benchmark_id": 1628,
    "benchmark_id": "mmmuval",
    "model_id": "qwen2-vl-72b",
    "score": 0.645,
    "normalized_score": 0.645,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2",
    "verified_by_llmstats": false,
    "analysis_method": "score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.578458+00:00",
    "updated_at": "2025-07-19T19:56:14.578458+00:00",
    "benchmark_name": "MMMUval"
  },
  {
    "model_benchmark_id": 1640,
    "benchmark_id": "mmvetgpt4turbo",
    "model_id": "qwen2-vl-72b",
    "score": 0.74,
    "normalized_score": 0.74,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2",
    "verified_by_llmstats": false,
    "analysis_method": "score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.613913+00:00",
    "updated_at": "2025-07-19T19:56:14.613913+00:00",
    "benchmark_name": "MMVetGPT4Turbo"
  },
  {
    "model_benchmark_id": 1631,
    "benchmark_id": "mtvqa",
    "model_id": "qwen2-vl-72b",
    "score": 0.309,
    "normalized_score": 0.309,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2",
    "verified_by_llmstats": false,
    "analysis_method": "score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.590936+00:00",
    "updated_at": "2025-07-19T19:56:14.590936+00:00",
    "benchmark_name": "MTVQA"
  },
  {
    "model_benchmark_id": 1641,
    "benchmark_id": "mvbench",
    "model_id": "qwen2-vl-72b",
    "score": 0.736,
    "normalized_score": 0.736,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2",
    "verified_by_llmstats": false,
    "analysis_method": "score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.618622+00:00",
    "updated_at": "2025-07-19T19:56:14.618622+00:00",
    "benchmark_name": "MVBench"
  },
  {
    "model_benchmark_id": 1539,
    "benchmark_id": "ocrbench",
    "model_id": "qwen2-vl-72b",
    "score": 0.877,
    "normalized_score": 0.877,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2",
    "verified_by_llmstats": false,
    "analysis_method": "score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.311748+00:00",
    "updated_at": "2025-07-19T19:56:14.311748+00:00",
    "benchmark_name": "OCRBench"
  },
  {
    "model_benchmark_id": 1633,
    "benchmark_id": "realworldqa",
    "model_id": "qwen2-vl-72b",
    "score": 0.778,
    "normalized_score": 0.778,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2",
    "verified_by_llmstats": false,
    "analysis_method": "score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.597450+00:00",
    "updated_at": "2025-07-19T19:56:14.597450+00:00",
    "benchmark_name": "RealWorldQA"
  },
  {
    "model_benchmark_id": 909,
    "benchmark_id": "textvqa",
    "model_id": "qwen2-vl-72b",
    "score": 0.855,
    "normalized_score": 0.855,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2",
    "verified_by_llmstats": false,
    "analysis_method": "score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.894922+00:00",
    "updated_at": "2025-07-19T19:56:12.894922+00:00",
    "benchmark_name": "TextVQA"
  },
  {
    "model_benchmark_id": 1632,
    "benchmark_id": "vcr-en-easy",
    "model_id": "qwen2-vl-72b",
    "score": 0.9193,
    "normalized_score": 0.9193,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2",
    "verified_by_llmstats": false,
    "analysis_method": "score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.594379+00:00",
    "updated_at": "2025-07-19T19:56:14.594379+00:00",
    "benchmark_name": "VCR_en_easy"
  }
]

================================================
FILE: data/organizations/qwen/models/qwen2-vl-72b/model.json
================================================
{
  "model_id": "qwen2-vl-72b",
  "name": "Qwen2-VL-72B-Instruct",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": null,
  "description": "An instruction-tuned, large multimodal model that excels at visual understanding and step-by-step reasoning. It supports image and video input, with dynamic resolution handling and improved positional embeddings (M-ROPE), enabling advanced capabilities such as complex problem solving, multilingual text recognition in images, and agent-like interactions in video contexts.",
  "release_date": "2024-08-29",
  "announcement_date": "2024-08-29",
  "license_id": "tongyi_qianwen",
  "multimodal": true,
  "knowledge_cutoff": "2023-06-30",
  "param_count": 73400000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct",
  "source_playground": null,
  "source_paper": "https://arxiv.org/abs/2409.12191",
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2-vl/",
  "source_repo_link": "https://github.com/QwenLM/Qwen2-VL",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct",
  "created_at": "2025-07-19T19:49:05.619575+00:00",
  "updated_at": "2025-07-19T19:49:05.619575+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen2.5-omni-7b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1254,
    "benchmark_id": "ai2d",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.832,
    "normalized_score": 0.832,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.633399+00:00",
    "updated_at": "2025-07-19T19:56:13.633399+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 866,
    "benchmark_id": "chartqa",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.853,
    "normalized_score": 0.853,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.809953+00:00",
    "updated_at": "2025-07-19T19:56:12.809953+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 1718,
    "benchmark_id": "common-voice-15",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.076,
    "normalized_score": 0.076,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "WER",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.833534+00:00",
    "updated_at": "2025-07-19T19:56:14.833534+00:00",
    "benchmark_name": "Common Voice 15"
  },
  {
    "model_benchmark_id": 1717,
    "benchmark_id": "covost2-en-zh",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.414,
    "normalized_score": 0.414,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "BLEU",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.828460+00:00",
    "updated_at": "2025-07-19T19:56:14.828460+00:00",
    "benchmark_name": "CoVoST2 en-zh"
  },
  {
    "model_benchmark_id": 1719,
    "benchmark_id": "crperelation",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.765,
    "normalized_score": 0.765,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.837425+00:00",
    "updated_at": "2025-07-19T19:56:14.837425+00:00",
    "benchmark_name": "CRPErelation"
  },
  {
    "model_benchmark_id": 887,
    "benchmark_id": "docvqa",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.952,
    "normalized_score": 0.952,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.846061+00:00",
    "updated_at": "2025-07-19T19:56:12.846061+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 924,
    "benchmark_id": "egoschema",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.686,
    "normalized_score": 0.686,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.931056+00:00",
    "updated_at": "2025-07-19T19:56:12.931056+00:00",
    "benchmark_name": "EgoSchema"
  },
  {
    "model_benchmark_id": 1401,
    "benchmark_id": "fleurs",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.041,
    "normalized_score": 0.041,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "WER",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.953081+00:00",
    "updated_at": "2025-07-19T19:56:13.953081+00:00",
    "benchmark_name": "FLEURS"
  },
  {
    "model_benchmark_id": 1720,
    "benchmark_id": "giantsteps-tempo",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.88,
    "normalized_score": 0.88,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.841583+00:00",
    "updated_at": "2025-07-19T19:56:14.841583+00:00",
    "benchmark_name": "GiantSteps Tempo"
  },
  {
    "model_benchmark_id": 305,
    "benchmark_id": "gpqa",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.308,
    "normalized_score": 0.308,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.684328+00:00",
    "updated_at": "2025-07-19T19:56:11.684328+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 997,
    "benchmark_id": "gsm8k",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.887,
    "normalized_score": 0.887,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.086524+00:00",
    "updated_at": "2025-07-19T19:56:13.086524+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 788,
    "benchmark_id": "humaneval",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.787,
    "normalized_score": 0.787,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.650243+00:00",
    "updated_at": "2025-07-19T19:56:12.650243+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 752,
    "benchmark_id": "livebench",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.296,
    "normalized_score": 0.296,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.581448+00:00",
    "updated_at": "2025-07-19T19:56:12.581448+00:00",
    "benchmark_name": "LiveBench"
  },
  {
    "model_benchmark_id": 407,
    "benchmark_id": "math",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.715,
    "normalized_score": 0.715,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.867189+00:00",
    "updated_at": "2025-07-19T19:56:11.867189+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1676,
    "benchmark_id": "mathvision",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.25,
    "normalized_score": 0.25,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.702750+00:00",
    "updated_at": "2025-07-19T19:56:14.702750+00:00",
    "benchmark_name": "MathVision"
  },
  {
    "model_benchmark_id": 527,
    "benchmark_id": "mathvista",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.679,
    "normalized_score": 0.679,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.094090+00:00",
    "updated_at": "2025-07-19T19:56:12.094090+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 1188,
    "benchmark_id": "mbpp",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.732,
    "normalized_score": 0.732,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.504920+00:00",
    "updated_at": "2025-07-19T19:56:13.504920+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1721,
    "benchmark_id": "meld",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.57,
    "normalized_score": 0.57,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.845437+00:00",
    "updated_at": "2025-07-19T19:56:14.845437+00:00",
    "benchmark_name": "Meld"
  },
  {
    "model_benchmark_id": 1722,
    "benchmark_id": "mmau",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.656,
    "normalized_score": 0.656,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.849392+00:00",
    "updated_at": "2025-07-19T19:56:14.849392+00:00",
    "benchmark_name": "MMAU"
  },
  {
    "model_benchmark_id": 1723,
    "benchmark_id": "mmau-music",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.6916,
    "normalized_score": 0.6916,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.854098+00:00",
    "updated_at": "2025-07-19T19:56:14.854098+00:00",
    "benchmark_name": "MMAU Music"
  },
  {
    "model_benchmark_id": 1724,
    "benchmark_id": "mmau-sound",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.6787,
    "normalized_score": 0.6787,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.862523+00:00",
    "updated_at": "2025-07-19T19:56:14.862523+00:00",
    "benchmark_name": "MMAU Sound"
  },
  {
    "model_benchmark_id": 1725,
    "benchmark_id": "mmau-speech",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.5976,
    "normalized_score": 0.5976,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.867393+00:00",
    "updated_at": "2025-07-19T19:56:14.867393+00:00",
    "benchmark_name": "MMAU Speech"
  },
  {
    "model_benchmark_id": 1726,
    "benchmark_id": "mmbench-v1.1",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.818,
    "normalized_score": 0.818,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.871500+00:00",
    "updated_at": "2025-07-19T19:56:14.871500+00:00",
    "benchmark_name": "MMBench-V1.1"
  },
  {
    "model_benchmark_id": 1730,
    "benchmark_id": "mme-realworld",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.616,
    "normalized_score": 0.616,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.879804+00:00",
    "updated_at": "2025-07-19T19:56:14.879804+00:00",
    "benchmark_name": "MME-RealWorld"
  },
  {
    "model_benchmark_id": 197,
    "benchmark_id": "mmlu-pro",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.47,
    "normalized_score": 0.47,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.477278+00:00",
    "updated_at": "2025-07-19T19:56:11.477278+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 734,
    "benchmark_id": "mmlu-redux",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.71,
    "normalized_score": 0.71,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.544013+00:00",
    "updated_at": "2025-07-19T19:56:12.544013+00:00",
    "benchmark_name": "MMLU-Redux"
  },
  {
    "model_benchmark_id": 1731,
    "benchmark_id": "mm-mt-bench",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.06,
    "normalized_score": 0.06,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.883880+00:00",
    "updated_at": "2025-07-19T19:56:14.883880+00:00",
    "benchmark_name": "MM-MT-Bench"
  },
  {
    "model_benchmark_id": 571,
    "benchmark_id": "mmmu",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.592,
    "normalized_score": 0.592,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.175251+00:00",
    "updated_at": "2025-07-19T19:56:12.175251+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1534,
    "benchmark_id": "mmmu-pro",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.366,
    "normalized_score": 0.366,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.296124+00:00",
    "updated_at": "2025-07-19T19:56:14.296124+00:00",
    "benchmark_name": "MMMU-Pro"
  },
  {
    "model_benchmark_id": 1660,
    "benchmark_id": "mmstar",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.64,
    "normalized_score": 0.64,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.664551+00:00",
    "updated_at": "2025-07-19T19:56:14.664551+00:00",
    "benchmark_name": "MMStar"
  },
  {
    "model_benchmark_id": 1734,
    "benchmark_id": "muirbench",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.592,
    "normalized_score": 0.592,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.891075+00:00",
    "updated_at": "2025-07-19T19:56:14.891075+00:00",
    "benchmark_name": "MuirBench"
  },
  {
    "model_benchmark_id": 645,
    "benchmark_id": "multipl-e",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.658,
    "normalized_score": 0.658,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.324318+00:00",
    "updated_at": "2025-07-19T19:56:12.324318+00:00",
    "benchmark_name": "MultiPL-E"
  },
  {
    "model_benchmark_id": 1735,
    "benchmark_id": "musiccaps",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.328,
    "normalized_score": 0.328,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.894342+00:00",
    "updated_at": "2025-07-19T19:56:14.894342+00:00",
    "benchmark_name": "MusicCaps"
  },
  {
    "model_benchmark_id": 1643,
    "benchmark_id": "mvbench",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.703,
    "normalized_score": 0.703,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.621841+00:00",
    "updated_at": "2025-07-19T19:56:14.621841+00:00",
    "benchmark_name": "MVBench"
  },
  {
    "model_benchmark_id": 1736,
    "benchmark_id": "nmos",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.0451,
    "normalized_score": 0.0451,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen2.5-omni/",
    "verified_by_llmstats": false,
    "analysis_method": "NMOS",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.897653+00:00",
    "updated_at": "2025-07-19T19:56:14.897653+00:00",
    "benchmark_name": "NMOS"
  },
  {
    "model_benchmark_id": 1737,
    "benchmark_id": "ocrbench-v2",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.578,
    "normalized_score": 0.578,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.901546+00:00",
    "updated_at": "2025-07-19T19:56:14.901546+00:00",
    "benchmark_name": "OCRBench_V2"
  },
  {
    "model_benchmark_id": 1738,
    "benchmark_id": "odinw",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.424,
    "normalized_score": 0.424,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.905294+00:00",
    "updated_at": "2025-07-19T19:56:14.905294+00:00",
    "benchmark_name": "ODinW"
  },
  {
    "model_benchmark_id": 1739,
    "benchmark_id": "omnibench",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.5613,
    "normalized_score": 0.5613,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.909979+00:00",
    "updated_at": "2025-07-19T19:56:14.909979+00:00",
    "benchmark_name": "OmniBench"
  },
  {
    "model_benchmark_id": 1740,
    "benchmark_id": "omnibench-music",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.5283,
    "normalized_score": 0.5283,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.913742+00:00",
    "updated_at": "2025-07-19T19:56:14.913742+00:00",
    "benchmark_name": "OmniBench Music"
  },
  {
    "model_benchmark_id": 1741,
    "benchmark_id": "pointgrounding",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.665,
    "normalized_score": 0.665,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.918183+00:00",
    "updated_at": "2025-07-19T19:56:14.918183+00:00",
    "benchmark_name": "PointGrounding"
  },
  {
    "model_benchmark_id": 1634,
    "benchmark_id": "realworldqa",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.703,
    "normalized_score": 0.703,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.599392+00:00",
    "updated_at": "2025-07-19T19:56:14.599392+00:00",
    "benchmark_name": "RealWorldQA"
  },
  {
    "model_benchmark_id": 911,
    "benchmark_id": "textvqa",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.844,
    "normalized_score": 0.844,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.899579+00:00",
    "updated_at": "2025-07-19T19:56:12.899579+00:00",
    "benchmark_name": "TextVQA"
  },
  {
    "model_benchmark_id": 1685,
    "benchmark_id": "videomme-w-sub.",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.724,
    "normalized_score": 0.724,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.727965+00:00",
    "updated_at": "2025-07-19T19:56:14.727965+00:00",
    "benchmark_name": "VideoMME w sub."
  },
  {
    "model_benchmark_id": 1742,
    "benchmark_id": "vocalsound",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.939,
    "normalized_score": 0.939,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.921505+00:00",
    "updated_at": "2025-07-19T19:56:14.921505+00:00",
    "benchmark_name": "VocalSound"
  },
  {
    "model_benchmark_id": 1743,
    "benchmark_id": "voicebench-avg",
    "model_id": "qwen2.5-omni-7b",
    "score": 0.7412,
    "normalized_score": 0.7412,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.925208+00:00",
    "updated_at": "2025-07-19T19:56:14.925208+00:00",
    "benchmark_name": "VoiceBench Avg"
  }
]

================================================
FILE: data/organizations/qwen/models/qwen2.5-omni-7b/model.json
================================================
{
  "model_id": "qwen2.5-omni-7b",
  "name": "Qwen2.5-Omni-7B",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": null,
  "description": "Qwen2.5-Omni is the flagship end-to-end multimodal model in the Qwen series. It processes diverse inputs including text, images, audio, and video, delivering real-time streaming responses through text generation and natural speech synthesis using a novel Thinker-Talker architecture.",
  "release_date": "2025-03-27",
  "announcement_date": "2025-03-27",
  "license_id": "apache_2_0",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 7000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://chat.qwen.ai/",
  "source_paper": "https://arxiv.org/pdf/2503.20215",
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5-omni/",
  "source_repo_link": "https://github.com/QwenLM/Qwen2.5-Omni",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-Omni-7B",
  "created_at": "2025-07-19T19:49:05.639433+00:00",
  "updated_at": "2025-07-19T19:49:05.639433+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen2.5-vl-32b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1704,
    "benchmark_id": "aitz-em",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.831,
    "normalized_score": 0.831,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.791493+00:00",
    "updated_at": "2025-07-19T19:56:14.791493+00:00",
    "benchmark_name": "AITZ_EM"
  },
  {
    "model_benchmark_id": 1707,
    "benchmark_id": "android-control-high-em",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.696,
    "normalized_score": 0.696,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.798431+00:00",
    "updated_at": "2025-07-19T19:56:14.798431+00:00",
    "benchmark_name": "Android Control High_EM"
  },
  {
    "model_benchmark_id": 1710,
    "benchmark_id": "android-control-low-em",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.933,
    "normalized_score": 0.933,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.807428+00:00",
    "updated_at": "2025-07-19T19:56:14.807428+00:00",
    "benchmark_name": "Android Control Low_EM"
  },
  {
    "model_benchmark_id": 1713,
    "benchmark_id": "androidworld-sr",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.22,
    "normalized_score": 0.22,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "SR",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.815734+00:00",
    "updated_at": "2025-07-19T19:56:14.815734+00:00",
    "benchmark_name": "AndroidWorld_SR"
  },
  {
    "model_benchmark_id": 1658,
    "benchmark_id": "cc-ocr",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.771,
    "normalized_score": 0.771,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.659496+00:00",
    "updated_at": "2025-07-19T19:56:14.659496+00:00",
    "benchmark_name": "CC-OCR"
  },
  {
    "model_benchmark_id": 1695,
    "benchmark_id": "charadessta",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.542,
    "normalized_score": 0.542,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.765807+00:00",
    "updated_at": "2025-07-19T19:56:14.765807+00:00",
    "benchmark_name": "CharadesSTA"
  },
  {
    "model_benchmark_id": 889,
    "benchmark_id": "docvqa",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.948,
    "normalized_score": 0.948,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.850117+00:00",
    "updated_at": "2025-07-19T19:56:12.850117+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 1751,
    "benchmark_id": "gpqa",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.46,
    "normalized_score": 0.46,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.953480+00:00",
    "updated_at": "2025-07-19T19:56:14.953480+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 791,
    "benchmark_id": "humaneval",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.915,
    "normalized_score": 0.915,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.655022+00:00",
    "updated_at": "2025-07-19T19:56:12.655022+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 1243,
    "benchmark_id": "infovqa",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.834,
    "normalized_score": 0.834,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.612560+00:00",
    "updated_at": "2025-07-19T19:56:13.612560+00:00",
    "benchmark_name": "InfoVQA"
  },
  {
    "model_benchmark_id": 830,
    "benchmark_id": "lvbench",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.49,
    "normalized_score": 0.49,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.733525+00:00",
    "updated_at": "2025-07-19T19:56:12.733525+00:00",
    "benchmark_name": "LVBench"
  },
  {
    "model_benchmark_id": 410,
    "benchmark_id": "math",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.822,
    "normalized_score": 0.822,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.873375+00:00",
    "updated_at": "2025-07-19T19:56:11.873375+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1678,
    "benchmark_id": "mathvision",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.384,
    "normalized_score": 0.384,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-VL",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.707439+00:00",
    "updated_at": "2025-07-19T19:56:14.707439+00:00",
    "benchmark_name": "MathVision"
  },
  {
    "model_benchmark_id": 1272,
    "benchmark_id": "mathvista-mini",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.747,
    "normalized_score": 0.747,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.668155+00:00",
    "updated_at": "2025-07-19T19:56:13.668155+00:00",
    "benchmark_name": "MathVista-Mini"
  },
  {
    "model_benchmark_id": 1191,
    "benchmark_id": "mbpp",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.84,
    "normalized_score": 0.84,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.509907+00:00",
    "updated_at": "2025-07-19T19:56:13.509907+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1690,
    "benchmark_id": "mmbench-video",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.0193,
    "normalized_score": 0.0193,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-VL",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.747059+00:00",
    "updated_at": "2025-07-19T19:56:14.747059+00:00",
    "benchmark_name": "MMBench-Video"
  },
  {
    "model_benchmark_id": 92,
    "benchmark_id": "mmlu",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.784,
    "normalized_score": 0.784,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.274441+00:00",
    "updated_at": "2025-07-19T19:56:11.274441+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 200,
    "benchmark_id": "mmlu-pro",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.688,
    "normalized_score": 0.688,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.482355+00:00",
    "updated_at": "2025-07-19T19:56:11.482355+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 573,
    "benchmark_id": "mmmu",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.7,
    "normalized_score": 0.7,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.179390+00:00",
    "updated_at": "2025-07-19T19:56:12.179390+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1536,
    "benchmark_id": "mmmu-pro",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.495,
    "normalized_score": 0.495,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.299391+00:00",
    "updated_at": "2025-07-19T19:56:14.299391+00:00",
    "benchmark_name": "MMMU-Pro"
  },
  {
    "model_benchmark_id": 1662,
    "benchmark_id": "mmstar",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.695,
    "normalized_score": 0.695,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.668445+00:00",
    "updated_at": "2025-07-19T19:56:14.668445+00:00",
    "benchmark_name": "MMStar"
  },
  {
    "model_benchmark_id": 1745,
    "benchmark_id": "ocrbench-v2-(en)",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.572,
    "normalized_score": 0.572,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.930331+00:00",
    "updated_at": "2025-07-19T19:56:14.930331+00:00",
    "benchmark_name": "OCRBench-V2 (en)"
  },
  {
    "model_benchmark_id": 1750,
    "benchmark_id": "ocrbench-v2-(zh)",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.591,
    "normalized_score": 0.591,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.947420+00:00",
    "updated_at": "2025-07-19T19:56:14.947420+00:00",
    "benchmark_name": "OCRBench-V2 (zh)"
  },
  {
    "model_benchmark_id": 1748,
    "benchmark_id": "osworld",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.0592,
    "normalized_score": 0.0592,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.939263+00:00",
    "updated_at": "2025-07-19T19:56:14.939263+00:00",
    "benchmark_name": "OSWorld"
  },
  {
    "model_benchmark_id": 1698,
    "benchmark_id": "screenspot",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.885,
    "normalized_score": 0.885,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.775538+00:00",
    "updated_at": "2025-07-19T19:56:14.775538+00:00",
    "benchmark_name": "ScreenSpot"
  },
  {
    "model_benchmark_id": 1701,
    "benchmark_id": "screenspot-pro",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.394,
    "normalized_score": 0.394,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.783897+00:00",
    "updated_at": "2025-07-19T19:56:14.783897+00:00",
    "benchmark_name": "ScreenSpot Pro"
  },
  {
    "model_benchmark_id": 1683,
    "benchmark_id": "videomme-w-o-sub.",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.705,
    "normalized_score": 0.705,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.722056+00:00",
    "updated_at": "2025-07-19T19:56:14.722056+00:00",
    "benchmark_name": "VideoMME w/o sub."
  },
  {
    "model_benchmark_id": 1686,
    "benchmark_id": "videomme-w-sub.",
    "model_id": "qwen2.5-vl-32b",
    "score": 0.779,
    "normalized_score": 0.779,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.729388+00:00",
    "updated_at": "2025-07-19T19:56:14.729388+00:00",
    "benchmark_name": "VideoMME w sub."
  }
]

================================================
FILE: data/organizations/qwen/models/qwen2.5-vl-32b/model.json
================================================
{
  "model_id": "qwen2.5-vl-32b",
  "name": "Qwen2.5 VL 32B Instruct",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": null,
  "description": "Qwen2.5-VL is a vision-language model from the Qwen family. Key enhancements include visual understanding (objects, text, charts, layouts), visual agent capabilities (tool use, computer/phone control), long video comprehension with event pinpointing, visual localization (bounding boxes/points), and structured output generation.",
  "release_date": "2025-02-28",
  "announcement_date": "2025-02-28",
  "license_id": "apache_2_0",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 33500000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://chat.qwen.ai/",
  "source_paper": "https://arxiv.org/pdf/2502.13923",
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5-vl/",
  "source_repo_link": "https://github.com/QwenLM/Qwen2.5-VL",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct",
  "created_at": "2025-07-19T19:49:05.653921+00:00",
  "updated_at": "2025-07-19T19:49:05.653921+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen2.5-vl-72b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1255,
    "benchmark_id": "ai2d",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.884,
    "normalized_score": 0.884,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.635049+00:00",
    "updated_at": "2025-07-19T19:56:13.635049+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 1703,
    "benchmark_id": "aitz-em",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.832,
    "normalized_score": 0.832,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.789425+00:00",
    "updated_at": "2025-07-19T19:56:14.789425+00:00",
    "benchmark_name": "AITZ_EM"
  },
  {
    "model_benchmark_id": 1706,
    "benchmark_id": "android-control-high-em",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.6736,
    "normalized_score": 0.6736,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.796411+00:00",
    "updated_at": "2025-07-19T19:56:14.796411+00:00",
    "benchmark_name": "Android Control High_EM"
  },
  {
    "model_benchmark_id": 1709,
    "benchmark_id": "android-control-low-em",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.937,
    "normalized_score": 0.937,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.805303+00:00",
    "updated_at": "2025-07-19T19:56:14.805303+00:00",
    "benchmark_name": "Android Control Low_EM"
  },
  {
    "model_benchmark_id": 1712,
    "benchmark_id": "androidworld-sr",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.35,
    "normalized_score": 0.35,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "SR",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.813492+00:00",
    "updated_at": "2025-07-19T19:56:14.813492+00:00",
    "benchmark_name": "AndroidWorld_SR"
  },
  {
    "model_benchmark_id": 1657,
    "benchmark_id": "cc-ocr",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.798,
    "normalized_score": 0.798,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.657333+00:00",
    "updated_at": "2025-07-19T19:56:14.657333+00:00",
    "benchmark_name": "CC-OCR"
  },
  {
    "model_benchmark_id": 867,
    "benchmark_id": "chartqa",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.895,
    "normalized_score": 0.895,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.811401+00:00",
    "updated_at": "2025-07-19T19:56:12.811401+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 888,
    "benchmark_id": "docvqa",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.964,
    "normalized_score": 0.964,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.848273+00:00",
    "updated_at": "2025-07-19T19:56:12.848273+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 925,
    "benchmark_id": "egoschema",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.762,
    "normalized_score": 0.762,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.933582+00:00",
    "updated_at": "2025-07-19T19:56:12.933582+00:00",
    "benchmark_name": "EgoSchema"
  },
  {
    "model_benchmark_id": 1673,
    "benchmark_id": "hallusion-bench",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.5516,
    "normalized_score": 0.5516,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.694733+00:00",
    "updated_at": "2025-07-19T19:56:14.694733+00:00",
    "benchmark_name": "Hallusion Bench"
  },
  {
    "model_benchmark_id": 829,
    "benchmark_id": "lvbench",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.473,
    "normalized_score": 0.473,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.731476+00:00",
    "updated_at": "2025-07-19T19:56:12.731476+00:00",
    "benchmark_name": "LVBench"
  },
  {
    "model_benchmark_id": 1677,
    "benchmark_id": "mathvision",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.381,
    "normalized_score": 0.381,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.705119+00:00",
    "updated_at": "2025-07-19T19:56:14.705119+00:00",
    "benchmark_name": "MathVision"
  },
  {
    "model_benchmark_id": 1271,
    "benchmark_id": "mathvista-mini",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.748,
    "normalized_score": 0.748,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.666379+00:00",
    "updated_at": "2025-07-19T19:56:13.666379+00:00",
    "benchmark_name": "MathVista-Mini"
  },
  {
    "model_benchmark_id": 1746,
    "benchmark_id": "mlvu-m",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.746,
    "normalized_score": 0.746,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.934328+00:00",
    "updated_at": "2025-07-19T19:56:14.934328+00:00",
    "benchmark_name": "MLVU-M"
  },
  {
    "model_benchmark_id": 1512,
    "benchmark_id": "mmbench",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.88,
    "normalized_score": 0.88,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.243543+00:00",
    "updated_at": "2025-07-19T19:56:14.243543+00:00",
    "benchmark_name": "MMBench"
  },
  {
    "model_benchmark_id": 1689,
    "benchmark_id": "mmbench-video",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.0202,
    "normalized_score": 0.0202,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-VL",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.744558+00:00",
    "updated_at": "2025-07-19T19:56:14.744558+00:00",
    "benchmark_name": "MMBench-Video"
  },
  {
    "model_benchmark_id": 572,
    "benchmark_id": "mmmu",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.702,
    "normalized_score": 0.702,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.177290+00:00",
    "updated_at": "2025-07-19T19:56:12.177290+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1535,
    "benchmark_id": "mmmu-pro",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.511,
    "normalized_score": 0.511,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.297757+00:00",
    "updated_at": "2025-07-19T19:56:14.297757+00:00",
    "benchmark_name": "MMMU-Pro"
  },
  {
    "model_benchmark_id": 1661,
    "benchmark_id": "mmstar",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.708,
    "normalized_score": 0.708,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.666719+00:00",
    "updated_at": "2025-07-19T19:56:14.666719+00:00",
    "benchmark_name": "MMStar"
  },
  {
    "model_benchmark_id": 1671,
    "benchmark_id": "mmvet",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.7619,
    "normalized_score": 0.7619,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.688513+00:00",
    "updated_at": "2025-07-19T19:56:14.688513+00:00",
    "benchmark_name": "MMVet"
  },
  {
    "model_benchmark_id": 1715,
    "benchmark_id": "mobileminiwob++-sr",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.68,
    "normalized_score": 0.68,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "SR",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.820961+00:00",
    "updated_at": "2025-07-19T19:56:14.820961+00:00",
    "benchmark_name": "MobileMiniWob++_SR"
  },
  {
    "model_benchmark_id": 1644,
    "benchmark_id": "mvbench",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.704,
    "normalized_score": 0.704,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.623550+00:00",
    "updated_at": "2025-07-19T19:56:14.623550+00:00",
    "benchmark_name": "MVBench"
  },
  {
    "model_benchmark_id": 1541,
    "benchmark_id": "ocrbench",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.885,
    "normalized_score": 0.885,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.318110+00:00",
    "updated_at": "2025-07-19T19:56:14.318110+00:00",
    "benchmark_name": "OCRBench"
  },
  {
    "model_benchmark_id": 1744,
    "benchmark_id": "ocrbench-v2-(en)",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.615,
    "normalized_score": 0.615,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.928710+00:00",
    "updated_at": "2025-07-19T19:56:14.928710+00:00",
    "benchmark_name": "OCRBench-V2 (en)"
  },
  {
    "model_benchmark_id": 1747,
    "benchmark_id": "osworld",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.0883,
    "normalized_score": 0.0883,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.937610+00:00",
    "updated_at": "2025-07-19T19:56:14.937610+00:00",
    "benchmark_name": "OSWorld"
  },
  {
    "model_benchmark_id": 1680,
    "benchmark_id": "perceptiontest",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.732,
    "normalized_score": 0.732,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.713944+00:00",
    "updated_at": "2025-07-19T19:56:14.713944+00:00",
    "benchmark_name": "PerceptionTest"
  },
  {
    "model_benchmark_id": 1697,
    "benchmark_id": "screenspot",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.871,
    "normalized_score": 0.871,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.773284+00:00",
    "updated_at": "2025-07-19T19:56:14.773284+00:00",
    "benchmark_name": "ScreenSpot"
  },
  {
    "model_benchmark_id": 1700,
    "benchmark_id": "screenspot-pro",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.436,
    "normalized_score": 0.436,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.780898+00:00",
    "updated_at": "2025-07-19T19:56:14.780898+00:00",
    "benchmark_name": "ScreenSpot Pro"
  },
  {
    "model_benchmark_id": 1692,
    "benchmark_id": "tempcompass",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.748,
    "normalized_score": 0.748,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.754032+00:00",
    "updated_at": "2025-07-19T19:56:14.754032+00:00",
    "benchmark_name": "TempCompass"
  },
  {
    "model_benchmark_id": 1682,
    "benchmark_id": "videomme-w-o-sub.",
    "model_id": "qwen2.5-vl-72b",
    "score": 0.733,
    "normalized_score": 0.733,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.720259+00:00",
    "updated_at": "2025-07-19T19:56:14.720259+00:00",
    "benchmark_name": "VideoMME w/o sub."
  }
]

================================================
FILE: data/organizations/qwen/models/qwen2.5-vl-72b/model.json
================================================
{
  "model_id": "qwen2.5-vl-72b",
  "name": "Qwen2.5 VL 72B Instruct",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": null,
  "description": "Qwen2.5-VL is the new flagship vision-language model of Qwen, significantly improved from Qwen2-VL. It excels at recognizing objects, analyzing text/charts/layouts in images, acting as a visual agent, understanding long videos (over 1 hour) with event pinpointing, performing visual localization (bounding boxes/points), and generating structured outputs from documents.",
  "release_date": "2025-01-26",
  "announcement_date": "2025-01-26",
  "license_id": "tongyi_qianwen",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 72000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://chat.qwen.ai/",
  "source_paper": "https://arxiv.org/pdf/2502.13923",
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5-vl/",
  "source_repo_link": "https://github.com/QwenLM/Qwen2.5-VL",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct",
  "created_at": "2025-07-19T19:49:05.647509+00:00",
  "updated_at": "2025-07-19T19:49:05.647509+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen2.5-vl-7b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1702,
    "benchmark_id": "aitz-em",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.819,
    "normalized_score": 0.819,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.787781+00:00",
    "updated_at": "2025-07-19T19:56:14.787781+00:00",
    "benchmark_name": "AITZ_EM"
  },
  {
    "model_benchmark_id": 1705,
    "benchmark_id": "android-control-high-em",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.601,
    "normalized_score": 0.601,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.794879+00:00",
    "updated_at": "2025-07-19T19:56:14.794879+00:00",
    "benchmark_name": "Android Control High_EM"
  },
  {
    "model_benchmark_id": 1708,
    "benchmark_id": "android-control-low-em",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.914,
    "normalized_score": 0.914,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-VL",
    "verified_by_llmstats": false,
    "analysis_method": "EM",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.803305+00:00",
    "updated_at": "2025-07-19T19:56:14.803305+00:00",
    "benchmark_name": "Android Control Low_EM"
  },
  {
    "model_benchmark_id": 1711,
    "benchmark_id": "androidworld-sr",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.255,
    "normalized_score": 0.255,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "SR",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.811782+00:00",
    "updated_at": "2025-07-19T19:56:14.811782+00:00",
    "benchmark_name": "AndroidWorld_SR"
  },
  {
    "model_benchmark_id": 1656,
    "benchmark_id": "cc-ocr",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.778,
    "normalized_score": 0.778,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.655251+00:00",
    "updated_at": "2025-07-19T19:56:14.655251+00:00",
    "benchmark_name": "CC-OCR"
  },
  {
    "model_benchmark_id": 1694,
    "benchmark_id": "charadessta",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.436,
    "normalized_score": 0.436,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "mIoU",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.763802+00:00",
    "updated_at": "2025-07-19T19:56:14.763802+00:00",
    "benchmark_name": "CharadesSTA"
  },
  {
    "model_benchmark_id": 865,
    "benchmark_id": "chartqa",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.873,
    "normalized_score": 0.873,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.808329+00:00",
    "updated_at": "2025-07-19T19:56:12.808329+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 886,
    "benchmark_id": "docvqa",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.957,
    "normalized_score": 0.957,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.844347+00:00",
    "updated_at": "2025-07-19T19:56:12.844347+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 1672,
    "benchmark_id": "hallusion-bench",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.529,
    "normalized_score": 0.529,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.693096+00:00",
    "updated_at": "2025-07-19T19:56:14.693096+00:00",
    "benchmark_name": "Hallusion Bench"
  },
  {
    "model_benchmark_id": 1242,
    "benchmark_id": "infovqa",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.826,
    "normalized_score": 0.826,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.610945+00:00",
    "updated_at": "2025-07-19T19:56:13.610945+00:00",
    "benchmark_name": "InfoVQA"
  },
  {
    "model_benchmark_id": 1687,
    "benchmark_id": "longvideobench",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.547,
    "normalized_score": 0.547,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.737450+00:00",
    "updated_at": "2025-07-19T19:56:14.737450+00:00",
    "benchmark_name": "LongVideoBench"
  },
  {
    "model_benchmark_id": 828,
    "benchmark_id": "lvbench",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.453,
    "normalized_score": 0.453,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.729778+00:00",
    "updated_at": "2025-07-19T19:56:12.729778+00:00",
    "benchmark_name": "LVBench"
  },
  {
    "model_benchmark_id": 1674,
    "benchmark_id": "mathvision",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.2507,
    "normalized_score": 0.2507,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.698748+00:00",
    "updated_at": "2025-07-19T19:56:14.698748+00:00",
    "benchmark_name": "MathVision"
  },
  {
    "model_benchmark_id": 1270,
    "benchmark_id": "mathvista-mini",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.682,
    "normalized_score": 0.682,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.664381+00:00",
    "updated_at": "2025-07-19T19:56:13.664381+00:00",
    "benchmark_name": "MathVista-Mini"
  },
  {
    "model_benchmark_id": 1693,
    "benchmark_id": "mlvu",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.702,
    "normalized_score": 0.702,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.758833+00:00",
    "updated_at": "2025-07-19T19:56:14.758833+00:00",
    "benchmark_name": "MLVU"
  },
  {
    "model_benchmark_id": 1511,
    "benchmark_id": "mmbench",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.843,
    "normalized_score": 0.843,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-VL",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.241869+00:00",
    "updated_at": "2025-07-19T19:56:14.241869+00:00",
    "benchmark_name": "MMBench"
  },
  {
    "model_benchmark_id": 1688,
    "benchmark_id": "mmbench-video",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.0179,
    "normalized_score": 0.0179,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-VL",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.742467+00:00",
    "updated_at": "2025-07-19T19:56:14.742467+00:00",
    "benchmark_name": "MMBench-Video"
  },
  {
    "model_benchmark_id": 569,
    "benchmark_id": "mmmu",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.586,
    "normalized_score": 0.586,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.170987+00:00",
    "updated_at": "2025-07-19T19:56:12.170987+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1533,
    "benchmark_id": "mmmu-pro",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.383,
    "normalized_score": 0.383,
    "is_self_reported": true,
    "self_reported_source_link": "https://github.com/QwenLM/Qwen2.5-VL",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.294582+00:00",
    "updated_at": "2025-07-19T19:56:14.294582+00:00",
    "benchmark_name": "MMMU-Pro"
  },
  {
    "model_benchmark_id": 1659,
    "benchmark_id": "mmstar",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.639,
    "normalized_score": 0.639,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.662888+00:00",
    "updated_at": "2025-07-19T19:56:14.662888+00:00",
    "benchmark_name": "MMStar"
  },
  {
    "model_benchmark_id": 1666,
    "benchmark_id": "mmt-bench",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.636,
    "normalized_score": 0.636,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.676869+00:00",
    "updated_at": "2025-07-19T19:56:14.676869+00:00",
    "benchmark_name": "MMT-Bench"
  },
  {
    "model_benchmark_id": 1670,
    "benchmark_id": "mmvet",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.671,
    "normalized_score": 0.671,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.687023+00:00",
    "updated_at": "2025-07-19T19:56:14.687023+00:00",
    "benchmark_name": "MMVet"
  },
  {
    "model_benchmark_id": 1714,
    "benchmark_id": "mobileminiwob++-sr",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.914,
    "normalized_score": 0.914,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "SR",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.819401+00:00",
    "updated_at": "2025-07-19T19:56:14.819401+00:00",
    "benchmark_name": "MobileMiniWob++_SR"
  },
  {
    "model_benchmark_id": 1642,
    "benchmark_id": "mvbench",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.696,
    "normalized_score": 0.696,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.620310+00:00",
    "updated_at": "2025-07-19T19:56:14.620310+00:00",
    "benchmark_name": "MVBench"
  },
  {
    "model_benchmark_id": 1540,
    "benchmark_id": "ocrbench",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.864,
    "normalized_score": 0.864,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.315649+00:00",
    "updated_at": "2025-07-19T19:56:14.315649+00:00",
    "benchmark_name": "OCRBench"
  },
  {
    "model_benchmark_id": 1679,
    "benchmark_id": "perceptiontest",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.705,
    "normalized_score": 0.705,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.712010+00:00",
    "updated_at": "2025-07-19T19:56:14.712010+00:00",
    "benchmark_name": "PerceptionTest"
  },
  {
    "model_benchmark_id": 1696,
    "benchmark_id": "screenspot",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.847,
    "normalized_score": 0.847,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.771516+00:00",
    "updated_at": "2025-07-19T19:56:14.771516+00:00",
    "benchmark_name": "ScreenSpot"
  },
  {
    "model_benchmark_id": 1699,
    "benchmark_id": "screenspot-pro",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.29,
    "normalized_score": 0.29,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.779312+00:00",
    "updated_at": "2025-07-19T19:56:14.779312+00:00",
    "benchmark_name": "ScreenSpot Pro"
  },
  {
    "model_benchmark_id": 1691,
    "benchmark_id": "tempcompass",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.717,
    "normalized_score": 0.717,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.752008+00:00",
    "updated_at": "2025-07-19T19:56:14.752008+00:00",
    "benchmark_name": "TempCompass"
  },
  {
    "model_benchmark_id": 910,
    "benchmark_id": "textvqa",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.849,
    "normalized_score": 0.849,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.896871+00:00",
    "updated_at": "2025-07-19T19:56:12.896871+00:00",
    "benchmark_name": "TextVQA"
  },
  {
    "model_benchmark_id": 1681,
    "benchmark_id": "videomme-w-o-sub.",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.651,
    "normalized_score": 0.651,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.718319+00:00",
    "updated_at": "2025-07-19T19:56:14.718319+00:00",
    "benchmark_name": "VideoMME w/o sub."
  },
  {
    "model_benchmark_id": 1684,
    "benchmark_id": "videomme-w-sub.",
    "model_id": "qwen2.5-vl-7b",
    "score": 0.716,
    "normalized_score": 0.716,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.726358+00:00",
    "updated_at": "2025-07-19T19:56:14.726358+00:00",
    "benchmark_name": "VideoMME w sub."
  }
]

================================================
FILE: data/organizations/qwen/models/qwen2.5-vl-7b/model.json
================================================
{
  "model_id": "qwen2.5-vl-7b",
  "name": "Qwen2.5 VL 7B Instruct",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": null,
  "description": "Qwen2.5-VL is a vision-language model from the Qwen family. Key enhancements include visual understanding (objects, text, charts, layouts), visual agent capabilities (tool use, computer/phone control), long video comprehension with event pinpointing, visual localization (bounding boxes/points), and structured output generation.",
  "release_date": "2025-01-26",
  "announcement_date": "2025-01-26",
  "license_id": "apache_2_0",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 8290000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://chat.qwen.ai/",
  "source_paper": "https://arxiv.org/pdf/2502.13923",
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen2.5-vl/",
  "source_repo_link": "https://github.com/QwenLM/Qwen2.5-VL",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
  "created_at": "2025-07-19T19:49:05.635630+00:00",
  "updated_at": "2025-07-19T19:49:05.635630+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen3-235b-a22b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1626,
    "benchmark_id": "aider",
    "model_id": "qwen3-235b-a22b",
    "score": 0.618,
    "normalized_score": 0.618,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@2",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.572970+00:00",
    "updated_at": "2025-07-19T19:56:14.572970+00:00",
    "benchmark_name": "Aider"
  },
  {
    "model_benchmark_id": 454,
    "benchmark_id": "aime-2024",
    "model_id": "qwen3-235b-a22b",
    "score": 0.857,
    "normalized_score": 0.857,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@64",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.963641+00:00",
    "updated_at": "2025-07-19T19:56:11.963641+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 690,
    "benchmark_id": "aime-2025",
    "model_id": "qwen3-235b-a22b",
    "score": 0.815,
    "normalized_score": 0.815,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@64",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.447678+00:00",
    "updated_at": "2025-07-19T19:56:12.447678+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1452,
    "benchmark_id": "arena-hard",
    "model_id": "qwen3-235b-a22b",
    "score": 0.956,
    "normalized_score": 0.956,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.095282+00:00",
    "updated_at": "2025-07-19T19:56:14.095282+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 972,
    "benchmark_id": "bbh",
    "model_id": "qwen3-235b-a22b",
    "score": 0.8887,
    "normalized_score": 0.8887,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.043683+00:00",
    "updated_at": "2025-07-19T19:56:13.043683+00:00",
    "benchmark_name": "BBH"
  },
  {
    "model_benchmark_id": 851,
    "benchmark_id": "bfcl",
    "model_id": "qwen3-235b-a22b",
    "score": 0.708,
    "normalized_score": 0.708,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "v3",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.780457+00:00",
    "updated_at": "2025-07-19T19:56:12.780457+00:00",
    "benchmark_name": "BFCL"
  },
  {
    "model_benchmark_id": 1648,
    "benchmark_id": "crux-o",
    "model_id": "qwen3-235b-a22b",
    "score": 0.79,
    "normalized_score": 0.79,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.637715+00:00",
    "updated_at": "2025-07-19T19:56:14.637715+00:00",
    "benchmark_name": "CRUX-O"
  },
  {
    "model_benchmark_id": 371,
    "benchmark_id": "evalplus",
    "model_id": "qwen3-235b-a22b",
    "score": 0.776,
    "normalized_score": 0.776,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.801301+00:00",
    "updated_at": "2025-07-19T19:56:11.801301+00:00",
    "benchmark_name": "EvalPlus"
  },
  {
    "model_benchmark_id": 302,
    "benchmark_id": "gpqa",
    "model_id": "qwen3-235b-a22b",
    "score": 0.4747,
    "normalized_score": 0.4747,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.679464+00:00",
    "updated_at": "2025-07-19T19:56:11.679464+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 995,
    "benchmark_id": "gsm8k",
    "model_id": "qwen3-235b-a22b",
    "score": 0.9439,
    "normalized_score": 0.9439,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.083824+00:00",
    "updated_at": "2025-07-19T19:56:13.083824+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 1308,
    "benchmark_id": "include",
    "model_id": "qwen3-235b-a22b",
    "score": 0.7346,
    "normalized_score": 0.7346,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.737543+00:00",
    "updated_at": "2025-07-19T19:56:13.737543+00:00",
    "benchmark_name": "Include"
  },
  {
    "model_benchmark_id": 749,
    "benchmark_id": "livebench",
    "model_id": "qwen3-235b-a22b",
    "score": 0.771,
    "normalized_score": 0.771,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.575629+00:00",
    "updated_at": "2025-07-19T19:56:12.575629+00:00",
    "benchmark_name": "LiveBench"
  },
  {
    "model_benchmark_id": 1123,
    "benchmark_id": "livecodebench",
    "model_id": "qwen3-235b-a22b",
    "score": 0.707,
    "normalized_score": 0.707,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "v5",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.344206+00:00",
    "updated_at": "2025-07-19T19:56:13.344206+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 405,
    "benchmark_id": "math",
    "model_id": "qwen3-235b-a22b",
    "score": 0.7184,
    "normalized_score": 0.7184,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.863985+00:00",
    "updated_at": "2025-07-19T19:56:11.863985+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 1186,
    "benchmark_id": "mbpp",
    "model_id": "qwen3-235b-a22b",
    "score": 0.814,
    "normalized_score": 0.814,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.500617+00:00",
    "updated_at": "2025-07-19T19:56:13.500617+00:00",
    "benchmark_name": "MBPP"
  },
  {
    "model_benchmark_id": 1289,
    "benchmark_id": "mgsm",
    "model_id": "qwen3-235b-a22b",
    "score": 0.8353,
    "normalized_score": 0.8353,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.700097+00:00",
    "updated_at": "2025-07-19T19:56:13.700097+00:00",
    "benchmark_name": "MGSM"
  },
  {
    "model_benchmark_id": 90,
    "benchmark_id": "mmlu",
    "model_id": "qwen3-235b-a22b",
    "score": 0.8781,
    "normalized_score": 0.8781,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.270963+00:00",
    "updated_at": "2025-07-19T19:56:11.270963+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 195,
    "benchmark_id": "mmlu-pro",
    "model_id": "qwen3-235b-a22b",
    "score": 0.6818,
    "normalized_score": 0.6818,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.472627+00:00",
    "updated_at": "2025-07-19T19:56:11.472627+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 732,
    "benchmark_id": "mmlu-redux",
    "model_id": "qwen3-235b-a22b",
    "score": 0.874,
    "normalized_score": 0.874,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.540685+00:00",
    "updated_at": "2025-07-19T19:56:12.540685+00:00",
    "benchmark_name": "MMLU-Redux"
  },
  {
    "model_benchmark_id": 1477,
    "benchmark_id": "mmmlu",
    "model_id": "qwen3-235b-a22b",
    "score": 0.867,
    "normalized_score": 0.867,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.150792+00:00",
    "updated_at": "2025-07-19T19:56:14.150792+00:00",
    "benchmark_name": "MMMLU"
  },
  {
    "model_benchmark_id": 1647,
    "benchmark_id": "multilf",
    "model_id": "qwen3-235b-a22b",
    "score": 0.719,
    "normalized_score": 0.719,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.633963+00:00",
    "updated_at": "2025-07-19T19:56:14.633963+00:00",
    "benchmark_name": "MultiLF"
  },
  {
    "model_benchmark_id": 643,
    "benchmark_id": "multipl-e",
    "model_id": "qwen3-235b-a22b",
    "score": 0.6594,
    "normalized_score": 0.6594,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.320821+00:00",
    "updated_at": "2025-07-19T19:56:12.320821+00:00",
    "benchmark_name": "MultiPL-E"
  },
  {
    "model_benchmark_id": 366,
    "benchmark_id": "supergpqa",
    "model_id": "qwen3-235b-a22b",
    "score": 0.4406,
    "normalized_score": 0.4406,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.784624+00:00",
    "updated_at": "2025-07-19T19:56:11.784624+00:00",
    "benchmark_name": "SuperGPQA"
  }
]

================================================
FILE: data/organizations/qwen/models/qwen3-235b-a22b/model.json
================================================
{
  "model_id": "qwen3-235b-a22b",
  "name": "Qwen3 235B A22B",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": null,
  "description": "Qwen3 235B A22B is a large language model developed by Alibaba, featuring a Mixture-of-Experts (MoE) architecture with 235 billion total parameters and 22 billion activated parameters. It achieves competitive results in benchmark evaluations of coding, math, general capabilities, and more, compared to other top-tier models.",
  "release_date": "2025-04-29",
  "announcement_date": "2025-04-29",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 235000000000,
  "training_tokens": 36000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://qwenlm.github.io/blog/qwen3/",
  "source_playground": "https://chat.qwen.ai/",
  "source_paper": null,
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/QwenLM/Qwen3",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B",
  "created_at": "2025-07-19T19:49:05.624683+00:00",
  "updated_at": "2025-07-19T19:49:05.624683+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen3-235b-a22b-instruct-2507/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 15972,
    "benchmark_id": "aider-polyglot",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.573,
    "normalized_score": 0.573,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.609026+00:00",
    "updated_at": "2025-08-03T22:06:13.609026+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 15973,
    "benchmark_id": "aime-2025",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.703,
    "normalized_score": 0.703,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.611021+00:00",
    "updated_at": "2025-08-03T22:06:13.611021+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 15974,
    "benchmark_id": "arc-agi",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.418,
    "normalized_score": 0.418,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.618116+00:00",
    "updated_at": "2025-08-03T22:06:13.618116+00:00",
    "benchmark_name": "ARC-AGI"
  },
  {
    "model_benchmark_id": 15975,
    "benchmark_id": "arena-hard-v2",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.792,
    "normalized_score": 0.792,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Win Rate",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.620187+00:00",
    "updated_at": "2025-08-03T22:06:13.620187+00:00",
    "benchmark_name": "Arena-Hard v2"
  },
  {
    "model_benchmark_id": 15976,
    "benchmark_id": "bfcl-v3",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.709,
    "normalized_score": 0.709,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.622144+00:00",
    "updated_at": "2025-08-03T22:06:13.622144+00:00",
    "benchmark_name": "BFCL-v3"
  },
  {
    "model_benchmark_id": 15977,
    "benchmark_id": "creative-writing-v3",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.875,
    "normalized_score": 0.875,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.626065+00:00",
    "updated_at": "2025-08-03T22:06:13.626065+00:00",
    "benchmark_name": "Creative Writing v3"
  },
  {
    "model_benchmark_id": 15978,
    "benchmark_id": "csimpleqa",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.843,
    "normalized_score": 0.843,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.629696+00:00",
    "updated_at": "2025-08-03T22:06:13.629696+00:00",
    "benchmark_name": "CSimpleQA"
  },
  {
    "model_benchmark_id": 15979,
    "benchmark_id": "gpqa",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.775,
    "normalized_score": 0.775,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.631769+00:00",
    "updated_at": "2025-08-03T22:06:13.631769+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 15980,
    "benchmark_id": "hmmt25",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.554,
    "normalized_score": 0.554,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.633387+00:00",
    "updated_at": "2025-08-03T22:06:13.633387+00:00",
    "benchmark_name": "HMMT25"
  },
  {
    "model_benchmark_id": 15981,
    "benchmark_id": "ifeval",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.887,
    "normalized_score": 0.887,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.635001+00:00",
    "updated_at": "2025-08-03T22:06:13.635001+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 15982,
    "benchmark_id": "include",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.795,
    "normalized_score": 0.795,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.636605+00:00",
    "updated_at": "2025-08-03T22:06:13.636605+00:00",
    "benchmark_name": "INCLUDE"
  },
  {
    "model_benchmark_id": 15983,
    "benchmark_id": "livebench-20241125",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.754,
    "normalized_score": 0.754,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.638166+00:00",
    "updated_at": "2025-08-03T22:06:13.638166+00:00",
    "benchmark_name": "LiveBench 20241125"
  },
  {
    "model_benchmark_id": 15984,
    "benchmark_id": "livecodebench-v6",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.518,
    "normalized_score": 0.518,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.639661+00:00",
    "updated_at": "2025-08-03T22:06:13.639661+00:00",
    "benchmark_name": "LiveCodeBench v6"
  },
  {
    "model_benchmark_id": 15985,
    "benchmark_id": "mmlu-pro",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.83,
    "normalized_score": 0.83,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.641236+00:00",
    "updated_at": "2025-08-03T22:06:13.641236+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 15986,
    "benchmark_id": "mmlu-prox",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.794,
    "normalized_score": 0.794,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.642908+00:00",
    "updated_at": "2025-08-03T22:06:13.642908+00:00",
    "benchmark_name": "MMLU-ProX"
  },
  {
    "model_benchmark_id": 15987,
    "benchmark_id": "mmlu-redux",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.931,
    "normalized_score": 0.931,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.644630+00:00",
    "updated_at": "2025-08-03T22:06:13.644630+00:00",
    "benchmark_name": "MMLU-Redux"
  },
  {
    "model_benchmark_id": 15988,
    "benchmark_id": "multi-if",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.775,
    "normalized_score": 0.775,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.646355+00:00",
    "updated_at": "2025-08-03T22:06:13.646355+00:00",
    "benchmark_name": "Multi-IF"
  },
  {
    "model_benchmark_id": 15989,
    "benchmark_id": "multipl-e",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.879,
    "normalized_score": 0.879,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Score",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.648211+00:00",
    "updated_at": "2025-08-03T22:06:13.648211+00:00",
    "benchmark_name": "MultiPL-E"
  },
  {
    "model_benchmark_id": 15990,
    "benchmark_id": "polymath",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.502,
    "normalized_score": 0.502,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.649756+00:00",
    "updated_at": "2025-08-03T22:06:13.649756+00:00",
    "benchmark_name": "PolyMATH"
  },
  {
    "model_benchmark_id": 15991,
    "benchmark_id": "simpleqa",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.543,
    "normalized_score": 0.543,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.651445+00:00",
    "updated_at": "2025-08-03T22:06:13.651445+00:00",
    "benchmark_name": "SimpleQA"
  },
  {
    "model_benchmark_id": 15992,
    "benchmark_id": "supergpqa",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.626,
    "normalized_score": 0.626,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.652980+00:00",
    "updated_at": "2025-08-03T22:06:13.652980+00:00",
    "benchmark_name": "SuperGPQA"
  },
  {
    "model_benchmark_id": 15993,
    "benchmark_id": "tau2-airline",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.44,
    "normalized_score": 0.44,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.654737+00:00",
    "updated_at": "2025-08-03T22:06:13.654737+00:00",
    "benchmark_name": "Tau2 airline"
  },
  {
    "model_benchmark_id": 15994,
    "benchmark_id": "tau2-retail",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.713,
    "normalized_score": 0.713,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.656359+00:00",
    "updated_at": "2025-08-03T22:06:13.656359+00:00",
    "benchmark_name": "Tau2 retail"
  },
  {
    "model_benchmark_id": 15995,
    "benchmark_id": "writingbench",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.852,
    "normalized_score": 0.852,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.657968+00:00",
    "updated_at": "2025-08-03T22:06:13.657968+00:00",
    "benchmark_name": "WritingBench"
  },
  {
    "model_benchmark_id": 15996,
    "benchmark_id": "zebralogic",
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "score": 0.95,
    "normalized_score": 0.95,
    "is_self_reported": true,
    "self_reported_source_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-08-03T22:06:13.659618+00:00",
    "updated_at": "2025-08-03T22:06:13.659618+00:00",
    "benchmark_name": "ZebraLogic"
  }
]


================================================
FILE: data/organizations/qwen/models/qwen3-235b-a22b-instruct-2507/model.json
================================================
{
  "model_id": "qwen3-235b-a22b-instruct-2507",
  "name": "Qwen3-235B-A22B-Instruct-2507",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": null,
  "description": "Qwen3-235B-A22B-Instruct-2507 is the updated instruct version of Qwen3-235B-A22B featuring significant improvements in general capabilities including instruction following, logical reasoning, text comprehension, mathematics, science, coding and tool usage. It provides substantial gains in long-tail knowledge coverage across multiple languages and markedly better alignment with user preferences in subjective and open-ended tasks.",
  "release_date": "2025-07-22",
  "announcement_date": "2025-07-22",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 235000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://qwenlm.github.io/blog/qwen3/",
  "source_playground": "https://chat.qwen.ai/",
  "source_paper": "https://arxiv.org/abs/2505.09388",
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen3/",
  "source_repo_link": "https://github.com/QwenLM/Qwen3",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507",
  "created_at": "2025-08-03T22:06:11.701778+00:00",
  "updated_at": "2025-08-03T22:06:11.701778+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen3-235b-a22b-thinking-2507/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 9101,
    "benchmark_id": "mmlu-pro",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.844,
    "normalized_score": 0.844,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 9102,
    "benchmark_id": "mmlu-redux",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.938,
    "normalized_score": 0.938,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-Redux"
  },
  {
    "model_benchmark_id": 9103,
    "benchmark_id": "gpqa",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.811,
    "normalized_score": 0.811,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 9104,
    "benchmark_id": "supergpqa",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.649,
    "normalized_score": 0.649,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "SuperGPQA"
  },
  {
    "model_benchmark_id": 9105,
    "benchmark_id": "aime-2025",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.923,
    "normalized_score": 0.923,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 9106,
    "benchmark_id": "hmmt25",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.839,
    "normalized_score": 0.839,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "HMMT25"
  },
  {
    "model_benchmark_id": 9107,
    "benchmark_id": "livebench-20241125",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.784,
    "normalized_score": 0.784,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "LiveBench 20241125"
  },
  {
    "model_benchmark_id": 9108,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.182,
    "normalized_score": 0.182,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": "text-only subset",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Score refers to text-only subset as model is not multi-modal",
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "HLE"
  },
  {
    "model_benchmark_id": 9109,
    "benchmark_id": "livecodebench-v6",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.741,
    "normalized_score": 0.741,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": "25.02-25.05",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "LiveCodeBench v6"
  },
  {
    "model_benchmark_id": 9110,
    "benchmark_id": "cfeval",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 2134,
    "normalized_score": 0.2134,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Raw score: 2134",
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "CFEval"
  },
  {
    "model_benchmark_id": 9111,
    "benchmark_id": "ojbench",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.325,
    "normalized_score": 0.325,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "OJBench"
  },
  {
    "model_benchmark_id": 9112,
    "benchmark_id": "ifeval",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.878,
    "normalized_score": 0.878,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 9113,
    "benchmark_id": "arena-hard-v2",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.797,
    "normalized_score": 0.797,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4 evaluated win rates",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Arena-Hard v2"
  },
  {
    "model_benchmark_id": 9114,
    "benchmark_id": "creative-writing-v3",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.861,
    "normalized_score": 0.861,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Creative Writing v3"
  },
  {
    "model_benchmark_id": 9115,
    "benchmark_id": "writingbench",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.883,
    "normalized_score": 0.883,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "WritingBench"
  },
  {
    "model_benchmark_id": 9116,
    "benchmark_id": "bfcl-v3",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.719,
    "normalized_score": 0.719,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "BFCL-v3"
  },
  {
    "model_benchmark_id": 9117,
    "benchmark_id": "tau-bench-retail",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.678,
    "normalized_score": 0.678,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "TAU1-Retail",
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "TAU1-Retail"
  },
  {
    "model_benchmark_id": 9118,
    "benchmark_id": "tau-bench-airline",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.46,
    "normalized_score": 0.46,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "TAU1-Airline",
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "TAU1-Airline"
  },
  {
    "model_benchmark_id": 9119,
    "benchmark_id": "tau2-retail",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.719,
    "normalized_score": 0.719,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "TAU2-Retail"
  },
  {
    "model_benchmark_id": 9120,
    "benchmark_id": "tau2-airline",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.58,
    "normalized_score": 0.58,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "TAU2-Airline"
  },
  {
    "model_benchmark_id": 9121,
    "benchmark_id": "tau2-telecom",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.456,
    "normalized_score": 0.456,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "TAU2-Telecom"
  },
  {
    "model_benchmark_id": 9122,
    "benchmark_id": "multi-if",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.806,
    "normalized_score": 0.806,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "MultiIF",
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MultiIF"
  },
  {
    "model_benchmark_id": 9123,
    "benchmark_id": "mmlu-prox",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.81,
    "normalized_score": 0.81,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "MMLU-ProX",
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-ProX"
  },
  {
    "model_benchmark_id": 9124,
    "benchmark_id": "include",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.81,
    "normalized_score": 0.81,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "INCLUDE"
  },
  {
    "model_benchmark_id": 9125,
    "benchmark_id": "polymath",
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "score": 0.601,
    "normalized_score": 0.601,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-25T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "PolyMATH"
  }
]


================================================
FILE: data/organizations/qwen/models/qwen3-235b-a22b-thinking-2507/model.json
================================================
{
  "model_id": "qwen3-235b-a22b-thinking-2507",
  "name": "Qwen3-235B-A22B-Thinking-2507",
  "organization_id": "qwen",
  "model_family_id": null,
  "fine_tuned_from_model_id": "qwen3-235b-a22b",
  "description": "Qwen3-235B-A22B-Thinking-2507 is a state-of-the-art thinking-enabled Mixture-of-Experts (MoE) model with 235B total parameters (22B activated). It features 94 layers, 128 experts (8 activated), and supports 262K native context length. This version delivers significantly improved reasoning performance, achieving state-of-the-art results among open-source thinking models on logical reasoning, mathematics, science, coding, and academic benchmarks. Key enhancements include markedly better general capabilities (instruction following, tool usage, text generation), enhanced 256K long-context understanding, and increased thinking depth. The model supports only thinking mode with automatic <think> tag inclusion.",
  "release_date": "2025-07-25",
  "announcement_date": "2025-07-25",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 235000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507",
  "source_playground": "https://chat.qwen.ai/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen3-thinking/",
  "source_repo_link": "https://github.com/QwenLM/Qwen3",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507",
  "created_at": "2025-07-25T00:00:00.000000+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00"
}


================================================
FILE: data/organizations/qwen/models/qwen3-30b-a3b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 455,
    "benchmark_id": "aime-2024",
    "model_id": "qwen3-30b-a3b",
    "score": 0.804,
    "normalized_score": 0.804,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.965575+00:00",
    "updated_at": "2025-07-19T19:56:11.965575+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 691,
    "benchmark_id": "aime-2025",
    "model_id": "qwen3-30b-a3b",
    "score": 0.709,
    "normalized_score": 0.709,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.449947+00:00",
    "updated_at": "2025-07-19T19:56:12.449947+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1454,
    "benchmark_id": "arena-hard",
    "model_id": "qwen3-30b-a3b",
    "score": 0.91,
    "normalized_score": 0.91,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.098594+00:00",
    "updated_at": "2025-07-19T19:56:14.098594+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 852,
    "benchmark_id": "bfcl",
    "model_id": "qwen3-30b-a3b",
    "score": 0.691,
    "normalized_score": 0.691,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "v3",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.782049+00:00",
    "updated_at": "2025-07-19T19:56:12.782049+00:00",
    "benchmark_name": "BFCL"
  },
  {
    "model_benchmark_id": 304,
    "benchmark_id": "gpqa",
    "model_id": "qwen3-30b-a3b",
    "score": 0.658,
    "normalized_score": 0.658,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.682771+00:00",
    "updated_at": "2025-07-19T19:56:11.682771+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 751,
    "benchmark_id": "livebench",
    "model_id": "qwen3-30b-a3b",
    "score": 0.743,
    "normalized_score": 0.743,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.579527+00:00",
    "updated_at": "2025-07-19T19:56:12.579527+00:00",
    "benchmark_name": "LiveBench"
  },
  {
    "model_benchmark_id": 1125,
    "benchmark_id": "livecodebench",
    "model_id": "qwen3-30b-a3b",
    "score": 0.626,
    "normalized_score": 0.626,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "v5",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.349221+00:00",
    "updated_at": "2025-07-19T19:56:13.349221+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 1649,
    "benchmark_id": "multi-if",
    "model_id": "qwen3-30b-a3b",
    "score": 0.722,
    "normalized_score": 0.722,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.641584+00:00",
    "updated_at": "2025-07-19T19:56:14.641584+00:00",
    "benchmark_name": "Multi-IF"
  }
]

================================================
FILE: data/organizations/qwen/models/qwen3-30b-a3b/model.json
================================================
{
  "model_id": "qwen3-30b-a3b",
  "name": "Qwen3 30B A3B",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": null,
  "description": "Qwen3-30B-A3B is a smaller Mixture-of-Experts (MoE) model from the Qwen3 series by Alibaba, with 30.5 billion total parameters and 3.3 billion activated parameters. Features hybrid thinking/non-thinking modes, support for 119 languages, and enhanced agent capabilities. It aims to outperform previous models like QwQ-32B while using significantly fewer activated parameters.",
  "release_date": "2025-04-29",
  "announcement_date": "2025-04-29",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 30500000000,
  "training_tokens": 36000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://qwenlm.github.io/blog/qwen3/",
  "source_playground": "https://chat.qwen.ai/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen3/",
  "source_repo_link": "https://github.com/QwenLM/Qwen3",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen3-30B-A3B",
  "created_at": "2025-07-19T19:49:05.631206+00:00",
  "updated_at": "2025-07-19T19:49:05.631206+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen3-32b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1625,
    "benchmark_id": "aider",
    "model_id": "qwen3-32b",
    "score": 0.502,
    "normalized_score": 0.502,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@2",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.571165+00:00",
    "updated_at": "2025-07-19T19:56:14.571165+00:00",
    "benchmark_name": "Aider"
  },
  {
    "model_benchmark_id": 453,
    "benchmark_id": "aime-2024",
    "model_id": "qwen3-32b",
    "score": 0.814,
    "normalized_score": 0.814,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@64",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.961658+00:00",
    "updated_at": "2025-07-19T19:56:11.961658+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 689,
    "benchmark_id": "aime-2025",
    "model_id": "qwen3-32b",
    "score": 0.729,
    "normalized_score": 0.729,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@64",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.446075+00:00",
    "updated_at": "2025-07-19T19:56:12.446075+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1451,
    "benchmark_id": "arena-hard",
    "model_id": "qwen3-32b",
    "score": 0.938,
    "normalized_score": 0.938,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.093495+00:00",
    "updated_at": "2025-07-19T19:56:14.093495+00:00",
    "benchmark_name": "Arena Hard"
  },
  {
    "model_benchmark_id": 850,
    "benchmark_id": "bfcl",
    "model_id": "qwen3-32b",
    "score": 0.703,
    "normalized_score": 0.703,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "v3",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.778924+00:00",
    "updated_at": "2025-07-19T19:56:12.778924+00:00",
    "benchmark_name": "BFCL"
  },
  {
    "model_benchmark_id": 1645,
    "benchmark_id": "codeforces",
    "model_id": "qwen3-32b",
    "score": 0.659,
    "normalized_score": 0.659,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Elo Rating",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.627279+00:00",
    "updated_at": "2025-07-19T19:56:14.627279+00:00",
    "benchmark_name": "CodeForces"
  },
  {
    "model_benchmark_id": 748,
    "benchmark_id": "livebench",
    "model_id": "qwen3-32b",
    "score": 0.749,
    "normalized_score": 0.749,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.573432+00:00",
    "updated_at": "2025-07-19T19:56:12.573432+00:00",
    "benchmark_name": "LiveBench"
  },
  {
    "model_benchmark_id": 1122,
    "benchmark_id": "livecodebench",
    "model_id": "qwen3-32b",
    "score": 0.657,
    "normalized_score": 0.657,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "v5",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.342304+00:00",
    "updated_at": "2025-07-19T19:56:13.342304+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 1646,
    "benchmark_id": "multilf",
    "model_id": "qwen3-32b",
    "score": 0.73,
    "normalized_score": 0.73,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3/",
    "verified_by_llmstats": false,
    "analysis_method": "Accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.630716+00:00",
    "updated_at": "2025-07-19T19:56:14.630716+00:00",
    "benchmark_name": "MultiLF"
  }
]

================================================
FILE: data/organizations/qwen/models/qwen3-32b/model.json
================================================
{
  "model_id": "qwen3-32b",
  "name": "Qwen3 32B",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": null,
  "description": "Qwen3-32B is a large language model from Alibaba's Qwen3 series. It features 32.8 billion parameters, a 128k token context window, support for 119 languages, and hybrid thinking modes allowing switching between deep reasoning and fast responses. It demonstrates strong performance in reasoning, instruction-following, and agent capabilities.",
  "release_date": "2025-04-29",
  "announcement_date": "2025-04-29",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 32800000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://chat.qwen.ai/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen3/",
  "source_repo_link": "https://github.com/QwenLM/Qwen3",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen3-32B",
  "created_at": "2025-07-19T19:49:05.621845+00:00",
  "updated_at": "2025-07-19T19:49:05.621845+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwen3-next-80b-a3b-base/benchmarks.json
================================================
[]


================================================
FILE: data/organizations/qwen/models/qwen3-next-80b-a3b-base/model.json
================================================
{
  "model_id": "qwen3-next-80b-a3b-base",
  "name": "Qwen3-Next-80B-A3B-Base",
  "organization_id": "qwen",
  "model_family_id": null,
  "fine_tuned_from_model_id": null,
  "description": "Qwen3-Next-80B-A3B-Base is the foundation model in the Qwen3-Next series, featuring revolutionary architectural innovations for ultimate training and inference efficiency. It introduces Hybrid Attention combining Gated DeltaNet (75% layers) and Gated Attention (25% layers) for efficient ultra-long context modeling, Ultra-Sparse MoE with 512 total experts but only 10 routed + 1 shared expert activated (3.7% activation ratio), and native Multi-Token Prediction for faster inference. With 80B total parameters and only ~3B activated per inference step, it achieves performance comparable to Qwen3-32B while using less than 10% training cost and delivering 10x+ throughput for 32K+ contexts. Trained on 15T tokens with training-stability-friendly designs including Zero-Centered RMSNorm and normalized MoE router parameters. Supports 256K context length, extensible to 1M tokens with YaRN scaling.",
  "release_date": "2025-09-10",
  "announcement_date": "2025-09-10",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 80000000000,
  "training_tokens": 15000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Base",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen3-next/",
  "source_repo_link": "https://github.com/QwenLM/Qwen3",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Base",
  "created_at": "2025-09-10T00:00:00.000000+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00"
}


================================================
FILE: data/organizations/qwen/models/qwen3-next-80b-a3b-instruct/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 9301,
    "benchmark_id": "mmlu-pro",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.806,
    "normalized_score": 0.806,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 9302,
    "benchmark_id": "mmlu-redux",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.909,
    "normalized_score": 0.909,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-Redux"
  },
  {
    "model_benchmark_id": 9303,
    "benchmark_id": "gpqa",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.729,
    "normalized_score": 0.729,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 9304,
    "benchmark_id": "supergpqa",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.588,
    "normalized_score": 0.588,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "SuperGPQA"
  },
  {
    "model_benchmark_id": 9305,
    "benchmark_id": "aime-2025",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.695,
    "normalized_score": 0.695,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 9306,
    "benchmark_id": "hmmt25",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.541,
    "normalized_score": 0.541,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "HMMT25"
  },
  {
    "model_benchmark_id": 9307,
    "benchmark_id": "livebench-20241125",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.758,
    "normalized_score": 0.758,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "LiveBench 20241125"
  },
  {
    "model_benchmark_id": 9308,
    "benchmark_id": "livecodebench-v6",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.566,
    "normalized_score": 0.566,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": "25.02-25.05",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "LiveCodeBench v6"
  },
  {
    "model_benchmark_id": 9309,
    "benchmark_id": "multipl-e",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.878,
    "normalized_score": 0.878,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MultiPL-E"
  },
  {
    "model_benchmark_id": 9310,
    "benchmark_id": "aider-polyglot",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.498,
    "normalized_score": 0.498,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Aider-Polyglot"
  },
  {
    "model_benchmark_id": 9311,
    "benchmark_id": "ifeval",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.876,
    "normalized_score": 0.876,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 9312,
    "benchmark_id": "arena-hard-v2",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.827,
    "normalized_score": 0.827,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4.1 evaluated win rates",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Arena-Hard v2"
  },
  {
    "model_benchmark_id": 9313,
    "benchmark_id": "creative-writing-v3",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.853,
    "normalized_score": 0.853,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Creative Writing v3"
  },
  {
    "model_benchmark_id": 9314,
    "benchmark_id": "writingbench",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.873,
    "normalized_score": 0.873,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "WritingBench"
  },
  {
    "model_benchmark_id": 9315,
    "benchmark_id": "bfcl-v3",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.703,
    "normalized_score": 0.703,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "BFCL-v3"
  },
  {
    "model_benchmark_id": 9316,
    "benchmark_id": "tau-bench-retail",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.609,
    "normalized_score": 0.609,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "TAU1-Retail",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "TAU1-Retail"
  },
  {
    "model_benchmark_id": 9317,
    "benchmark_id": "tau-bench-airline",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.44,
    "normalized_score": 0.44,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "TAU1-Airline",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "TAU1-Airline"
  },
  {
    "model_benchmark_id": 9318,
    "benchmark_id": "tau2-retail",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.573,
    "normalized_score": 0.573,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "TAU2-Retail"
  },
  {
    "model_benchmark_id": 9319,
    "benchmark_id": "tau2-airline",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.455,
    "normalized_score": 0.455,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "TAU2-Airline"
  },
  {
    "model_benchmark_id": 9320,
    "benchmark_id": "tau2-telecom",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.132,
    "normalized_score": 0.132,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "TAU2-Telecom"
  },
  {
    "model_benchmark_id": 9321,
    "benchmark_id": "multi-if",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.758,
    "normalized_score": 0.758,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "MultiIF",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MultiIF"
  },
  {
    "model_benchmark_id": 9322,
    "benchmark_id": "mmlu-prox",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.767,
    "normalized_score": 0.767,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "MMLU-ProX",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-ProX"
  },
  {
    "model_benchmark_id": 9323,
    "benchmark_id": "include",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.789,
    "normalized_score": 0.789,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "INCLUDE"
  },
  {
    "model_benchmark_id": 9324,
    "benchmark_id": "polymath",
    "model_id": "qwen3-next-80b-a3b-instruct",
    "score": 0.459,
    "normalized_score": 0.459,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "PolyMATH"
  }
]


================================================
FILE: data/organizations/qwen/models/qwen3-next-80b-a3b-instruct/model.json
================================================
{
  "model_id": "qwen3-next-80b-a3b-instruct",
  "name": "Qwen3-Next-80B-A3B-Instruct",
  "organization_id": "qwen",
  "model_family_id": null,
  "fine_tuned_from_model_id": null,
  "description": "Qwen3-Next-80B-A3B-Instruct is the first in the Qwen3-Next series, featuring groundbreaking architectural innovations. It uses Hybrid Attention combining Gated DeltaNet and Gated Attention for efficient ultra-long context modeling, High-Sparsity MoE with 512 experts (10 activated + 1 shared) achieving an extremely low activation ratio, and Multi-Token Prediction for improved performance and faster inference. With 80B total parameters and only 3B activated, it outperforms Qwen3-32B-Base with 10% of the training cost and 10x the throughput for 32K+ contexts. The model performs on par with Qwen3-235B-A22B-Instruct-2507 while excelling at ultra-long-context tasks up to 256K tokens (extensible to 1M with YaRN). Architecture: 48 layers, 15T training tokens, hybrid layout of 12*(3*(Gated DeltaNet->MoE)->(Gated Attention->MoE)).",
  "release_date": "2025-09-10",
  "announcement_date": "2025-09-10",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 80000000000,
  "training_tokens": 15000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct",
  "source_playground": "https://chat.qwen.ai/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen3-next/",
  "source_repo_link": "https://github.com/QwenLM/Qwen3",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct",
  "created_at": "2025-09-10T00:00:00.000000+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00"
}


================================================
FILE: data/organizations/qwen/models/qwen3-next-80b-a3b-thinking/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 9201,
    "benchmark_id": "mmlu-pro",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.827,
    "normalized_score": 0.827,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 9202,
    "benchmark_id": "mmlu-redux",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.925,
    "normalized_score": 0.925,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-Redux"
  },
  {
    "model_benchmark_id": 9203,
    "benchmark_id": "gpqa",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.772,
    "normalized_score": 0.772,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 9204,
    "benchmark_id": "supergpqa",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.608,
    "normalized_score": 0.608,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "SuperGPQA"
  },
  {
    "model_benchmark_id": 9205,
    "benchmark_id": "aime-2025",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.878,
    "normalized_score": 0.878,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 9206,
    "benchmark_id": "hmmt25",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.739,
    "normalized_score": 0.739,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "HMMT25"
  },
  {
    "model_benchmark_id": 9207,
    "benchmark_id": "livebench-20241125",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.766,
    "normalized_score": 0.766,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "LiveBench 20241125"
  },
  {
    "model_benchmark_id": 9208,
    "benchmark_id": "livecodebench-v6",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.687,
    "normalized_score": 0.687,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": "25.02-25.05",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "LiveCodeBench v6"
  },
  {
    "model_benchmark_id": 9209,
    "benchmark_id": "cfeval",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 2071,
    "normalized_score": 0.2071,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "Raw score: 2071",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "CFEval"
  },
  {
    "model_benchmark_id": 9210,
    "benchmark_id": "ojbench",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.297,
    "normalized_score": 0.297,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "OJBench"
  },
  {
    "model_benchmark_id": 9211,
    "benchmark_id": "ifeval",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.889,
    "normalized_score": 0.889,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 9212,
    "benchmark_id": "arena-hard-v2",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.623,
    "normalized_score": 0.623,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": "GPT-4.1 evaluated win rates",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "Arena-Hard v2"
  },
  {
    "model_benchmark_id": 9213,
    "benchmark_id": "writingbench",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.846,
    "normalized_score": 0.846,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "WritingBench"
  },
  {
    "model_benchmark_id": 9214,
    "benchmark_id": "bfcl-v3",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.72,
    "normalized_score": 0.72,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "BFCL-v3"
  },
  {
    "model_benchmark_id": 9215,
    "benchmark_id": "tau-bench-retail",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.696,
    "normalized_score": 0.696,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "TAU1-Retail",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "TAU1-Retail"
  },
  {
    "model_benchmark_id": 9216,
    "benchmark_id": "tau-bench-airline",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.49,
    "normalized_score": 0.49,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "TAU1-Airline",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "TAU1-Airline"
  },
  {
    "model_benchmark_id": 9217,
    "benchmark_id": "tau2-retail",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.678,
    "normalized_score": 0.678,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "TAU2-Retail"
  },
  {
    "model_benchmark_id": 9218,
    "benchmark_id": "tau2-airline",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.605,
    "normalized_score": 0.605,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "TAU2-Airline"
  },
  {
    "model_benchmark_id": 9219,
    "benchmark_id": "tau2-telecom",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.439,
    "normalized_score": 0.439,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "TAU2-Telecom"
  },
  {
    "model_benchmark_id": 9220,
    "benchmark_id": "multi-if",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.778,
    "normalized_score": 0.778,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "MultiIF",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MultiIF"
  },
  {
    "model_benchmark_id": 9221,
    "benchmark_id": "mmlu-prox",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.787,
    "normalized_score": 0.787,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": "MMLU-ProX",
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-ProX"
  },
  {
    "model_benchmark_id": 9222,
    "benchmark_id": "include",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.789,
    "normalized_score": 0.789,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "INCLUDE"
  },
  {
    "model_benchmark_id": 9223,
    "benchmark_id": "polymath",
    "model_id": "qwen3-next-80b-a3b-thinking",
    "score": 0.563,
    "normalized_score": 0.563,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwen3-next/",
    "verified_by_llmstats": false,
    "analysis_method": null,
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-01-10T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "benchmark_name": "PolyMATH"
  }
]


================================================
FILE: data/organizations/qwen/models/qwen3-next-80b-a3b-thinking/model.json
================================================
{
  "model_id": "qwen3-next-80b-a3b-thinking",
  "name": "Qwen3-Next-80B-A3B-Thinking",
  "organization_id": "qwen",
  "model_family_id": null,
  "fine_tuned_from_model_id": null,
  "description": "Qwen3-Next-80B-A3B-Thinking is the thinking variant of the Qwen3-Next series, featuring the same groundbreaking architecture as the instruct model. Leveraging GSPO, it addresses stability and efficiency challenges of hybrid attention + high-sparsity MoE in RL training. It uses Hybrid Attention combining Gated DeltaNet and Gated Attention for efficient ultra-long context modeling, High-Sparsity MoE with 512 experts (10 activated + 1 shared), and Multi-Token Prediction. With 80B total parameters and only 3B activated, it demonstrates outstanding performance on complex reasoning tasks — outperforming Qwen3-30B-A3B-Thinking-2507, Qwen3-32B-Thinking, and even the proprietary Gemini-2.5-Flash-Thinking across multiple benchmarks. Architecture: 48 layers, 15T training tokens, hybrid layout of 12*(3*(Gated DeltaNet->MoE)->(Gated Attention->MoE)). Supports only thinking mode with automatic <think> tag inclusion and may generate longer thinking content.",
  "release_date": "2025-09-10",
  "announcement_date": "2025-09-10",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 80000000000,
  "training_tokens": 15000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking",
  "source_playground": "https://chat.qwen.ai/",
  "source_paper": null,
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwen3-next/",
  "source_repo_link": "https://github.com/QwenLM/Qwen3",
  "source_weights_link": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking",
  "created_at": "2025-09-10T00:00:00.000000+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00"
}


================================================
FILE: data/organizations/qwen/models/qwq-32b/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 451,
    "benchmark_id": "aime-2024",
    "model_id": "qwq-32b",
    "score": 0.795,
    "normalized_score": 0.795,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.957773+00:00",
    "updated_at": "2025-07-19T19:56:11.957773+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 849,
    "benchmark_id": "bfcl",
    "model_id": "qwq-32b",
    "score": 0.664,
    "normalized_score": 0.664,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.777209+00:00",
    "updated_at": "2025-07-19T19:56:12.777209+00:00",
    "benchmark_name": "BFCL"
  },
  {
    "model_benchmark_id": 298,
    "benchmark_id": "gpqa",
    "model_id": "qwq-32b",
    "score": 0.652,
    "normalized_score": 0.652,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwen-ai.com/qwq-32b/",
    "verified_by_llmstats": false,
    "analysis_method": "Pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.672880+00:00",
    "updated_at": "2025-07-19T19:56:11.672880+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 619,
    "benchmark_id": "ifeval",
    "model_id": "qwq-32b",
    "score": 0.839,
    "normalized_score": 0.839,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.275723+00:00",
    "updated_at": "2025-07-19T19:56:12.275723+00:00",
    "benchmark_name": "IFEval"
  },
  {
    "model_benchmark_id": 747,
    "benchmark_id": "livebench",
    "model_id": "qwq-32b",
    "score": 0.731,
    "normalized_score": 0.731,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.570952+00:00",
    "updated_at": "2025-07-19T19:56:12.570952+00:00",
    "benchmark_name": "LiveBench"
  },
  {
    "model_benchmark_id": 1118,
    "benchmark_id": "livecodebench",
    "model_id": "qwq-32b",
    "score": 0.634,
    "normalized_score": 0.634,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.332752+00:00",
    "updated_at": "2025-07-19T19:56:13.332752+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 495,
    "benchmark_id": "math-500",
    "model_id": "qwq-32b",
    "score": 0.906,
    "normalized_score": 0.906,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwen-ai.com/qwq-32b/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.034467+00:00",
    "updated_at": "2025-07-19T19:56:12.034467+00:00",
    "benchmark_name": "MATH-500"
  }
]

================================================
FILE: data/organizations/qwen/models/qwq-32b/model.json
================================================
{
  "model_id": "qwq-32b",
  "name": "QwQ-32B",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": null,
  "description": "A model focused on advancing AI reasoning capabilities, particularly excelling in mathematics and programming. Features deep introspection and self-questioning abilities while having some limitations in language mixing and recursive/endless reasoning patterns.",
  "release_date": "2025-03-05",
  "announcement_date": "2025-03-05",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": "2024-11-28",
  "param_count": 32500000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/Qwen/QwQ-32B",
  "source_playground": "https://huggingface.co/playground?modelId=Qwen/QwQ-32B",
  "source_paper": "https://arxiv.org/abs/2412.15115",
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwq-32b/",
  "source_repo_link": "https://github.com/QwenLM/QwQ",
  "source_weights_link": "https://huggingface.co/Qwen/QwQ-32B",
  "created_at": "2025-07-19T19:49:05.609393+00:00",
  "updated_at": "2025-07-19T19:49:05.609393+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/models/qwq-32b-preview/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 452,
    "benchmark_id": "aime-2024",
    "model_id": "qwq-32b-preview",
    "score": 0.5,
    "normalized_score": 0.5,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.959852+00:00",
    "updated_at": "2025-07-19T19:56:11.959852+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 300,
    "benchmark_id": "gpqa",
    "model_id": "qwq-32b-preview",
    "score": 0.652,
    "normalized_score": 0.652,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.675997+00:00",
    "updated_at": "2025-07-19T19:56:11.675997+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1120,
    "benchmark_id": "livecodebench",
    "model_id": "qwq-32b-preview",
    "score": 0.5,
    "normalized_score": 0.5,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.337401+00:00",
    "updated_at": "2025-07-19T19:56:13.337401+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 496,
    "benchmark_id": "math-500",
    "model_id": "qwq-32b-preview",
    "score": 0.906,
    "normalized_score": 0.906,
    "is_self_reported": true,
    "self_reported_source_link": "https://qwenlm.github.io/blog/qwq-32b-preview/",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.036449+00:00",
    "updated_at": "2025-07-19T19:56:12.036449+00:00",
    "benchmark_name": "MATH-500"
  }
]

================================================
FILE: data/organizations/qwen/models/qwq-32b-preview/model.json
================================================
{
  "model_id": "qwq-32b-preview",
  "name": "QwQ-32B-Preview",
  "organization_id": "qwen",
  "fine_tuned_from_model_id": "qwen-2.5-32b-instruct",
  "description": "An experimental research model focused on advancing AI reasoning capabilities, particularly excelling in mathematics and programming. Features deep introspection and self-questioning abilities while having some limitations in language mixing and recursive reasoning patterns.",
  "release_date": "2024-11-28",
  "announcement_date": "2024-11-28",
  "license_id": "apache_2_0",
  "multimodal": false,
  "knowledge_cutoff": "2024-11-28",
  "param_count": 32500000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://huggingface.co/Qwen/QwQ-32B-Preview",
  "source_playground": "https://huggingface.co/spaces/Qwen/QwQ-32B-Preview",
  "source_paper": "https://arxiv.org/abs/2407.10671",
  "source_scorecard_blog_link": "https://qwenlm.github.io/blog/qwq-32b-preview/",
  "source_repo_link": "https://github.com/QwenLM/Qwen2",
  "source_weights_link": "https://huggingface.co/Qwen/QwQ-32B-Preview",
  "created_at": "2025-07-19T19:49:05.887027+00:00",
  "updated_at": "2025-07-19T19:49:05.887027+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/qwen/organization.json
================================================
{
  "organization_id": "qwen",
  "name": "Alibaba Cloud / Qwen Team",
  "website": "https://qwenlm.github.io",
  "description": "The Qwen Team from Alibaba Cloud, developing the Qwen series of large language models including state-of-the-art mixture-of-experts and thinking-enabled models",
  "country": "CN",
  "created_at": "2025-07-19T19:49:05.604449+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00"
}


================================================
FILE: data/organizations/unknown/organization.json
================================================
{
  "organization_id": "unknown",
  "name": "Unknown",
  "website": "",
  "description": "Default organization for missing data",
  "country": null,
  "created_at": "2025-08-03T22:06:10.791768+00:00",
  "updated_at": "2025-08-03T22:06:10.791768+00:00"
}

================================================
FILE: data/organizations/xai/models/grok-1.5/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 894,
    "benchmark_id": "docvqa",
    "model_id": "grok-1.5",
    "score": 0.856,
    "normalized_score": 0.856,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.861804+00:00",
    "updated_at": "2025-07-19T19:56:12.861804+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 322,
    "benchmark_id": "gpqa",
    "model_id": "grok-1.5",
    "score": 0.359,
    "normalized_score": 0.359,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.711788+00:00",
    "updated_at": "2025-07-19T19:56:11.711788+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1001,
    "benchmark_id": "gsm8k",
    "model_id": "grok-1.5",
    "score": 0.9,
    "normalized_score": 0.9,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-1.5",
    "verified_by_llmstats": false,
    "analysis_method": "8-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.092882+00:00",
    "updated_at": "2025-07-19T19:56:13.092882+00:00",
    "benchmark_name": "GSM8k"
  },
  {
    "model_benchmark_id": 794,
    "benchmark_id": "humaneval",
    "model_id": "grok-1.5",
    "score": 0.741,
    "normalized_score": 0.741,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-1.5",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.660557+00:00",
    "updated_at": "2025-07-19T19:56:12.660557+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 413,
    "benchmark_id": "math",
    "model_id": "grok-1.5",
    "score": 0.506,
    "normalized_score": 0.506,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-1.5",
    "verified_by_llmstats": false,
    "analysis_method": "4-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.878054+00:00",
    "updated_at": "2025-07-19T19:56:11.878054+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 532,
    "benchmark_id": "mathvista",
    "model_id": "grok-1.5",
    "score": 0.528,
    "normalized_score": 0.528,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.103226+00:00",
    "updated_at": "2025-07-19T19:56:12.103226+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 97,
    "benchmark_id": "mmlu",
    "model_id": "grok-1.5",
    "score": 0.813,
    "normalized_score": 0.813,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-1.5",
    "verified_by_llmstats": false,
    "analysis_method": "5-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.283997+00:00",
    "updated_at": "2025-07-19T19:56:11.283997+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 206,
    "benchmark_id": "mmlu-pro",
    "model_id": "grok-1.5",
    "score": 0.51,
    "normalized_score": 0.51,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.492470+00:00",
    "updated_at": "2025-07-19T19:56:11.492470+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 578,
    "benchmark_id": "mmmu",
    "model_id": "grok-1.5",
    "score": 0.536,
    "normalized_score": 0.536,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "0-shot",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.189264+00:00",
    "updated_at": "2025-07-19T19:56:12.189264+00:00",
    "benchmark_name": "MMMU"
  }
]

================================================
FILE: data/organizations/xai/models/grok-1.5/model.json
================================================
{
  "model_id": "grok-1.5",
  "name": "Grok-1.5",
  "organization_id": "xai",
  "fine_tuned_from_model_id": null,
  "description": "An advanced language model with improved reasoning capabilities, particularly excelling in coding and mathematical tasks. Features a 128K token context window and enhanced problem-solving abilities compared to its predecessor.",
  "release_date": "2024-03-28",
  "announcement_date": "2024-03-28",
  "license_id": "proprietary",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://x.ai/api",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://x.ai/blog/grok-1.5",
  "source_repo_link": "https://github.com/xai-org/grok-1",
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.705047+00:00",
  "updated_at": "2025-07-19T19:49:05.705047+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/xai/models/grok-1.5v/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 1259,
    "benchmark_id": "ai2d",
    "model_id": "grok-1.5v",
    "score": 0.883,
    "normalized_score": 0.883,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-1.5v",
    "verified_by_llmstats": false,
    "analysis_method": "zero-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.641849+00:00",
    "updated_at": "2025-07-19T19:56:13.641849+00:00",
    "benchmark_name": "AI2D"
  },
  {
    "model_benchmark_id": 871,
    "benchmark_id": "chartqa",
    "model_id": "grok-1.5v",
    "score": 0.761,
    "normalized_score": 0.761,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-1.5v",
    "verified_by_llmstats": false,
    "analysis_method": "zero-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.817786+00:00",
    "updated_at": "2025-07-19T19:56:12.817786+00:00",
    "benchmark_name": "ChartQA"
  },
  {
    "model_benchmark_id": 896,
    "benchmark_id": "docvqa",
    "model_id": "grok-1.5v",
    "score": 0.856,
    "normalized_score": 0.856,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-1.5v",
    "verified_by_llmstats": false,
    "analysis_method": "zero-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.865566+00:00",
    "updated_at": "2025-07-19T19:56:12.865566+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 534,
    "benchmark_id": "mathvista",
    "model_id": "grok-1.5v",
    "score": 0.528,
    "normalized_score": 0.528,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-1.5v",
    "verified_by_llmstats": false,
    "analysis_method": "zero-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.106344+00:00",
    "updated_at": "2025-07-19T19:56:12.106344+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 581,
    "benchmark_id": "mmmu",
    "model_id": "grok-1.5v",
    "score": 0.536,
    "normalized_score": 0.536,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-1.5v",
    "verified_by_llmstats": false,
    "analysis_method": "zero-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.195047+00:00",
    "updated_at": "2025-07-19T19:56:12.195047+00:00",
    "benchmark_name": "MMMU"
  },
  {
    "model_benchmark_id": 1638,
    "benchmark_id": "realworldqa",
    "model_id": "grok-1.5v",
    "score": 0.687,
    "normalized_score": 0.687,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-1.5v",
    "verified_by_llmstats": false,
    "analysis_method": "zero-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:14.606610+00:00",
    "updated_at": "2025-07-19T19:56:14.606610+00:00",
    "benchmark_name": "RealWorldQA"
  },
  {
    "model_benchmark_id": 915,
    "benchmark_id": "textvqa",
    "model_id": "grok-1.5v",
    "score": 0.781,
    "normalized_score": 0.781,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-1.5v",
    "verified_by_llmstats": false,
    "analysis_method": "zero-shot evaluation",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.908800+00:00",
    "updated_at": "2025-07-19T19:56:12.908800+00:00",
    "benchmark_name": "TextVQA"
  }
]

================================================
FILE: data/organizations/xai/models/grok-1.5v/model.json
================================================
{
  "model_id": "grok-1.5v",
  "name": "Grok-1.5V",
  "organization_id": "xai",
  "fine_tuned_from_model_id": null,
  "description": "A multimodal model capable of processing text and visual information, including documents, diagrams, charts, screenshots, and photographs. Notable for strong real-world spatial understanding capabilities.",
  "release_date": "2024-04-12",
  "announcement_date": "2024-04-12",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://x.ai/api",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://x.ai/blog/grok-1.5v",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.717803+00:00",
  "updated_at": "2025-07-19T19:49:05.717803+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/xai/models/grok-2/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 895,
    "benchmark_id": "docvqa",
    "model_id": "grok-2",
    "score": 0.936,
    "normalized_score": 0.936,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.863462+00:00",
    "updated_at": "2025-07-19T19:56:12.863462+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 325,
    "benchmark_id": "gpqa",
    "model_id": "grok-2",
    "score": 0.56,
    "normalized_score": 0.56,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.716230+00:00",
    "updated_at": "2025-07-19T19:56:11.716230+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 795,
    "benchmark_id": "humaneval",
    "model_id": "grok-2",
    "score": 0.884,
    "normalized_score": 0.884,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.662404+00:00",
    "updated_at": "2025-07-19T19:56:12.662404+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 414,
    "benchmark_id": "math",
    "model_id": "grok-2",
    "score": 0.761,
    "normalized_score": 0.761,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "maj@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.880368+00:00",
    "updated_at": "2025-07-19T19:56:11.880368+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 533,
    "benchmark_id": "mathvista",
    "model_id": "grok-2",
    "score": 0.69,
    "normalized_score": 0.69,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.104885+00:00",
    "updated_at": "2025-07-19T19:56:12.104885+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 98,
    "benchmark_id": "mmlu",
    "model_id": "grok-2",
    "score": 0.875,
    "normalized_score": 0.875,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.285517+00:00",
    "updated_at": "2025-07-19T19:56:11.285517+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 207,
    "benchmark_id": "mmlu-pro",
    "model_id": "grok-2",
    "score": 0.755,
    "normalized_score": 0.755,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.494333+00:00",
    "updated_at": "2025-07-19T19:56:11.494333+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 580,
    "benchmark_id": "mmmu",
    "model_id": "grok-2",
    "score": 0.661,
    "normalized_score": 0.661,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.193698+00:00",
    "updated_at": "2025-07-19T19:56:12.193698+00:00",
    "benchmark_name": "MMMU"
  }
]

================================================
FILE: data/organizations/xai/models/grok-2/model.json
================================================
{
  "model_id": "grok-2",
  "name": "Grok-2",
  "organization_id": "xai",
  "fine_tuned_from_model_id": null,
  "description": "Grok-2 is a frontier language model with state-of-the-art reasoning capabilities, featuring advanced abilities in chat, coding, and reasoning. It demonstrates superior performance in visual math reasoning and document-based question answering, and it excels across various academic benchmarks including reasoning, reading comprehension, math, and science.",
  "release_date": "2024-08-13",
  "announcement_date": "2024-08-13",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://x.ai/api",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://x.ai/blog/grok-2",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.715016+00:00",
  "updated_at": "2025-07-19T19:49:05.715016+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/xai/models/grok-2-mini/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 893,
    "benchmark_id": "docvqa",
    "model_id": "grok-2-mini",
    "score": 0.932,
    "normalized_score": 0.932,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.860093+00:00",
    "updated_at": "2025-07-19T19:56:12.860093+00:00",
    "benchmark_name": "DocVQA"
  },
  {
    "model_benchmark_id": 321,
    "benchmark_id": "gpqa",
    "model_id": "grok-2-mini",
    "score": 0.51,
    "normalized_score": 0.51,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.710285+00:00",
    "updated_at": "2025-07-19T19:56:11.710285+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 793,
    "benchmark_id": "humaneval",
    "model_id": "grok-2-mini",
    "score": 0.857,
    "normalized_score": 0.857,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "pass@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.658802+00:00",
    "updated_at": "2025-07-19T19:56:12.658802+00:00",
    "benchmark_name": "HumanEval"
  },
  {
    "model_benchmark_id": 412,
    "benchmark_id": "math",
    "model_id": "grok-2-mini",
    "score": 0.73,
    "normalized_score": 0.73,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "maj@1",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.876593+00:00",
    "updated_at": "2025-07-19T19:56:11.876593+00:00",
    "benchmark_name": "MATH"
  },
  {
    "model_benchmark_id": 531,
    "benchmark_id": "mathvista",
    "model_id": "grok-2-mini",
    "score": 0.681,
    "normalized_score": 0.681,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.101817+00:00",
    "updated_at": "2025-07-19T19:56:12.101817+00:00",
    "benchmark_name": "MathVista"
  },
  {
    "model_benchmark_id": 96,
    "benchmark_id": "mmlu",
    "model_id": "grok-2-mini",
    "score": 0.862,
    "normalized_score": 0.862,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.281643+00:00",
    "updated_at": "2025-07-19T19:56:11.281643+00:00",
    "benchmark_name": "MMLU"
  },
  {
    "model_benchmark_id": 205,
    "benchmark_id": "mmlu-pro",
    "model_id": "grok-2-mini",
    "score": 0.72,
    "normalized_score": 0.72,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.490630+00:00",
    "updated_at": "2025-07-19T19:56:11.490630+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 577,
    "benchmark_id": "mmmu",
    "model_id": "grok-2-mini",
    "score": 0.632,
    "normalized_score": 0.632,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-2",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.186961+00:00",
    "updated_at": "2025-07-19T19:56:12.186961+00:00",
    "benchmark_name": "MMMU"
  }
]

================================================
FILE: data/organizations/xai/models/grok-2-mini/model.json
================================================
{
  "model_id": "grok-2-mini",
  "name": "Grok-2 mini",
  "organization_id": "xai",
  "fine_tuned_from_model_id": null,
  "description": "Grok-2 mini is a smaller, faster variant of Grok-2 that offers a balance between speed and answer quality. While more compact than its larger sibling, it maintains strong capabilities across various tasks including reasoning, coding, and chat interactions.",
  "release_date": "2024-08-13",
  "announcement_date": "2024-08-13",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://x.ai/api",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": "https://x.ai/blog/grok-2",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.702680+00:00",
  "updated_at": "2025-07-19T19:49:05.702680+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/xai/models/grok-3/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 475,
    "benchmark_id": "aime-2024",
    "model_id": "grok-3",
    "score": 0.933,
    "normalized_score": 0.933,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-3",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.003392+00:00",
    "updated_at": "2025-07-19T19:56:12.003392+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 696,
    "benchmark_id": "aime-2025",
    "model_id": "grok-3",
    "score": 0.933,
    "normalized_score": 0.933,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-3",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.457788+00:00",
    "updated_at": "2025-07-19T19:56:12.457788+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 324,
    "benchmark_id": "gpqa",
    "model_id": "grok-3",
    "score": 0.846,
    "normalized_score": 0.846,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-3",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.714708+00:00",
    "updated_at": "2025-07-19T19:56:11.714708+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1142,
    "benchmark_id": "livecodebench",
    "model_id": "grok-3",
    "score": 0.794,
    "normalized_score": 0.794,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-3",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.402422+00:00",
    "updated_at": "2025-07-19T19:56:13.402422+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 579,
    "benchmark_id": "mmmu",
    "model_id": "grok-3",
    "score": 0.78,
    "normalized_score": 0.78,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-3",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.191844+00:00",
    "updated_at": "2025-07-19T19:56:12.191844+00:00",
    "benchmark_name": "MMMU"
  }
]

================================================
FILE: data/organizations/xai/models/grok-3/model.json
================================================
{
  "model_id": "grok-3",
  "name": "Grok-3",
  "organization_id": "xai",
  "fine_tuned_from_model_id": null,
  "description": "Grok 3, launched by xAI on February 17, 2025, is an advanced AI model with significantly enhanced capabilities compared to Grok 2, boasting an order of magnitude increase in performance. Trained on a vast dataset that includes legal documents among others, and utilizing a massive compute infrastructure with around 200,000 GPUs in a Memphis data center, Grok 3's training used ten times more compute than its predecessor. It features specialized models like Grok 3 Reasoning and Grok 3 Mini Reasoning for complex problem-solving, and it excels in benchmarks like AIME for mathematics and GPQA for PhD-level science.",
  "release_date": "2025-02-17",
  "announcement_date": "2025-02-17",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-11-17",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://x.ai/api",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": null,
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.711845+00:00",
  "updated_at": "2025-07-19T19:49:05.711845+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/xai/models/grok-3-mini/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 474,
    "benchmark_id": "aime-2024",
    "model_id": "grok-3-mini",
    "score": 0.958,
    "normalized_score": 0.958,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-3",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.001587+00:00",
    "updated_at": "2025-07-19T19:56:12.001587+00:00",
    "benchmark_name": "AIME 2024"
  },
  {
    "model_benchmark_id": 693,
    "benchmark_id": "aime-2025",
    "model_id": "grok-3-mini",
    "score": 0.908,
    "normalized_score": 0.908,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-3",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.452930+00:00",
    "updated_at": "2025-07-19T19:56:12.452930+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 319,
    "benchmark_id": "gpqa",
    "model_id": "grok-3-mini",
    "score": 0.84,
    "normalized_score": 0.84,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-3",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.707259+00:00",
    "updated_at": "2025-07-19T19:56:11.707259+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1139,
    "benchmark_id": "livecodebench",
    "model_id": "grok-3-mini",
    "score": 0.804,
    "normalized_score": 0.804,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-3",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.394024+00:00",
    "updated_at": "2025-07-19T19:56:13.394024+00:00",
    "benchmark_name": "LiveCodeBench"
  }
]

================================================
FILE: data/organizations/xai/models/grok-3-mini/model.json
================================================
{
  "model_id": "grok-3-mini",
  "name": "Grok-3 Mini",
  "organization_id": "xai",
  "fine_tuned_from_model_id": null,
  "description": "Grok 3 Mini is a streamlined version of xAI's Grok 3 AI model, designed for quicker response times while maintaining utility. It's tailored for users who require speed over the comprehensive capabilities of the full Grok 3 model, making it suitable for tasks where rapid information retrieval is key. Grok 3 Mini still leverages the advanced training and data that Grok 3 was built on but offers a lighter, more efficient version for everyday use.",
  "release_date": "2025-02-17",
  "announcement_date": "2025-02-17",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-11-17",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://x.ai/api",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": null,
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.697297+00:00",
  "updated_at": "2025-07-19T19:49:05.697297+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/xai/models/grok-4/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 695,
    "benchmark_id": "aime-2025",
    "model_id": "grok-4",
    "score": 0.917,
    "normalized_score": 0.917,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.com/xai/status/1943158495588815072",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.456102+00:00",
    "updated_at": "2025-07-19T19:56:12.456102+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 1387,
    "benchmark_id": "arc-agi-v2",
    "model_id": "grok-4",
    "score": 0.159,
    "normalized_score": 0.159,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.com/xai/status/1943158495588815072",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.922021+00:00",
    "updated_at": "2025-07-19T19:56:13.922021+00:00",
    "benchmark_name": "ARC-AGI v2"
  },
  {
    "model_benchmark_id": 323,
    "benchmark_id": "gpqa",
    "model_id": "grok-4",
    "score": 0.875,
    "normalized_score": 0.875,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.com/xai/status/1943158495588815072",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.713248+00:00",
    "updated_at": "2025-07-19T19:56:11.713248+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1799,
    "benchmark_id": "hmmt25",
    "model_id": "grok-4",
    "score": 0.9,
    "normalized_score": 0.9,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.com/xai/status/1943158495588815072",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.065811+00:00",
    "updated_at": "2025-07-19T19:56:15.065811+00:00",
    "benchmark_name": "HMMT25"
  },
  {
    "model_benchmark_id": 723,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "grok-4",
    "score": 0.4,
    "normalized_score": 0.4,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.com/xai/status/1943158495588815072",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.523105+00:00",
    "updated_at": "2025-07-19T19:56:12.523105+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 1141,
    "benchmark_id": "livecodebench",
    "model_id": "grok-4",
    "score": 0.79,
    "normalized_score": 0.79,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.com/xai/status/1943158495588815072",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.399716+00:00",
    "updated_at": "2025-07-19T19:56:13.399716+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 1801,
    "benchmark_id": "usamo25",
    "model_id": "grok-4",
    "score": 0.375,
    "normalized_score": 0.375,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.com/xai/status/1943158495588815072",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.071894+00:00",
    "updated_at": "2025-07-19T19:56:15.071894+00:00",
    "benchmark_name": "USAMO25"
  }
]


================================================
FILE: data/organizations/xai/models/grok-4/model.json
================================================
{
  "model_id": "grok-4",
  "name": "Grok-4",
  "organization_id": "xai",
  "fine_tuned_from_model_id": null,
  "description": "Grok 4, announced by xAI in summer 2025, represents a major leap in AI capabilities, described as 'the smartest AI in the world.' Built on version 6 of xAI's foundation model, it uses 100x more training compute than Grok 2 and 10x more reinforcement learning compute than Grok 3. The model achieves PhD-level performance across all academic disciplines simultaneously, scoring perfect on standardized tests like the SAT and near-perfect on graduate exams like the GRE. Unlike Grok 3, tool usage is built into the training process rather than relying on generalization. Trained using 200,000 GPUs, Grok 4 excels at complex reasoning, mathematical problem-solving, and coding tasks, though it has acknowledged weaknesses in multimodal capabilities that are being addressed in the next version.",
  "release_date": "2025-07-09",
  "announcement_date": "2025-07-09",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-12-31",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://x.ai/api",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": null,
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.707962+00:00",
  "updated_at": "2025-07-19T19:49:05.707962+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/xai/models/grok-4-fast/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 22228,
    "benchmark_id": "gpqa",
    "model_id": "grok-4-fast",
    "score": 0.857,
    "normalized_score": 0.857,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/news/grok-4-fast",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-11T00:00:00.000000+00:00",
    "updated_at": "2025-10-11T00:00:00.000000+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 22229,
    "benchmark_id": "aime-2025",
    "model_id": "grok-4-fast",
    "score": 0.920,
    "normalized_score": 0.920,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/news/grok-4-fast",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-11T00:00:00.000000+00:00",
    "updated_at": "2025-10-11T00:00:00.000000+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 22230,
    "benchmark_id": "hmmt-2025",
    "model_id": "grok-4-fast",
    "score": 0.933,
    "normalized_score": 0.933,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/news/grok-4-fast",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-11T00:00:00.000000+00:00",
    "updated_at": "2025-10-11T00:00:00.000000+00:00",
    "benchmark_name": "HMMT 2025"
  },
  {
    "model_benchmark_id": 22231,
    "benchmark_id": "hle",
    "model_id": "grok-4-fast",
    "score": 0.200,
    "normalized_score": 0.200,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/news/grok-4-fast",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-11T00:00:00.000000+00:00",
    "updated_at": "2025-10-11T00:00:00.000000+00:00",
    "benchmark_name": "HLE"
  },
  {
    "model_benchmark_id": 22232,
    "benchmark_id": "livecodebench",
    "model_id": "grok-4-fast",
    "score": 0.800,
    "normalized_score": 0.800,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/news/grok-4-fast",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-11T00:00:00.000000+00:00",
    "updated_at": "2025-10-11T00:00:00.000000+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 22233,
    "benchmark_id": "browsecomp",
    "model_id": "grok-4-fast",
    "score": 0.449,
    "normalized_score": 0.449,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/news/grok-4-fast",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-11T00:00:00.000000+00:00",
    "updated_at": "2025-10-11T00:00:00.000000+00:00",
    "benchmark_name": "BrowseComp"
  },
  {
    "model_benchmark_id": 22234,
    "benchmark_id": "simpleqa",
    "model_id": "grok-4-fast",
    "score": 0.950,
    "normalized_score": 0.950,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/news/grok-4-fast",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-11T00:00:00.000000+00:00",
    "updated_at": "2025-10-11T00:00:00.000000+00:00",
    "benchmark_name": "SimpleQA"
  }
]


================================================
FILE: data/organizations/xai/models/grok-4-fast/model.json
================================================
{
  "model_id": "grok-4-fast",
  "name": "Grok 4 Fast",
  "organization_id": "xai",
  "fine_tuned_from_model_id": null,
  "description": "Grok 4 Fast is a high-speed variant of Grok-4, optimized for faster inference while maintaining strong reasoning capabilities. It offers improved throughput and lower latency compared to the standard Grok-4 model.",
  "release_date": "2025-08-28",
  "announcement_date": "2025-08-28",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://x.ai/api",
  "source_playground": null,
  "source_paper": "https://data.x.ai/2025-08-26-grok-code-fast-1-model-card.pdf",
  "source_scorecard_blog_link": null,
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-10-11T00:00:00.000000+00:00",
  "updated_at": "2025-10-11T00:00:00.000000+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/xai/models/grok-4-heavy/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 694,
    "benchmark_id": "aime-2025",
    "model_id": "grok-4-heavy",
    "score": 1.0,
    "normalized_score": 1.0,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.com/xai/status/1943158495588815072",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.454500+00:00",
    "updated_at": "2025-07-19T19:56:12.454500+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 320,
    "benchmark_id": "gpqa",
    "model_id": "grok-4-heavy",
    "score": 0.884,
    "normalized_score": 0.884,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.com/xai/status/1943158495588815072",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:11.708827+00:00",
    "updated_at": "2025-07-19T19:56:11.708827+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 1798,
    "benchmark_id": "hmmt25",
    "model_id": "grok-4-heavy",
    "score": 0.967,
    "normalized_score": 0.967,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.com/xai/status/1943158495588815072",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.063588+00:00",
    "updated_at": "2025-07-19T19:56:15.063588+00:00",
    "benchmark_name": "HMMT25"
  },
  {
    "model_benchmark_id": 722,
    "benchmark_id": "humanity's-last-exam",
    "model_id": "grok-4-heavy",
    "score": 0.507,
    "normalized_score": 0.507,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.com/xai/status/1943158495588815072",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:12.521361+00:00",
    "updated_at": "2025-07-19T19:56:12.521361+00:00",
    "benchmark_name": "Humanity's Last Exam"
  },
  {
    "model_benchmark_id": 1140,
    "benchmark_id": "livecodebench",
    "model_id": "grok-4-heavy",
    "score": 0.794,
    "normalized_score": 0.794,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.com/xai/status/1943158495588815072",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:13.396669+00:00",
    "updated_at": "2025-07-19T19:56:13.396669+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 1800,
    "benchmark_id": "usamo25",
    "model_id": "grok-4-heavy",
    "score": 0.619,
    "normalized_score": 0.619,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.com/xai/status/1943158495588815072",
    "verified_by_llmstats": false,
    "analysis_method": "accuracy",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-19T19:56:15.070427+00:00",
    "updated_at": "2025-07-19T19:56:15.070427+00:00",
    "benchmark_name": "USAMO25"
  }
]


================================================
FILE: data/organizations/xai/models/grok-4-heavy/model.json
================================================
{
  "model_id": "grok-4-heavy",
  "name": "Grok-4 Heavy",
  "organization_id": "xai",
  "fine_tuned_from_model_id": null,
  "description": "Grok 4 Heavy is the multi-agent version of Grok 4, released alongside the standard model in summer 2025. This system spawns multiple Grok 4 agents in parallel that work independently on problems and then collaborate by comparing their solutions, similar to a study group. The agents share insights and tricks they discover, with the system intelligently combining their work rather than simply using majority voting. Grok 4 Heavy uses approximately 10x more test-time compute than regular Grok 4, enabling it to solve significantly more complex problems. On the Humanities Last Exam, it achieves over 50% accuracy on text-only problems, and it scored a perfect result on the AIME 2025 mathematics competition. The system represents a major advancement in multi-agent AI collaboration and reasoning capabilities.",
  "release_date": "2025-07-09",
  "announcement_date": "2025-07-09",
  "license_id": "proprietary",
  "multimodal": true,
  "knowledge_cutoff": "2024-12-31",
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://x.ai/api",
  "source_playground": null,
  "source_paper": null,
  "source_scorecard_blog_link": null,
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-07-19T19:49:05.700416+00:00",
  "updated_at": "2025-07-19T19:49:05.700416+00:00",
  "model_family_id": null
}

================================================
FILE: data/organizations/xai/models/grok-code-fast-1/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 22227,
    "benchmark_id": "swe-bench-verified",
    "model_id": "grok-code-fast-1",
    "score": 0.708,
    "normalized_score": 0.708,
    "is_self_reported": true,
    "self_reported_source_link": "https://x.ai/blog/grok-code-fast-1",
    "verified_by_llmstats": false,
    "analysis_method": "full subset, internal harness",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-10-03T00:00:00.000000+00:00",
    "updated_at": "2025-10-03T00:00:00.000000+00:00",
    "benchmark_name": "SWE-Bench Verified"
  }
]


================================================
FILE: data/organizations/xai/models/grok-code-fast-1/model.json
================================================
{
  "model_id": "grok-code-fast-1",
  "name": "Grok Code Fast 1",
  "organization_id": "xai",
  "fine_tuned_from_model_id": null,
  "description": "Grok Code Fast 1 is a speedy and economical reasoning model that excels at agentic coding. Built from scratch with a brand-new model architecture, it features a pre-training corpus rich with programming-related content and post-training datasets that reflect real-world pull requests and coding tasks. The model has mastered the use of common tools like grep, terminal, and file editing, making it ideal for integration with IDEs. It is exceptionally versatile across the full software development stack and is particularly adept at TypeScript, Python, Java, Rust, C++, and Go.",
  "release_date": "2025-08-28",
  "announcement_date": "2025-08-28",
  "license_id": "proprietary",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": null,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://x.ai/api",
  "source_playground": null,
  "source_paper": "https://data.x.ai/2025-08-26-grok-code-fast-1-model-card.pdf",
  "source_scorecard_blog_link": "https://x.ai/blog/grok-code-fast-1",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-10-03T00:00:00.000000+00:00",
  "updated_at": "2025-10-03T00:00:00.000000+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/xai/organization.json
================================================
{
  "organization_id": "xai",
  "name": "xAI",
  "website": "https://x.ai",
  "description": "Elon Musk AI company",
  "country": "US",
  "created_at": "2025-07-19T19:49:05.695344+00:00",
  "updated_at": "2025-07-19T19:49:05.695344+00:00"
}


================================================
FILE: data/organizations/zai-org/models/glm-4.5/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 7001,
    "benchmark_id": "mmlu-pro",
    "model_id": "glm-4.5",
    "score": 0.846,
    "normalized_score": 0.846,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 7002,
    "benchmark_id": "aime-2024",
    "model_id": "glm-4.5",
    "score": 0.91,
    "normalized_score": 0.91,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@32",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "AIME24"
  },
  {
    "model_benchmark_id": 7003,
    "benchmark_id": "math-500",
    "model_id": "glm-4.5",
    "score": 0.982,
    "normalized_score": 0.982,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "MATH-500"
  },
  {
    "model_benchmark_id": 7004,
    "benchmark_id": "scicode",
    "model_id": "glm-4.5",
    "score": 0.417,
    "normalized_score": 0.417,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "SciCode"
  },
  {
    "model_benchmark_id": 7005,
    "benchmark_id": "gpqa",
    "model_id": "glm-4.5",
    "score": 0.791,
    "normalized_score": 0.791,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@8",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 7006,
    "benchmark_id": "livecodebench",
    "model_id": "glm-4.5",
    "score": 0.729,
    "normalized_score": 0.729,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "2407-2501",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 7007,
    "benchmark_id": "swe-bench-verified",
    "model_id": "glm-4.5",
    "score": 0.642,
    "normalized_score": 0.642,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "OpenHands v0.34.0",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "SWE-bench-Verified"
  },
  {
    "model_benchmark_id": 7008,
    "benchmark_id": "tau-bench-retail",
    "model_id": "glm-4.5",
    "score": 0.797,
    "normalized_score": 0.797,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "optimized user simulator",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "TAU-bench-Retail"
  },
  {
    "model_benchmark_id": 7009,
    "benchmark_id": "bfcl-v3",
    "model_id": "glm-4.5",
    "score": 0.778,
    "normalized_score": 0.778,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "Full",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "BFCL-v3"
  },
  {
    "model_benchmark_id": 7010,
    "benchmark_id": "tau-bench-airline",
    "model_id": "glm-4.5",
    "score": 0.604,
    "normalized_score": 0.604,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "optimized user simulator",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "TAU-bench-Airline"
  },
  {
    "model_benchmark_id": 7011,
    "benchmark_id": "browsecomp",
    "model_id": "glm-4.5",
    "score": 0.264,
    "normalized_score": 0.264,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "BrowseComp"
  },
  {
    "model_benchmark_id": 7012,
    "benchmark_id": "hle",
    "model_id": "glm-4.5",
    "score": 0.144,
    "normalized_score": 0.144,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "text-based questions only",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "HLE"
  },
  {
    "model_benchmark_id": 7013,
    "benchmark_id": "aa-index",
    "model_id": "glm-4.5",
    "score": 0.677,
    "normalized_score": 0.677,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "Estimated",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "AA-Index"
  },
  {
    "model_benchmark_id": 7014,
    "benchmark_id": "terminal-bench",
    "model_id": "glm-4.5",
    "score": 0.375,
    "normalized_score": 0.375,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "Terminus framework",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "Terminal-Bench"
  }
]


================================================
FILE: data/organizations/zai-org/models/glm-4.5/model.json
================================================
{
  "model_id": "glm-4.5",
  "name": "GLM-4.5",
  "organization_id": "zai-org",
  "fine_tuned_from_model_id": null,
  "description": "GLM-4.5 is an Agentic, Reasoning, and Coding (ARC) foundation model designed for intelligent agents, featuring 355 billion total parameters with 32 billion active parameters using MoE architecture. Trained on 23T tokens through multi-stage training, it is a hybrid reasoning model that provides two modes: thinking mode for complex reasoning and tool usage, and non-thinking mode for immediate responses. The model unifies agentic, reasoning, and coding capabilities with 128K context length support. It achieves exceptional performance with a score of 63.2 across 12 industry-standard benchmarks, placing 3rd among all proprietary and open-source models. Released under MIT open-source license allowing commercial use and secondary development.",
  "release_date": "2025-07-28",
  "announcement_date": "2025-07-28",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 355000000000,
  "training_tokens": 23000000000000,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.z.ai/guides/llm/glm-4.5",
  "source_playground": "https://chat.z.ai",
  "source_paper": "https://arxiv.org/pdf/2508.06471",
  "source_scorecard_blog_link": "https://z.ai/blog/glm-4.5",
  "source_repo_link": "https://github.com/zai-org/GLM-4.5",
  "source_weights_link": "https://huggingface.co/zai-org/GLM-4.5",
  "created_at": "2025-09-15T00:00:00.000000+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/zai-org/models/glm-4.5-air/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 7101,
    "benchmark_id": "mmlu-pro",
    "model_id": "glm-4.5-air",
    "score": 0.814,
    "normalized_score": 0.814,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "MMLU-Pro"
  },
  {
    "model_benchmark_id": 7102,
    "benchmark_id": "aime-2024",
    "model_id": "glm-4.5-air",
    "score": 0.894,
    "normalized_score": 0.894,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@32",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "AIME24"
  },
  {
    "model_benchmark_id": 7103,
    "benchmark_id": "math-500",
    "model_id": "glm-4.5-air",
    "score": 0.981,
    "normalized_score": 0.981,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "MATH-500"
  },
  {
    "model_benchmark_id": 7104,
    "benchmark_id": "scicode",
    "model_id": "glm-4.5-air",
    "score": 0.373,
    "normalized_score": 0.373,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "SciCode"
  },
  {
    "model_benchmark_id": 7105,
    "benchmark_id": "gpqa",
    "model_id": "glm-4.5-air",
    "score": 0.75,
    "normalized_score": 0.75,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "Avg@8",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 7106,
    "benchmark_id": "livecodebench",
    "model_id": "glm-4.5-air",
    "score": 0.707,
    "normalized_score": 0.707,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "2407-2501",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "LiveCodeBench"
  },
  {
    "model_benchmark_id": 7107,
    "benchmark_id": "swe-bench-verified",
    "model_id": "glm-4.5-air",
    "score": 0.576,
    "normalized_score": 0.576,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "OpenHands v0.34.0",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "SWE-bench-Verified"
  },
  {
    "model_benchmark_id": 7108,
    "benchmark_id": "tau-bench-retail",
    "model_id": "glm-4.5-air",
    "score": 0.779,
    "normalized_score": 0.779,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "optimized user simulator",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "TAU-bench-Retail"
  },
  {
    "model_benchmark_id": 7109,
    "benchmark_id": "bfcl-v3",
    "model_id": "glm-4.5-air",
    "score": 0.764,
    "normalized_score": 0.764,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "Full",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "BFCL-v3"
  },
  {
    "model_benchmark_id": 7110,
    "benchmark_id": "tau-bench-airline",
    "model_id": "glm-4.5-air",
    "score": 0.608,
    "normalized_score": 0.608,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "optimized user simulator",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "TAU-bench-Airline"
  },
  {
    "model_benchmark_id": 7111,
    "benchmark_id": "browsecomp",
    "model_id": "glm-4.5-air",
    "score": 0.213,
    "normalized_score": 0.213,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "BrowseComp"
  },
  {
    "model_benchmark_id": 7112,
    "benchmark_id": "hle",
    "model_id": "glm-4.5-air",
    "score": 0.106,
    "normalized_score": 0.106,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "text-based questions only",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "HLE"
  },
  {
    "model_benchmark_id": 7113,
    "benchmark_id": "aa-index",
    "model_id": "glm-4.5-air",
    "score": 0.648,
    "normalized_score": 0.648,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "Estimated",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "AA-Index"
  },
  {
    "model_benchmark_id": 7114,
    "benchmark_id": "terminal-bench",
    "model_id": "glm-4.5-air",
    "score": 0.3,
    "normalized_score": 0.3,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.5",
    "verified_by_llmstats": false,
    "analysis_method": "Terminus framework",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-28T00:00:00.000000+00:00",
    "updated_at": "2025-07-28T00:00:00.000000+00:00",
    "benchmark_name": "Terminal-Bench"
  }
]


================================================
FILE: data/organizations/zai-org/models/glm-4.5-air/model.json
================================================
{
  "model_id": "glm-4.5-air",
  "name": "GLM-4.5-Air",
  "organization_id": "zai-org",
  "fine_tuned_from_model_id": null,
  "description": "GLM-4.5-Air is a more compact variant of GLM-4.5 designed for efficient Agentic, Reasoning, and Coding (ARC) applications. It features 106 billion total parameters with 12 billion active parameters using MoE architecture. Like GLM-4.5, it is a hybrid reasoning model providing thinking mode for complex reasoning and tool usage, and non-thinking mode for immediate responses. Despite its compact design, GLM-4.5-Air delivers competitive performance with a score of 59.8 across 12 industry-standard benchmarks, ranking 6th overall while maintaining superior efficiency. It supports 128K context length and is released under MIT open-source license allowing commercial use.",
  "release_date": "2025-07-28",
  "announcement_date": "2025-07-28",
  "license_id": "mit",
  "multimodal": false,
  "knowledge_cutoff": null,
  "param_count": 106000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.z.ai/guides/llm/glm-4.5",
  "source_playground": "https://chat.z.ai",
  "source_paper": "https://arxiv.org/pdf/2508.06471",
  "source_scorecard_blog_link": "https://z.ai/blog/glm-4.5",
  "source_repo_link": "https://github.com/zai-org/GLM-4.5",
  "source_weights_link": "https://huggingface.co/zai-org/GLM-4.5-Air",
  "created_at": "2025-09-15T00:00:00.000000+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00",
  "model_family_id": null
}


================================================
FILE: data/organizations/zai-org/models/glm-4.5v/benchmarks.json
================================================
[]


================================================
FILE: data/organizations/zai-org/models/glm-4.5v/model.json
================================================
{
  "model_id": "glm-4.5v",
  "name": "GLM-4.5V",
  "organization_id": "zai-org",
  "model_family_id": null,
  "fine_tuned_from_model_id": "glm-4.5-air",
  "description": "GLM-4.5V is a multimodal (vision-language) model based on GLM-4.5-Air (106B total, 12B active) that extends hybrid reasoning to images and video. It achieves state-of-the-art results across 40+ VLM benchmarks (image reasoning, video understanding, GUI tasks, chart/document parsing, grounding) while supporting a Thinking Mode switch for deep reasoning. Released under MIT with FP8/BF16 variants and tooling in Transformers, vLLM, and SGLang.",
  "release_date": "2025-08-11",
  "announcement_date": "2025-08-11",
  "license_id": "mit",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 108000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": null,
  "source_playground": "https://chat.z.ai",
  "source_paper": "https://arxiv.org/abs/2507.01006",
  "source_scorecard_blog_link": null,
  "source_repo_link": "https://github.com/zai-org/GLM-V/",
  "source_weights_link": "https://huggingface.co/zai-org/GLM-4.5V",
  "created_at": "2025-09-29T00:00:00.000000+00:00",
  "updated_at": "2025-09-29T00:00:00.000000+00:00"
}


================================================
FILE: data/organizations/zai-org/models/glm-4.6/benchmarks.json
================================================
[
  {
    "model_benchmark_id": 7002,
    "benchmark_id": "aime-2025",
    "model_id": "glm-4.6",
    "score": 0.939,
    "normalized_score": 0.939,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.6",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-30T00:00:00.000000+00:00",
    "updated_at": "2025-07-30T00:00:00.000000+00:00",
    "benchmark_name": "AIME 2025"
  },
  {
    "model_benchmark_id": 7005,
    "benchmark_id": "gpqa",
    "model_id": "glm-4.6",
    "score": 0.81,
    "normalized_score": 0.81,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.6",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-30T00:00:00.000000+00:00",
    "updated_at": "2025-07-30T00:00:00.000000+00:00",
    "benchmark_name": "GPQA"
  },
  {
    "model_benchmark_id": 7006,
    "benchmark_id": "livecodebench-v6",
    "model_id": "glm-4.6",
    "score": 0.828,
    "normalized_score": 0.828,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.6",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-30T00:00:00.000000+00:00",
    "updated_at": "2025-07-30T00:00:00.000000+00:00",
    "benchmark_name": "LiveCodeBench v6"
  },
  {
    "model_benchmark_id": 7007,
    "benchmark_id": "swe-bench-verified",
    "model_id": "glm-4.6",
    "score": 0.68,
    "normalized_score": 0.68,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.6",
    "verified_by_llmstats": false,
    "analysis_method": "OpenHands v0.34.0",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-30T00:00:00.000000+00:00",
    "updated_at": "2025-07-30T00:00:00.000000+00:00",
    "benchmark_name": "SWE-bench-Verified"
  },
  {
    "model_benchmark_id": 7011,
    "benchmark_id": "browsecomp",
    "model_id": "glm-4.6",
    "score": 0.451,
    "normalized_score": 0.451,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.6",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-30T00:00:00.000000+00:00",
    "updated_at": "2025-07-30T00:00:00.000000+00:00",
    "benchmark_name": "BrowseComp"
  },
  {
    "model_benchmark_id": 7012,
    "benchmark_id": "hle",
    "model_id": "glm-4.6",
    "score": 0.172,
    "normalized_score": 0.172,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.6",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-30T00:00:00.000000+00:00",
    "updated_at": "2025-07-30T00:00:00.000000+00:00",
    "benchmark_name": "HLE"
  },
  {
    "model_benchmark_id": 7014,
    "benchmark_id": "terminal-bench",
    "model_id": "glm-4.6",
    "score": 0.405,
    "normalized_score": 0.405,
    "is_self_reported": true,
    "self_reported_source_link": "https://z.ai/blog/glm-4.6",
    "verified_by_llmstats": false,
    "analysis_method": "standard",
    "verification_provider_id": null,
    "verification_hardware": null,
    "verification_date": null,
    "verification_notes": null,
    "created_at": "2025-07-30T00:00:00.000000+00:00",
    "updated_at": "2025-07-30T00:00:00.000000+00:00",
    "benchmark_name": "Terminal-Bench"
  }
]


================================================
FILE: data/organizations/zai-org/models/glm-4.6/model.json
================================================
{
  "model_id": "glm-4.6",
  "name": "GLM-4.6",
  "organization_id": "zai-org",
  "model_family_id": null,
  "fine_tuned_from_model_id": null,
  "description": "GLM-4.6 is the latest version of Z.ai's flagship model, bringing significant improvements over GLM-4.5. Key features include: 200K token context window (expanded from 128K), superior coding performance with better real-world application in Claude Code/Cline/Roo Code/Kilo Code, advanced reasoning with tool use during inference, stronger agent capabilities, and refined writing aligned with human preferences. GLM-4.6 achieves competitive performance with DeepSeek-V3.2-Exp and Claude Sonnet 4, reaching near parity with Claude Sonnet 4 (48.6% win rate) on CC-Bench real-world coding tasks.",
  "release_date": "2025-09-30",
  "announcement_date": "2025-09-30",
  "license_id": "mit",
  "multimodal": true,
  "knowledge_cutoff": null,
  "param_count": 357000000000,
  "training_tokens": null,
  "available_in_zeroeval": true,
  "source_api_ref": "https://docs.z.ai/guides/llm/glm-4.6",
  "source_playground": "https://chat.z.ai",
  "source_paper": "https://arxiv.org/pdf/2508.06471",
  "source_scorecard_blog_link": "https://huggingface.co/zai-org/GLM-4.6",
  "source_repo_link": null,
  "source_weights_link": null,
  "created_at": "2025-09-29T00:00:00.000000+00:00",
  "updated_at": "2025-09-30T00:00:00.000000+00:00"
}


================================================
FILE: data/organizations/zai-org/organization.json
================================================
{
  "organization_id": "zai-org",
  "name": "Zhipu AI",
  "website": "https://z.ai",
  "description": "Zhipu AI is a Chinese AI company that provides a suite of AI tools and services.",
  "country": "CN",
  "created_at": "2025-09-15T00:00:00.000000+00:00",
  "updated_at": "2025-09-15T00:00:00.000000+00:00"
}


================================================
FILE: data/providers/anthropic/models.json
================================================
[
  {
    "model_provider_id": 398,
    "model_id": "claude-3-5-haiku-20241022",
    "provider_id": "anthropic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 100,
    "output_cents_per_million_tokens": 500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 100.0,
    "latency": 0.3,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.073101+00:00",
    "updated_at": "2025-07-19T19:49:17.073101+00:00",
    "provider_model_id_used": "claude-3-5-haiku-20241022",
    "model_name": "Claude 3.5 Haiku",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 397,
    "model_id": "claude-3-5-sonnet-20241022",
    "provider_id": "anthropic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.071608+00:00",
    "updated_at": "2025-07-19T19:49:17.071608+00:00",
    "provider_model_id_used": "claude-3-5-sonnet-20241022",
    "model_name": "Claude 3.5 Sonnet",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 402,
    "model_id": "claude-3-7-sonnet-20250219",
    "provider_id": "anthropic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.082450+00:00",
    "updated_at": "2025-07-19T19:49:17.082450+00:00",
    "provider_model_id_used": "claude-3-7-sonnet-20250219",
    "model_name": "Claude 3.7 Sonnet",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 401,
    "model_id": "claude-3-haiku-20240307",
    "provider_id": "anthropic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 25,
    "output_cents_per_million_tokens": 125,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.080579+00:00",
    "updated_at": "2025-07-19T19:49:17.080579+00:00",
    "provider_model_id_used": "claude-3-haiku-20240307",
    "model_name": "Claude 3 Haiku",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 399,
    "model_id": "claude-3-opus-20240229",
    "provider_id": "anthropic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1500,
    "output_cents_per_million_tokens": 7500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.075485+00:00",
    "updated_at": "2025-07-19T19:49:17.075485+00:00",
    "provider_model_id_used": "claude-3-opus-20240229",
    "model_name": "Claude 3 Opus",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 400,
    "model_id": "claude-3-sonnet-20240229",
    "provider_id": "anthropic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.078602+00:00",
    "updated_at": "2025-07-19T19:49:17.078602+00:00",
    "provider_model_id_used": "claude-3-sonnet-20240229",
    "model_name": "Claude 3 Sonnet",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 404,
    "model_id": "claude-opus-4-20250514",
    "provider_id": "anthropic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1500,
    "output_cents_per_million_tokens": 7500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 128000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.086661+00:00",
    "updated_at": "2025-07-19T19:49:17.086661+00:00",
    "provider_model_id_used": "claude-opus-4-20250514",
    "model_name": "Claude Opus 4",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 405,
    "model_id": "claude-opus-4-1-20250805",
    "provider_id": "anthropic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1500,
    "output_cents_per_million_tokens": 7500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 32000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-08-05T00:00:00.000000+00:00",
    "updated_at": "2025-08-05T00:00:00.000000+00:00",
    "provider_model_id_used": "claude-opus-4-1-20250805",
    "model_name": "Claude Opus 4.1",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 403,
    "model_id": "claude-sonnet-4-20250514",
    "provider_id": "anthropic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.084616+00:00",
    "updated_at": "2025-07-19T19:49:17.084616+00:00",
    "provider_model_id_used": "claude-sonnet-4-20250514",
    "model_name": "Claude Sonnet 4",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 406,
    "model_id": "claude-sonnet-4-5-20250929",
    "provider_id": "anthropic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 64000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": true,
    "input_modality_video": true,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.084616+00:00",
    "updated_at": "2025-07-19T19:49:17.084616+00:00",
    "provider_model_id_used": "claude-sonnet-4-5-20250929",
    "model_name": "Claude Sonnet 4.5",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 407,
    "model_id": "claude-haiku-4-5-20251015",
    "provider_id": "anthropic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 100,
    "output_cents_per_million_tokens": 500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 100.0,
    "latency": 0.3,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-10-15T00:00:00.000000+00:00",
    "updated_at": "2025-10-15T00:00:00.000000+00:00",
    "provider_model_id_used": "claude-haiku-4-5-20251015",
    "model_name": "Claude Haiku 4.5",
    "organization_id": "anthropic"
  }
]


================================================
FILE: data/providers/anthropic/provider.json
================================================
{
  "provider_id": "anthropic",
  "name": "Anthropic",
  "website": "https://anthropic.com",
  "created_at": "2025-07-19T19:49:17.069874+00:00",
  "updated_at": "2025-07-19T19:49:17.069874+00:00"
}


================================================
FILE: data/providers/azure/models.json
================================================
[
  {
    "model_provider_id": 261,
    "model_id": "gpt-3.5-turbo-0125",
    "provider_id": "azure",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 50,
    "output_cents_per_million_tokens": 150,
    "quantization": null,
    "max_input_tokens": 16385,
    "max_output_tokens": 4096,
    "throughput": 90.0,
    "latency": 0.8,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.759540+00:00",
    "updated_at": "2025-07-19T19:49:16.759540+00:00",
    "provider_model_id_used": "gpt-3.5-turbo-0125",
    "model_name": "GPT-3.5 Turbo",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 259,
    "model_id": "gpt-4-0613",
    "provider_id": "azure",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 3000,
    "output_cents_per_million_tokens": 6000,
    "quantization": null,
    "max_input_tokens": 32768,
    "max_output_tokens": 32768,
    "throughput": 104.0,
    "latency": 0.3,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.751649+00:00",
    "updated_at": "2025-07-19T19:49:16.751649+00:00",
    "provider_model_id_used": "gpt-4-0613",
    "model_name": "GPT-4",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 264,
    "model_id": "gpt-4o-2024-05-13",
    "provider_id": "azure",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 250,
    "output_cents_per_million_tokens": 1000,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 4096,
    "throughput": 92.0,
    "latency": 0.54,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.767540+00:00",
    "updated_at": "2025-07-19T19:49:16.767540+00:00",
    "provider_model_id_used": "gpt-4o-2024-05-13",
    "model_name": "GPT-4o",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 263,
    "model_id": "gpt-4o-2024-08-06",
    "provider_id": "azure",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 250,
    "output_cents_per_million_tokens": 1000,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 16384,
    "throughput": 99.0,
    "latency": 0.53,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.765163+00:00",
    "updated_at": "2025-07-19T19:49:16.765163+00:00",
    "provider_model_id_used": "gpt-4o-2024-08-06",
    "model_name": "GPT-4o",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 262,
    "model_id": "gpt-4o-mini-2024-07-18",
    "provider_id": "azure",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 15,
    "output_cents_per_million_tokens": 60,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 16384,
    "throughput": 92.0,
    "latency": 0.52,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.762692+00:00",
    "updated_at": "2025-07-19T19:49:16.762692+00:00",
    "provider_model_id_used": "gpt-4o-mini-2024-07-18",
    "model_name": "GPT-4o mini",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 260,
    "model_id": "gpt-4-turbo-2024-04-09",
    "provider_id": "azure",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1000,
    "output_cents_per_million_tokens": 3000,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 4096,
    "throughput": 97.0,
    "latency": 0.6,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.755438+00:00",
    "updated_at": "2025-07-19T19:49:16.755438+00:00",
    "provider_model_id_used": "gpt-4-turbo-2024-04-09",
    "model_name": "GPT-4 Turbo",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 266,
    "model_id": "o1-2024-12-17",
    "provider_id": "azure",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1500,
    "output_cents_per_million_tokens": 6000,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 100000,
    "throughput": 16.0,
    "latency": 0.54,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.772502+00:00",
    "updated_at": "2025-07-19T19:49:16.772502+00:00",
    "provider_model_id_used": "o1-2024-12-17",
    "model_name": "o1",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 267,
    "model_id": "o1-mini",
    "provider_id": "azure",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 330,
    "output_cents_per_million_tokens": 1320,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 65536,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.774395+00:00",
    "updated_at": "2025-07-19T19:49:16.774395+00:00",
    "provider_model_id_used": "o1-mini",
    "model_name": "o1-mini",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 265,
    "model_id": "o1-preview",
    "provider_id": "azure",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1650,
    "output_cents_per_million_tokens": 6600,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 32768,
    "throughput": 16.0,
    "latency": 0.54,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.770395+00:00",
    "updated_at": "2025-07-19T19:49:16.770395+00:00",
    "provider_model_id_used": "o1-preview",
    "model_name": "o1-preview",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 268,
    "model_id": "o3-mini",
    "provider_id": "azure",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 110,
    "output_cents_per_million_tokens": 440,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 100000,
    "throughput": 115.0,
    "latency": 5.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.776480+00:00",
    "updated_at": "2025-07-19T19:49:16.776480+00:00",
    "provider_model_id_used": "o3-mini",
    "model_name": "o3-mini",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 269,
    "model_id": "phi-3.5-mini-instruct",
    "provider_id": "azure",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 10,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 23.0,
    "latency": 0.52,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.778852+00:00",
    "updated_at": "2025-07-19T19:49:16.778852+00:00",
    "provider_model_id_used": "phi-3.5-mini-instruct",
    "model_name": "Phi-3.5-mini-instruct",
    "organization_id": "microsoft"
  }
]

================================================
FILE: data/providers/azure/provider.json
================================================
{
  "provider_id": "azure",
  "name": "Azure",
  "website": "https://azure.microsoft.com",
  "created_at": "2025-07-19T19:49:16.749000+00:00",
  "updated_at": "2025-07-19T19:49:16.749000+00:00"
}

================================================
FILE: data/providers/bedrock/models.json
================================================
[
  {
    "model_provider_id": 369,
    "model_id": "claude-3-5-haiku-20241022",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 80,
    "output_cents_per_million_tokens": 400,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 104.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.009862+00:00",
    "updated_at": "2025-07-19T19:49:17.009862+00:00",
    "provider_model_id_used": "claude-3-5-haiku-20241022",
    "model_name": "Claude 3.5 Haiku",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 368,
    "model_id": "claude-3-5-sonnet-20240620",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 101.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.007722+00:00",
    "updated_at": "2025-07-19T19:49:17.007722+00:00",
    "provider_model_id_used": "claude-3-5-sonnet-20240620",
    "model_name": "Claude 3.5 Sonnet",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 367,
    "model_id": "claude-3-5-sonnet-20241022",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 101.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.005765+00:00",
    "updated_at": "2025-07-19T19:49:17.005765+00:00",
    "provider_model_id_used": "claude-3-5-sonnet-20241022",
    "model_name": "Claude 3.5 Sonnet",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 385,
    "model_id": "claude-3-7-sonnet-20250219",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 128000,
    "throughput": 101.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.041625+00:00",
    "updated_at": "2025-07-19T19:49:17.041625+00:00",
    "provider_model_id_used": "claude-3-7-sonnet-20250219",
    "model_name": "Claude 3.7 Sonnet",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 372,
    "model_id": "claude-3-haiku-20240307",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 25,
    "output_cents_per_million_tokens": 125,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 104.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.016542+00:00",
    "updated_at": "2025-07-19T19:49:17.016542+00:00",
    "provider_model_id_used": "claude-3-haiku-20240307",
    "model_name": "Claude 3 Haiku",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 370,
    "model_id": "claude-3-opus-20240229",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1500,
    "output_cents_per_million_tokens": 7500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 120.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.011523+00:00",
    "updated_at": "2025-07-19T19:49:17.011523+00:00",
    "provider_model_id_used": "claude-3-opus-20240229",
    "model_name": "Claude 3 Opus",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 371,
    "model_id": "claude-3-sonnet-20240229",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 120.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.014573+00:00",
    "updated_at": "2025-07-19T19:49:17.014573+00:00",
    "provider_model_id_used": "claude-3-sonnet-20240229",
    "model_name": "Claude 3 Sonnet",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 387,
    "model_id": "claude-opus-4-20250514",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1500,
    "output_cents_per_million_tokens": 7500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 128000,
    "throughput": 120.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.046935+00:00",
    "updated_at": "2025-07-19T19:49:17.046935+00:00",
    "provider_model_id_used": "claude-opus-4-20250514",
    "model_name": "Claude Opus 4",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 388,
    "model_id": "claude-opus-4-1-20250805",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1500,
    "output_cents_per_million_tokens": 7500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 32000,
    "throughput": 120.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-08-05T00:00:00.000000+00:00",
    "updated_at": "2025-08-05T00:00:00.000000+00:00",
    "provider_model_id_used": "claude-opus-4-1-20250805",
    "model_name": "Claude Opus 4.1",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 386,
    "model_id": "claude-sonnet-4-20250514",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 128000,
    "throughput": 101.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.044184+00:00",
    "updated_at": "2025-07-19T19:49:17.044184+00:00",
    "provider_model_id_used": "claude-sonnet-4-20250514",
    "model_name": "Claude Sonnet 4",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 381,
    "model_id": "command-r-plus-04-2024",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.034365+00:00",
    "updated_at": "2025-07-19T19:49:17.034365+00:00",
    "provider_model_id_used": "command-r-plus-04-2024",
    "model_name": "Command R+",
    "organization_id": "cohere"
  },
  {
    "model_provider_id": 374,
    "model_id": "jamba-1.5-large",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 200,
    "output_cents_per_million_tokens": 800,
    "quantization": null,
    "max_input_tokens": 256000,
    "max_output_tokens": 256000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.020432+00:00",
    "updated_at": "2025-07-19T19:49:17.020432+00:00",
    "provider_model_id_used": "jamba-1.5-large",
    "model_name": "Jamba 1.5 Large",
    "organization_id": "ai21"
  },
  {
    "model_provider_id": 373,
    "model_id": "jamba-1.5-mini",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 20,
    "output_cents_per_million_tokens": 40,
    "quantization": null,
    "max_input_tokens": 256144,
    "max_output_tokens": 256144,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.018357+00:00",
    "updated_at": "2025-07-19T19:49:17.018357+00:00",
    "provider_model_id_used": "jamba-1.5-mini",
    "model_name": "Jamba 1.5 Mini",
    "organization_id": "ai21"
  },
  {
    "model_provider_id": 376,
    "model_id": "llama-3.1-405b-instruct",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 300,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.024314+00:00",
    "updated_at": "2025-07-19T19:49:17.024314+00:00",
    "provider_model_id_used": "llama-3.1-405b-instruct",
    "model_name": "Llama 3.1 405B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 375,
    "model_id": "llama-3.1-70b-instruct",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 89,
    "output_cents_per_million_tokens": 89,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.022256+00:00",
    "updated_at": "2025-07-19T19:49:17.022256+00:00",
    "provider_model_id_used": "llama-3.1-70b-instruct",
    "model_name": "Llama 3.1 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 377,
    "model_id": "llama-3.1-8b-instruct",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 22,
    "output_cents_per_million_tokens": 22,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.026582+00:00",
    "updated_at": "2025-07-19T19:49:17.026582+00:00",
    "provider_model_id_used": "llama-3.1-8b-instruct",
    "model_name": "Llama 3.1 8B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 378,
    "model_id": "llama-3.2-11b-instruct",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 16,
    "output_cents_per_million_tokens": 16,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.028853+00:00",
    "updated_at": "2025-07-19T19:49:17.028853+00:00",
    "provider_model_id_used": "llama-3.2-11b-instruct",
    "model_name": "Llama 3.2 11B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 379,
    "model_id": "llama-3.2-90b-instruct",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 72,
    "output_cents_per_million_tokens": 72,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.030727+00:00",
    "updated_at": "2025-07-19T19:49:17.030727+00:00",
    "provider_model_id_used": "llama-3.2-90b-instruct",
    "model_name": "Llama 3.2 90B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 380,
    "model_id": "llama-3.3-70b-instruct",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 72,
    "output_cents_per_million_tokens": 72,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.032478+00:00",
    "updated_at": "2025-07-19T19:49:17.032478+00:00",
    "provider_model_id_used": "llama-3.3-70b-instruct",
    "model_name": "Llama 3.3 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 383,
    "model_id": "nova-lite",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 6,
    "output_cents_per_million_tokens": 24,
    "quantization": null,
    "max_input_tokens": 300000,
    "max_output_tokens": 2048,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.037841+00:00",
    "updated_at": "2025-07-19T19:49:17.037841+00:00",
    "provider_model_id_used": "nova-lite",
    "model_name": "Nova Lite",
    "organization_id": "amazon"
  },
  {
    "model_provider_id": 382,
    "model_id": "nova-micro",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 3,
    "output_cents_per_million_tokens": 14,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.036065+00:00",
    "updated_at": "2025-07-19T19:49:17.036065+00:00",
    "provider_model_id_used": "nova-micro",
    "model_name": "Nova Micro",
    "organization_id": "amazon"
  },
  {
    "model_provider_id": 384,
    "model_id": "nova-pro",
    "provider_id": "bedrock",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 80,
    "output_cents_per_million_tokens": 320,
    "quantization": null,
    "max_input_tokens": 300000,
    "max_output_tokens": 300000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.039606+00:00",
    "updated_at": "2025-07-19T19:49:17.039606+00:00",
    "provider_model_id_used": "nova-pro",
    "model_name": "Nova Pro",
    "organization_id": "amazon"
  }
]


================================================
FILE: data/providers/bedrock/provider.json
================================================
{
  "provider_id": "bedrock",
  "name": "Bedrock",
  "website": "https://aws.amazon.com/bedrock/",
  "created_at": "2025-07-19T19:49:17.004009+00:00",
  "updated_at": "2025-07-19T19:49:17.004009+00:00"
}

================================================
FILE: data/providers/cerebras/models.json
================================================
[
  {
    "model_provider_id": 405,
    "model_id": "llama-3.1-70b-instruct",
    "provider_id": "cerebras",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 60,
    "output_cents_per_million_tokens": 60,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 1204.0,
    "latency": 0.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.090362+00:00",
    "updated_at": "2025-07-19T19:49:17.090362+00:00",
    "provider_model_id_used": "llama-3.1-70b-instruct",
    "model_name": "Llama 3.1 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 406,
    "model_id": "llama-3.1-8b-instruct",
    "provider_id": "cerebras",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 10,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 2047.0,
    "latency": 0.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.092709+00:00",
    "updated_at": "2025-07-19T19:49:17.092709+00:00",
    "provider_model_id_used": "llama-3.1-8b-instruct",
    "model_name": "Llama 3.1 8B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 407,
    "model_id": "llama-3.3-70b-instruct",
    "provider_id": "cerebras",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 70,
    "output_cents_per_million_tokens": 80,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 2220.0,
    "latency": 0.65,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.095252+00:00",
    "updated_at": "2025-07-19T19:49:17.095252+00:00",
    "provider_model_id_used": "llama-3.3-70b-instruct",
    "model_name": "Llama 3.3 70B Instruct",
    "organization_id": "meta"
  }
]

================================================
FILE: data/providers/cerebras/provider.json
================================================
{
  "provider_id": "cerebras",
  "name": "Cerebras",
  "website": "https://cerebras.ai",
  "created_at": "2025-07-19T19:49:17.088130+00:00",
  "updated_at": "2025-07-19T19:49:17.088130+00:00"
}

================================================
FILE: data/providers/cohere/models.json
================================================
[
  {
    "model_provider_id": 238,
    "model_id": "command-r-plus-04-2024",
    "provider_id": "cohere",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 25,
    "output_cents_per_million_tokens": 100,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 59.0,
    "latency": 0.65,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.693641+00:00",
    "updated_at": "2025-07-19T19:49:16.693641+00:00",
    "provider_model_id_used": "command-r-plus-04-2024",
    "model_name": "Command R+",
    "organization_id": "cohere"
  }
]

================================================
FILE: data/providers/cohere/provider.json
================================================
{
  "provider_id": "cohere",
  "name": "Cohere",
  "website": "https://cohere.ai",
  "created_at": "2025-07-19T19:49:16.663117+00:00",
  "updated_at": "2025-07-19T19:49:16.663117+00:00"
}


================================================
FILE: data/providers/deepinfra/models.json
================================================
[
  {
    "model_provider_id": 290,
    "model_id": "deepseek-r1",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 85,
    "output_cents_per_million_tokens": 250,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 0.9,
    "latency": 0.3,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.830887+00:00",
    "updated_at": "2025-07-19T19:49:16.830887+00:00",
    "provider_model_id_used": "deepseek-r1",
    "model_name": "DeepSeek-R1",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 304,
    "model_id": "deepseek-r1-0528",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 50,
    "output_cents_per_million_tokens": 215,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 45.04,
    "latency": 0.61,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.862375+00:00",
    "updated_at": "2025-07-19T19:49:16.862375+00:00",
    "provider_model_id_used": "deepseek-r1-0528",
    "model_name": "DeepSeek-R1-0528",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 298,
    "model_id": "deepseek-r1-distill-llama-70b",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 40,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 37.0,
    "latency": 0.65,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.847437+00:00",
    "updated_at": "2025-07-19T19:49:16.847437+00:00",
    "provider_model_id_used": "deepseek-r1-distill-llama-70b",
    "model_name": "DeepSeek R1 Distill Llama 70B",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 299,
    "model_id": "deepseek-r1-distill-qwen-32b",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 12,
    "output_cents_per_million_tokens": 18,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 37.0,
    "latency": 0.65,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.849673+00:00",
    "updated_at": "2025-07-19T19:49:16.849673+00:00",
    "provider_model_id_used": "deepseek-r1-distill-qwen-32b",
    "model_name": "DeepSeek R1 Distill Qwen 32B",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 284,
    "model_id": "deepseek-v2.5",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 70,
    "output_cents_per_million_tokens": 140,
    "quantization": null,
    "max_input_tokens": 8192,
    "max_output_tokens": 8192,
    "throughput": 63.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.819006+00:00",
    "updated_at": "2025-07-19T19:49:16.819006+00:00",
    "provider_model_id_used": "deepseek-v2.5",
    "model_name": "DeepSeek-V2.5",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 305,
    "model_id": "deepseek-v3.1",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 27,
    "output_cents_per_million_tokens": 100,
    "quantization": "int4",
    "max_input_tokens": 163840,
    "max_output_tokens": 163840,
    "throughput": null,
    "latency": null,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-15T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "provider_model_id_used": "deepseek-ai/DeepSeek-V3.1",
    "model_name": "DeepSeek V3.1",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 306,
    "model_id": "glm-4.5",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 40,
    "output_cents_per_million_tokens": 160,
    "quantization": "fp8",
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": null,
    "latency": null,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": false,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-15T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "provider_model_id_used": "zai-org/GLM-4.5",
    "model_name": "GLM-4.5",
    "organization_id": "zai-org"
  },
  {
    "model_provider_id": 307,
    "model_id": "gpt-oss-120b",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 9,
    "output_cents_per_million_tokens": 45,
    "quantization": "int4",
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": null,
    "latency": null,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-15T00:00:00.000000+00:00",
    "updated_at": "2025-09-15T00:00:00.000000+00:00",
    "provider_model_id_used": "openai/gpt-oss-120b",
    "model_name": "GPT-OSS-120B",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 294,
    "model_id": "gemma-3-12b-it",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 5,
    "output_cents_per_million_tokens": 10,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 33.0,
    "latency": 0.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.839147+00:00",
    "updated_at": "2025-07-19T19:49:16.839147+00:00",
    "provider_model_id_used": "gemma-3-12b-it",
    "model_name": "Gemma 3 12B",
    "organization_id": "google"
  },
  {
    "model_provider_id": 295,
    "model_id": "gemma-3-27b-it",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 20,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 33.0,
    "latency": 0.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.841300+00:00",
    "updated_at": "2025-07-19T19:49:16.841300+00:00",
    "provider_model_id_used": "gemma-3-27b-it",
    "model_name": "Gemma 3 27B",
    "organization_id": "google"
  },
  {
    "model_provider_id": 293,
    "model_id": "gemma-3-4b-it",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 2,
    "output_cents_per_million_tokens": 4,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 33.0,
    "latency": 0.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.837297+00:00",
    "updated_at": "2025-07-19T19:49:16.837297+00:00",
    "provider_model_id_used": "gemma-3-4b-it",
    "model_name": "Gemma 3 4B",
    "organization_id": "google"
  },
  {
    "model_provider_id": 281,
    "model_id": "llama-3.1-405b-instruct",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 179,
    "output_cents_per_million_tokens": 179,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 27.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.812645+00:00",
    "updated_at": "2025-07-19T19:49:16.812645+00:00",
    "provider_model_id_used": "llama-3.1-405b-instruct",
    "model_name": "Llama 3.1 405B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 279,
    "model_id": "llama-3.1-70b-instruct",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 35,
    "output_cents_per_million_tokens": 40,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 25.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.808506+00:00",
    "updated_at": "2025-07-19T19:49:16.808506+00:00",
    "provider_model_id_used": "llama-3.1-70b-instruct",
    "model_name": "Llama 3.1 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 280,
    "model_id": "llama-3.1-8b-instruct",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 5,
    "output_cents_per_million_tokens": 5,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 118.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.810724+00:00",
    "updated_at": "2025-07-19T19:49:16.810724+00:00",
    "provider_model_id_used": "llama-3.1-8b-instruct",
    "model_name": "Llama 3.1 8B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 283,
    "model_id": "llama-3.2-11b-instruct",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 5,
    "output_cents_per_million_tokens": 5,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 108.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.817103+00:00",
    "updated_at": "2025-07-19T19:49:16.817103+00:00",
    "provider_model_id_used": "llama-3.2-11b-instruct",
    "model_name": "Llama 3.2 11B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 289,
    "model_id": "llama-3.2-3b-instruct",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1,
    "output_cents_per_million_tokens": 2,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 171.5,
    "latency": 0.24,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.828875+00:00",
    "updated_at": "2025-07-19T19:49:16.828875+00:00",
    "provider_model_id_used": "llama-3.2-3b-instruct",
    "model_name": "Llama 3.2 3B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 282,
    "model_id": "llama-3.2-90b-instruct",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 35,
    "output_cents_per_million_tokens": 40,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 24.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.814472+00:00",
    "updated_at": "2025-07-19T19:49:16.814472+00:00",
    "provider_model_id_used": "llama-3.2-90b-instruct",
    "model_name": "Llama 3.2 90B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 288,
    "model_id": "llama-3.3-70b-instruct",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 23,
    "output_cents_per_million_tokens": 40,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 37.0,
    "latency": 0.65,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.827019+00:00",
    "updated_at": "2025-07-19T19:49:16.827019+00:00",
    "provider_model_id_used": "llama-3.3-70b-instruct",
    "model_name": "Llama 3.3 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 296,
    "model_id": "llama-4-maverick",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 17,
    "output_cents_per_million_tokens": 60,
    "quantization": null,
    "max_input_tokens": 1000000,
    "max_output_tokens": 1000000,
    "throughput": 83.59,
    "latency": 0.38,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.843444+00:00",
    "updated_at": "2025-07-19T19:49:16.843444+00:00",
    "provider_model_id_used": "llama-4-maverick",
    "model_name": "Llama 4 Maverick",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 297,
    "model_id": "llama-4-scout",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 8,
    "output_cents_per_million_tokens": 30,
    "quantization": null,
    "max_input_tokens": 10000000,
    "max_output_tokens": 10000000,
    "throughput": 76.1,
    "latency": 0.31,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.845085+00:00",
    "updated_at": "2025-07-19T19:49:16.845085+00:00",
    "provider_model_id_used": "llama-4-scout",
    "model_name": "Llama 4 Scout",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 291,
    "model_id": "mistral-small-24b-instruct-2501",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 7,
    "output_cents_per_million_tokens": 14,
    "quantization": null,
    "max_input_tokens": 32000,
    "max_output_tokens": 32000,
    "throughput": 49.0,
    "latency": 0.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.832954+00:00",
    "updated_at": "2025-07-19T19:49:16.832954+00:00",
    "provider_model_id_used": "mistral-small-24b-instruct-2501",
    "model_name": "Mistral Small 3 24B Instruct",
    "organization_id": "mistral"
  },
  {
    "model_provider_id": 292,
    "model_id": "phi-4",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 7,
    "output_cents_per_million_tokens": 14,
    "quantization": null,
    "max_input_tokens": 16000,
    "max_output_tokens": 16000,
    "throughput": 33.0,
    "latency": 0.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.835314+00:00",
    "updated_at": "2025-07-19T19:49:16.835314+00:00",
    "provider_model_id_used": "phi-4",
    "model_name": "Phi 4",
    "organization_id": "microsoft"
  },
  {
    "model_provider_id": 300,
    "model_id": "phi-4-multimodal-instruct",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 5,
    "output_cents_per_million_tokens": 10,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 25.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.852868+00:00",
    "updated_at": "2025-07-19T19:49:16.852868+00:00",
    "provider_model_id_used": "phi-4-multimodal-instruct",
    "model_name": "Phi-4-multimodal-instruct",
    "organization_id": "microsoft"
  },
  {
    "model_provider_id": 286,
    "model_id": "qwen-2.5-72b-instruct",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 35,
    "output_cents_per_million_tokens": 40,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 8192,
    "throughput": 10.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.822329+00:00",
    "updated_at": "2025-07-19T19:49:16.822329+00:00",
    "provider_model_id_used": "qwen-2.5-72b-instruct",
    "model_name": "Qwen2.5 72B Instruct",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 285,
    "model_id": "qwen-2.5-coder-32b-instruct",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 18,
    "output_cents_per_million_tokens": 18,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 44.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.820492+00:00",
    "updated_at": "2025-07-19T19:49:16.820492+00:00",
    "provider_model_id_used": "qwen-2.5-coder-32b-instruct",
    "model_name": "Qwen2.5-Coder 32B Instruct",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 301,
    "model_id": "qwen3-235b-a22b",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 20,
    "output_cents_per_million_tokens": 60,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 21.74,
    "latency": 1.23,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.855452+00:00",
    "updated_at": "2025-07-19T19:49:16.855452+00:00",
    "provider_model_id_used": "qwen3-235b-a22b",
    "model_name": "Qwen3 235B A22B",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 303,
    "model_id": "qwen3-30b-a3b",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 30,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 82.57,
    "latency": 0.84,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.859780+00:00",
    "updated_at": "2025-07-19T19:49:16.859780+00:00",
    "provider_model_id_used": "qwen3-30b-a3b",
    "model_name": "Qwen3 30B A3B",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 302,
    "model_id": "qwen3-32b",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 30,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 26.95,
    "latency": 1.19,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.857468+00:00",
    "updated_at": "2025-07-19T19:49:16.857468+00:00",
    "provider_model_id_used": "qwen3-32b",
    "model_name": "Qwen3 32B",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 287,
    "model_id": "qwq-32b-preview",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 15,
    "output_cents_per_million_tokens": 60,
    "quantization": null,
    "max_input_tokens": 32768,
    "max_output_tokens": 32768,
    "throughput": 76.04,
    "latency": 0.44,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.825039+00:00",
    "updated_at": "2025-07-19T19:49:16.825039+00:00",
    "provider_model_id_used": "qwq-32b-preview",
    "model_name": "QwQ-32B-Preview",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 288,
    "model_id": "glm-4.6",
    "provider_id": "deepinfra",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 60,
    "output_cents_per_million_tokens": 200,
    "quantization": "fp8",
    "max_input_tokens": 131072,
    "max_output_tokens": 65536,
    "throughput": 85.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": true,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-30T00:00:00.000000+00:00",
    "updated_at": "2025-09-30T00:00:00.000000+00:00",
    "provider_model_id_used": "zai-org/GLM-4.6",
    "model_name": "GLM-4.6",
    "organization_id": "zai-org"
  }
]


================================================
FILE: data/providers/deepinfra/provider.json
================================================
{
  "provider_id": "deepinfra",
  "name": "DeepInfra",
  "website": "https://deepinfra.com/",
  "created_at": "2025-07-19T19:49:16.806529+00:00",
  "updated_at": "2025-07-19T19:49:16.806529+00:00"
}

================================================
FILE: data/providers/deepseek/models.json
================================================
[
  {
    "model_provider_id": 361,
    "model_id": "deepseek-r1",
    "provider_id": "deepseek",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 55,
    "output_cents_per_million_tokens": 219,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 9.0,
    "latency": 0.3,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.991378+00:00",
    "updated_at": "2025-07-19T19:49:16.991378+00:00",
    "provider_model_id_used": "deepseek-r1",
    "model_name": "DeepSeek-R1",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 362,
    "model_id": "deepseek-r1-0528",
    "provider_id": "deepseek",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 55,
    "output_cents_per_million_tokens": 219,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 9.0,
    "latency": 0.3,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.993656+00:00",
    "updated_at": "2025-07-19T19:49:16.993656+00:00",
    "provider_model_id_used": "deepseek-r1-0528",
    "model_name": "DeepSeek-R1-0528",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 359,
    "model_id": "deepseek-v2.5",
    "provider_id": "deepseek",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 14,
    "output_cents_per_million_tokens": 28,
    "quantization": null,
    "max_input_tokens": 8192,
    "max_output_tokens": 8192,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.987664+00:00",
    "updated_at": "2025-07-19T19:49:16.987664+00:00",
    "provider_model_id_used": "deepseek-v2.5",
    "model_name": "DeepSeek-V2.5",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 360,
    "model_id": "deepseek-v3",
    "provider_id": "deepseek",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 27,
    "output_cents_per_million_tokens": 110,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.989355+00:00",
    "updated_at": "2025-07-19T19:49:16.989355+00:00",
    "provider_model_id_used": "deepseek-v3",
    "model_name": "DeepSeek-V3",
    "organization_id": "deepseek"
  }
]

================================================
FILE: data/providers/deepseek/provider.json
================================================
{
  "provider_id": "deepseek",
  "name": "DeepSeek",
  "website": "https://deepseek.com/",
  "created_at": "2025-07-19T19:49:16.986078+00:00",
  "updated_at": "2025-07-19T19:49:16.986078+00:00"
}


================================================
FILE: data/providers/fireworks/models.json
================================================
[
  {
    "model_provider_id": 340,
    "model_id": "deepseek-r1",
    "provider_id": "fireworks",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 800,
    "output_cents_per_million_tokens": 800,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 2.1,
    "latency": 0.3,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.942224+00:00",
    "updated_at": "2025-07-19T19:49:16.942224+00:00",
    "provider_model_id_used": "deepseek-r1",
    "model_name": "DeepSeek-R1",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 331,
    "model_id": "llama-3.1-405b-instruct",
    "provider_id": "fireworks",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 300,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 78.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.923810+00:00",
    "updated_at": "2025-07-19T19:49:16.923810+00:00",
    "provider_model_id_used": "llama-3.1-405b-instruct",
    "model_name": "Llama 3.1 405B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 332,
    "model_id": "llama-3.1-70b-instruct",
    "provider_id": "fireworks",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 89,
    "output_cents_per_million_tokens": 89,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 32.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.926263+00:00",
    "updated_at": "2025-07-19T19:49:16.926263+00:00",
    "provider_model_id_used": "llama-3.1-70b-instruct",
    "model_name": "Llama 3.1 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 333,
    "model_id": "llama-3.1-8b-instruct",
    "provider_id": "fireworks",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 20,
    "output_cents_per_million_tokens": 20,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 292.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.928500+00:00",
    "updated_at": "2025-07-19T19:49:16.928500+00:00",
    "provider_model_id_used": "llama-3.1-8b-instruct",
    "model_name": "Llama 3.1 8B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 335,
    "model_id": "llama-3.2-11b-instruct",
    "provider_id": "fireworks",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 20,
    "output_cents_per_million_tokens": 20,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 125.0,
    "latency": 0.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.932316+00:00",
    "updated_at": "2025-07-19T19:49:16.932316+00:00",
    "provider_model_id_used": "llama-3.2-11b-instruct",
    "model_name": "Llama 3.2 11B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 334,
    "model_id": "llama-3.2-90b-instruct",
    "provider_id": "fireworks",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 89,
    "output_cents_per_million_tokens": 89,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 50.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.930486+00:00",
    "updated_at": "2025-07-19T19:49:16.930486+00:00",
    "provider_model_id_used": "llama-3.2-90b-instruct",
    "model_name": "Llama 3.2 90B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 339,
    "model_id": "llama-3.3-70b-instruct",
    "provider_id": "fireworks",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 89,
    "output_cents_per_million_tokens": 89,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 197.0,
    "latency": 0.65,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.939993+00:00",
    "updated_at": "2025-07-19T19:49:16.939993+00:00",
    "provider_model_id_used": "llama-3.3-70b-instruct",
    "model_name": "Llama 3.3 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 341,
    "model_id": "llama-4-maverick",
    "provider_id": "fireworks",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 22,
    "output_cents_per_million_tokens": 88,
    "quantization": null,
    "max_input_tokens": 1000000,
    "max_output_tokens": 1000000,
    "throughput": 63.03,
    "latency": 0.62,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.944370+00:00",
    "updated_at": "2025-07-19T19:49:16.944370+00:00",
    "provider_model_id_used": "llama-4-maverick",
    "model_name": "Llama 4 Maverick",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 342,
    "model_id": "llama-4-scout",
    "provider_id": "fireworks",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 15,
    "output_cents_per_million_tokens": 60,
    "quantization": null,
    "max_input_tokens": 10000000,
    "max_output_tokens": 10000000,
    "throughput": 116.1,
    "latency": 0.53,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.946725+00:00",
    "updated_at": "2025-07-19T19:49:16.946725+00:00",
    "provider_model_id_used": "llama-4-scout",
    "model_name": "Llama 4 Scout",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 337,
    "model_id": "qwen-2.5-72b-instruct",
    "provider_id": "fireworks",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 89,
    "output_cents_per_million_tokens": 89,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 8192,
    "throughput": 59.0,
    "latency": 0.37,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.936092+00:00",
    "updated_at": "2025-07-19T19:49:16.936092+00:00",
    "provider_model_id_used": "qwen-2.5-72b-instruct",
    "model_name": "Qwen2.5 72B Instruct",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 336,
    "model_id": "qwen-2.5-coder-32b-instruct",
    "provider_id": "fireworks",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 89,
    "output_cents_per_million_tokens": 89,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 110.0,
    "latency": 0.26,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.934183+00:00",
    "updated_at": "2025-07-19T19:49:16.934183+00:00",
    "provider_model_id_used": "qwen-2.5-coder-32b-instruct",
    "model_name": "Qwen2.5-Coder 32B Instruct",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 343,
    "model_id": "qwen3-235b-a22b",
    "provider_id": "fireworks",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 10,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 68.17,
    "latency": 0.78,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.949833+00:00",
    "updated_at": "2025-07-19T19:49:16.949833+00:00",
    "provider_model_id_used": "qwen3-235b-a22b",
    "model_name": "Qwen3 235B A22B",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 344,
    "model_id": "qwen3-30b-a3b",
    "provider_id": "fireworks",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 89,
    "output_cents_per_million_tokens": 89,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 122.4,
    "latency": 0.66,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.951886+00:00",
    "updated_at": "2025-07-19T19:49:16.951886+00:00",
    "provider_model_id_used": "qwen3-30b-a3b",
    "model_name": "Qwen3 30B A3B",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 338,
    "model_id": "qwq-32b-preview",
    "provider_id": "fireworks",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 89,
    "output_cents_per_million_tokens": 89,
    "quantization": null,
    "max_input_tokens": 32768,
    "max_output_tokens": 32768,
    "throughput": 99.15,
    "latency": 0.53,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.937841+00:00",
    "updated_at": "2025-07-19T19:49:16.937841+00:00",
    "provider_model_id_used": "qwq-32b-preview",
    "model_name": "QwQ-32B-Preview",
    "organization_id": "qwen"
  }
]

================================================
FILE: data/providers/fireworks/provider.json
================================================
{
  "provider_id": "fireworks",
  "name": "Fireworks",
  "website": "https://fireworks.ai/",
  "created_at": "2025-07-19T19:49:16.921865+00:00",
  "updated_at": "2025-07-19T19:49:16.921865+00:00"
}

================================================
FILE: data/providers/google/models.json
================================================
[
  {
    "model_provider_id": 318,
    "model_id": "claude-3-5-haiku-20241022",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 80,
    "output_cents_per_million_tokens": 400,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.896052+00:00",
    "updated_at": "2025-07-19T19:49:16.896052+00:00",
    "provider_model_id_used": "claude-3-5-haiku-20241022",
    "model_name": "Claude 3.5 Haiku",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 320,
    "model_id": "claude-3-5-sonnet-20240620",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.900161+00:00",
    "updated_at": "2025-07-19T19:49:16.900161+00:00",
    "provider_model_id_used": "claude-3-5-sonnet-20240620",
    "model_name": "Claude 3.5 Sonnet",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 319,
    "model_id": "claude-3-5-sonnet-20241022",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.898073+00:00",
    "updated_at": "2025-07-19T19:49:16.898073+00:00",
    "provider_model_id_used": "claude-3-5-sonnet-20241022",
    "model_name": "Claude 3.5 Sonnet",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 327,
    "model_id": "claude-3-7-sonnet-20250219",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.914565+00:00",
    "updated_at": "2025-07-19T19:49:16.914565+00:00",
    "provider_model_id_used": "claude-3-7-sonnet-20250219",
    "model_name": "Claude 3.7 Sonnet",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 328,
    "model_id": "claude-3-haiku-20240307",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 25,
    "output_cents_per_million_tokens": 125,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.916491+00:00",
    "updated_at": "2025-07-19T19:49:16.916491+00:00",
    "provider_model_id_used": "claude-3-haiku-20240307",
    "model_name": "Claude 3 Haiku",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 322,
    "model_id": "claude-3-opus-20240229",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1500,
    "output_cents_per_million_tokens": 7500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.903705+00:00",
    "updated_at": "2025-07-19T19:49:16.903705+00:00",
    "provider_model_id_used": "claude-3-opus-20240229",
    "model_name": "Claude 3 Opus",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 321,
    "model_id": "claude-3-sonnet-20240229",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 200000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.902100+00:00",
    "updated_at": "2025-07-19T19:49:16.902100+00:00",
    "provider_model_id_used": "claude-3-sonnet-20240229",
    "model_name": "Claude 3 Sonnet",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 330,
    "model_id": "claude-opus-4-20250514",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1500,
    "output_cents_per_million_tokens": 7500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.920504+00:00",
    "updated_at": "2025-07-19T19:49:16.920504+00:00",
    "provider_model_id_used": "claude-opus-4-20250514",
    "model_name": "Claude Opus 4",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 331,
    "model_id": "claude-opus-4-1-20250805",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1500,
    "output_cents_per_million_tokens": 7500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 32000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-08-05T00:00:00.000000+00:00",
    "updated_at": "2025-08-05T00:00:00.000000+00:00",
    "provider_model_id_used": "claude-opus-4-1-20250805",
    "model_name": "Claude Opus 4.1",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 329,
    "model_id": "claude-sonnet-4-20250514",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.918456+00:00",
    "updated_at": "2025-07-19T19:49:16.918456+00:00",
    "provider_model_id_used": "claude-sonnet-4-20250514",
    "model_name": "Claude Sonnet 4",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 312,
    "model_id": "gemini-1.0-pro",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 50,
    "output_cents_per_million_tokens": 150,
    "quantization": null,
    "max_input_tokens": 32760,
    "max_output_tokens": 8192,
    "throughput": 120.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.882424+00:00",
    "updated_at": "2025-07-19T19:49:16.882424+00:00",
    "provider_model_id_used": "gemini-1.0-pro",
    "model_name": "Gemini 1.0 Pro",
    "organization_id": "google"
  },
  {
    "model_provider_id": 313,
    "model_id": "gemini-1.5-flash",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 15,
    "output_cents_per_million_tokens": 60,
    "quantization": null,
    "max_input_tokens": 1048576,
    "max_output_tokens": 8192,
    "throughput": 150.0,
    "latency": 0.3,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.885387+00:00",
    "updated_at": "2025-07-19T19:49:16.885387+00:00",
    "provider_model_id_used": "gemini-1.5-flash",
    "model_name": "Gemini 1.5 Flash",
    "organization_id": "google"
  },
  {
    "model_provider_id": 314,
    "model_id": "gemini-1.5-flash-8b",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 7,
    "output_cents_per_million_tokens": 30,
    "quantization": null,
    "max_input_tokens": 1048576,
    "max_output_tokens": 8192,
    "throughput": 150.0,
    "latency": 0.3,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.887626+00:00",
    "updated_at": "2025-07-19T19:49:16.887626+00:00",
    "provider_model_id_used": "gemini-1.5-flash-8b",
    "model_name": "Gemini 1.5 Flash 8B",
    "organization_id": "google"
  },
  {
    "model_provider_id": 311,
    "model_id": "gemini-1.5-pro",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 250,
    "output_cents_per_million_tokens": 1000,
    "quantization": null,
    "max_input_tokens": 2097152,
    "max_output_tokens": 8192,
    "throughput": 85.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.880526+00:00",
    "updated_at": "2025-07-19T19:49:16.880526+00:00",
    "provider_model_id_used": "gemini-1.5-pro",
    "model_name": "Gemini 1.5 Pro",
    "organization_id": "google"
  },
  {
    "model_provider_id": 310,
    "model_id": "gemini-2.0-flash",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 40,
    "quantization": null,
    "max_input_tokens": 1048576,
    "max_output_tokens": 8192,
    "throughput": 183.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.878419+00:00",
    "updated_at": "2025-07-19T19:49:16.878419+00:00",
    "provider_model_id_used": "gemini-2.0-flash",
    "model_name": "Gemini 2.0 Flash",
    "organization_id": "google"
  },
  {
    "model_provider_id": 309,
    "model_id": "gemini-2.0-flash-lite",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 7,
    "output_cents_per_million_tokens": 30,
    "quantization": null,
    "max_input_tokens": 1048576,
    "max_output_tokens": 8192,
    "throughput": 85.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.876262+00:00",
    "updated_at": "2025-07-19T19:49:16.876262+00:00",
    "provider_model_id_used": "gemini-2.0-flash-lite",
    "model_name": "Gemini 2.0 Flash-Lite",
    "organization_id": "google"
  },
  {
    "model_provider_id": 306,
    "model_id": "gemini-2.5-flash",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 30,
    "output_cents_per_million_tokens": 250,
    "quantization": null,
    "max_input_tokens": 1048576,
    "max_output_tokens": 65536,
    "throughput": 85.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.868859+00:00",
    "updated_at": "2025-07-19T19:49:16.868859+00:00",
    "provider_model_id_used": "gemini-2.5-flash",
    "model_name": "Gemini 2.5 Flash",
    "organization_id": "google"
  },
  {
    "model_provider_id": 305,
    "model_id": "gemini-2.5-flash-lite",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 40,
    "quantization": null,
    "max_input_tokens": 1048576,
    "max_output_tokens": 65536,
    "throughput": 5.69,
    "latency": 0.44,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.866570+00:00",
    "updated_at": "2025-07-19T19:49:16.866570+00:00",
    "provider_model_id_used": "gemini-2.5-flash-lite",
    "model_name": "Gemini 2.5 Flash-Lite",
    "organization_id": "google"
  },
  {
    "model_provider_id": 307,
    "model_id": "gemini-2.5-pro",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 125,
    "output_cents_per_million_tokens": 1000,
    "quantization": null,
    "max_input_tokens": 1048576,
    "max_output_tokens": 65536,
    "throughput": 85.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.871063+00:00",
    "updated_at": "2025-07-19T19:49:16.871063+00:00",
    "provider_model_id_used": "gemini-2.5-pro",
    "model_name": "Gemini 2.5 Pro",
    "organization_id": "google"
  },
  {
    "model_provider_id": 308,
    "model_id": "gemini-2.5-pro-preview-06-05",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 125,
    "output_cents_per_million_tokens": 1000,
    "quantization": null,
    "max_input_tokens": 1048576,
    "max_output_tokens": 65535,
    "throughput": 85.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.873667+00:00",
    "updated_at": "2025-07-19T19:49:16.873667+00:00",
    "provider_model_id_used": "gemini-2.5-pro-preview-06-05",
    "model_name": "Gemini 2.5 Pro Preview 06-05",
    "organization_id": "google"
  },
  {
    "model_provider_id": 316,
    "model_id": "jamba-1.5-large",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 200,
    "output_cents_per_million_tokens": 800,
    "quantization": null,
    "max_input_tokens": 256000,
    "max_output_tokens": 256000,
    "throughput": 42.0,
    "latency": 0.3,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.891518+00:00",
    "updated_at": "2025-07-19T19:49:16.891518+00:00",
    "provider_model_id_used": "jamba-1.5-large",
    "model_name": "Jamba 1.5 Large",
    "organization_id": "ai21"
  },
  {
    "model_provider_id": 317,
    "model_id": "jamba-1.5-mini",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 20,
    "output_cents_per_million_tokens": 40,
    "quantization": null,
    "max_input_tokens": 256144,
    "max_output_tokens": 256144,
    "throughput": 100.0,
    "latency": 0.3,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.893779+00:00",
    "updated_at": "2025-07-19T19:49:16.893779+00:00",
    "provider_model_id_used": "jamba-1.5-mini",
    "model_name": "Jamba 1.5 Mini",
    "organization_id": "ai21"
  },
  {
    "model_provider_id": 323,
    "model_id": "llama-3.1-405b-instruct",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 500,
    "output_cents_per_million_tokens": 1600,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.905332+00:00",
    "updated_at": "2025-07-19T19:49:16.905332+00:00",
    "provider_model_id_used": "llama-3.1-405b-instruct",
    "model_name": "Llama 3.1 405B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 324,
    "model_id": "mistral-large-2-2407",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 200,
    "output_cents_per_million_tokens": 600,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.907260+00:00",
    "updated_at": "2025-07-19T19:49:16.907260+00:00",
    "provider_model_id_used": "mistral-large-2-2407",
    "model_name": "Mistral Large 2",
    "organization_id": "mistral"
  },
  {
    "model_provider_id": 325,
    "model_id": "mistral-nemo-instruct-2407",
    "provider_id": "google",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 15,
    "output_cents_per_million_tokens": 15,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.909863+00:00",
    "updated_at": "2025-07-19T19:49:16.909863+00:00",
    "provider_model_id_used": "mistral-nemo-instruct-2407",
    "model_name": "Mistral NeMo Instruct",
    "organization_id": "mistral"
  }
]


================================================
FILE: data/providers/google/provider.json
================================================
{
  "provider_id": "google",
  "name": "Google",
  "website": "https://ai.google.dev",
  "created_at": "2025-07-19T19:49:16.864633+00:00",
  "updated_at": "2025-07-19T19:49:16.864633+00:00"
}


================================================
FILE: data/providers/groq/models.json
================================================
[
  {
    "model_provider_id": 345,
    "model_id": "llama-3.1-70b-instruct",
    "provider_id": "groq",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 59,
    "output_cents_per_million_tokens": 78,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 250.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.955618+00:00",
    "updated_at": "2025-07-19T19:49:16.955618+00:00",
    "provider_model_id_used": "llama-3.1-70b-instruct",
    "model_name": "Llama 3.1 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 346,
    "model_id": "llama-3.1-8b-instruct",
    "provider_id": "groq",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 5,
    "output_cents_per_million_tokens": 8,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 750.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.957463+00:00",
    "updated_at": "2025-07-19T19:49:16.957463+00:00",
    "provider_model_id_used": "llama-3.1-8b-instruct",
    "model_name": "Llama 3.1 8B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 347,
    "model_id": "llama-3.2-11b-instruct",
    "provider_id": "groq",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 18,
    "output_cents_per_million_tokens": 18,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.959974+00:00",
    "updated_at": "2025-07-19T19:49:16.959974+00:00",
    "provider_model_id_used": "llama-3.2-11b-instruct",
    "model_name": "Llama 3.2 11B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 348,
    "model_id": "llama-3.3-70b-instruct",
    "provider_id": "groq",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 59,
    "output_cents_per_million_tokens": 790,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 268.0,
    "latency": 0.65,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.962122+00:00",
    "updated_at": "2025-07-19T19:49:16.962122+00:00",
    "provider_model_id_used": "llama-3.3-70b-instruct",
    "model_name": "Llama 3.3 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 349,
    "model_id": "llama-4-maverick",
    "provider_id": "groq",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 20,
    "output_cents_per_million_tokens": 60,
    "quantization": null,
    "max_input_tokens": 1000000,
    "max_output_tokens": 1000000,
    "throughput": 307.3,
    "latency": 0.27,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.963701+00:00",
    "updated_at": "2025-07-19T19:49:16.963701+00:00",
    "provider_model_id_used": "llama-4-maverick",
    "model_name": "Llama 4 Maverick",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 350,
    "model_id": "llama-4-scout",
    "provider_id": "groq",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 11,
    "output_cents_per_million_tokens": 34,
    "quantization": null,
    "max_input_tokens": 10000000,
    "max_output_tokens": 10000000,
    "throughput": 776.1,
    "latency": 1.08,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.965756+00:00",
    "updated_at": "2025-07-19T19:49:16.965756+00:00",
    "provider_model_id_used": "llama-4-scout",
    "model_name": "Llama 4 Scout",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 1231,
    "model_id": "gpt-oss-120b",
    "provider_id": "groq",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 15,
    "output_cents_per_million_tokens": 60,
    "quantization": null,
    "max_input_tokens": 131000,
    "max_output_tokens": 30000,
    "throughput": 500,
    "latency": 0.5,
    "feature_web_search": true,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": true,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-08-05T19:49:16.965756+00:00",
    "updated_at": "2025-08-05T19:49:16.965756+00:00",
    "provider_model_id_used": "gpt-oss-120b",
    "model_name": "OpenAI OSS 120B",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 1232,
    "model_id": "gpt-oss-20b",
    "provider_id": "groq",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 50,
    "quantization": null,
    "max_input_tokens": 131000,
    "max_output_tokens": 30000,
    "throughput": 1000,
    "latency": 0.38,
    "feature_web_search": true,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": true,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-08-05T19:49:16.965756+00:00",
    "updated_at": "2025-08-05T19:49:16.965756+00:00",
    "provider_model_id_used": "gpt-oss-20b",
    "model_name": "OpenAI OSS 20B",
    "organization_id": "openai"
  }
]


================================================
FILE: data/providers/groq/provider.json
================================================
{
  "provider_id": "groq",
  "name": "Groq",
  "website": "https://groq.com/",
  "created_at": "2025-07-19T19:49:16.953587+00:00",
  "updated_at": "2025-07-19T19:49:16.953587+00:00"
}

================================================
FILE: data/providers/hyperbolic/models.json
================================================
[
  {
    "model_provider_id": 276,
    "model_id": "deepseek-v2.5",
    "provider_id": "hyperbolic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 200,
    "output_cents_per_million_tokens": 200,
    "quantization": null,
    "max_input_tokens": 8192,
    "max_output_tokens": 8192,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.801424+00:00",
    "updated_at": "2025-07-19T19:49:16.801424+00:00",
    "provider_model_id_used": "deepseek-v2.5",
    "model_name": "DeepSeek-V2.5",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 272,
    "model_id": "llama-3.1-405b-instruct",
    "provider_id": "hyperbolic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 400,
    "output_cents_per_million_tokens": 400,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 40.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.788610+00:00",
    "updated_at": "2025-07-19T19:49:16.788610+00:00",
    "provider_model_id_used": "llama-3.1-405b-instruct",
    "model_name": "Llama 3.1 405B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 271,
    "model_id": "llama-3.1-70b-instruct",
    "provider_id": "hyperbolic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 40,
    "output_cents_per_million_tokens": 40,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.785874+00:00",
    "updated_at": "2025-07-19T19:49:16.785874+00:00",
    "provider_model_id_used": "llama-3.1-70b-instruct",
    "model_name": "Llama 3.1 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 270,
    "model_id": "llama-3.1-8b-instruct",
    "provider_id": "hyperbolic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 10,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 200.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.783230+00:00",
    "updated_at": "2025-07-19T19:49:16.783230+00:00",
    "provider_model_id_used": "llama-3.1-8b-instruct",
    "model_name": "Llama 3.1 8B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 273,
    "model_id": "llama-3.2-90b-instruct",
    "provider_id": "hyperbolic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 200,
    "output_cents_per_million_tokens": 200,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.791634+00:00",
    "updated_at": "2025-07-19T19:49:16.791634+00:00",
    "provider_model_id_used": "llama-3.2-90b-instruct",
    "model_name": "Llama 3.2 90B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 278,
    "model_id": "llama-3.3-70b-instruct",
    "provider_id": "hyperbolic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 40,
    "output_cents_per_million_tokens": 40,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.65,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.805164+00:00",
    "updated_at": "2025-07-19T19:49:16.805164+00:00",
    "provider_model_id_used": "llama-3.3-70b-instruct",
    "model_name": "Llama 3.3 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 274,
    "model_id": "qwen-2.5-72b-instruct",
    "provider_id": "hyperbolic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 40,
    "output_cents_per_million_tokens": 40,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 8192,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.795011+00:00",
    "updated_at": "2025-07-19T19:49:16.795011+00:00",
    "provider_model_id_used": "qwen-2.5-72b-instruct",
    "model_name": "Qwen2.5 72B Instruct",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 275,
    "model_id": "qwen-2.5-coder-32b-instruct",
    "provider_id": "hyperbolic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 20,
    "output_cents_per_million_tokens": 20,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.798904+00:00",
    "updated_at": "2025-07-19T19:49:16.798904+00:00",
    "provider_model_id_used": "qwen-2.5-coder-32b-instruct",
    "model_name": "Qwen2.5-Coder 32B Instruct",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 277,
    "model_id": "qwq-32b-preview",
    "provider_id": "hyperbolic",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 20,
    "output_cents_per_million_tokens": 20,
    "quantization": null,
    "max_input_tokens": 32768,
    "max_output_tokens": 32768,
    "throughput": 31.9,
    "latency": 1.05,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.803353+00:00",
    "updated_at": "2025-07-19T19:49:16.803353+00:00",
    "provider_model_id_used": "qwq-32b-preview",
    "model_name": "QwQ-32B-Preview",
    "organization_id": "qwen"
  }
]

================================================
FILE: data/providers/hyperbolic/provider.json
================================================
{
  "provider_id": "hyperbolic",
  "name": "Hyperbolic",
  "website": "https://hyperbolic.xyz",
  "created_at": "2025-07-19T19:49:16.780946+00:00",
  "updated_at": "2025-07-19T19:49:16.780946+00:00"
}

================================================
FILE: data/providers/lambda/models.json
================================================
[
  {
    "model_provider_id": 390,
    "model_id": "llama-3.1-405b-instruct",
    "provider_id": "lambda",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 89,
    "output_cents_per_million_tokens": 89,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.054217+00:00",
    "updated_at": "2025-07-19T19:49:17.054217+00:00",
    "provider_model_id_used": "llama-3.1-405b-instruct",
    "model_name": "Llama 3.1 405B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 389,
    "model_id": "llama-3.1-70b-instruct",
    "provider_id": "lambda",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 20,
    "output_cents_per_million_tokens": 20,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.051981+00:00",
    "updated_at": "2025-07-19T19:49:17.051981+00:00",
    "provider_model_id_used": "llama-3.1-70b-instruct",
    "model_name": "Llama 3.1 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 388,
    "model_id": "llama-3.1-8b-instruct",
    "provider_id": "lambda",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 3,
    "output_cents_per_million_tokens": 3,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 42.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.050200+00:00",
    "updated_at": "2025-07-19T19:49:17.050200+00:00",
    "provider_model_id_used": "llama-3.1-8b-instruct",
    "model_name": "Llama 3.1 8B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 391,
    "model_id": "llama-3.3-70b-instruct",
    "provider_id": "lambda",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 20,
    "output_cents_per_million_tokens": 20,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.65,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.056567+00:00",
    "updated_at": "2025-07-19T19:49:17.056567+00:00",
    "provider_model_id_used": "llama-3.3-70b-instruct",
    "model_name": "Llama 3.3 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 393,
    "model_id": "llama-4-maverick",
    "provider_id": "lambda",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 18,
    "output_cents_per_million_tokens": 60,
    "quantization": null,
    "max_input_tokens": 1000000,
    "max_output_tokens": 1000000,
    "throughput": 93.69,
    "latency": 0.65,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.060734+00:00",
    "updated_at": "2025-07-19T19:49:17.060734+00:00",
    "provider_model_id_used": "llama-4-maverick",
    "model_name": "Llama 4 Maverick",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 394,
    "model_id": "llama-4-scout",
    "provider_id": "lambda",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 8,
    "output_cents_per_million_tokens": 30,
    "quantization": null,
    "max_input_tokens": 10000000,
    "max_output_tokens": 10000000,
    "throughput": 139.7,
    "latency": 0.43,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.062783+00:00",
    "updated_at": "2025-07-19T19:49:17.062783+00:00",
    "provider_model_id_used": "llama-4-scout",
    "model_name": "Llama 4 Scout",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 392,
    "model_id": "qwen-2.5-coder-32b-instruct",
    "provider_id": "lambda",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 9,
    "output_cents_per_million_tokens": 9,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.058608+00:00",
    "updated_at": "2025-07-19T19:49:17.058608+00:00",
    "provider_model_id_used": "qwen-2.5-coder-32b-instruct",
    "model_name": "Qwen2.5-Coder 32B Instruct",
    "organization_id": "qwen"
  }
]

================================================
FILE: data/providers/lambda/provider.json
================================================
{
  "provider_id": "lambda",
  "name": "Lambda",
  "website": "https://lambdalabs.com",
  "created_at": "2025-07-19T19:49:17.048564+00:00",
  "updated_at": "2025-07-19T19:49:17.048564+00:00"
}

================================================
FILE: data/providers/mistral/models.json
================================================
[
  {
    "model_provider_id": 408,
    "model_id": "devstral-medium-2507",
    "provider_id": "mistral",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 40,
    "output_cents_per_million_tokens": 200,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 137.1,
    "latency": 0.23,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.098942+00:00",
    "updated_at": "2025-07-19T19:49:17.098942+00:00",
    "provider_model_id_used": "devstral-medium-2507",
    "model_name": "Devstral Medium",
    "organization_id": "mistral"
  },
  {
    "model_provider_id": 409,
    "model_id": "devstral-small-2507",
    "provider_id": "mistral",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 30,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 137.1,
    "latency": 0.23,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.100512+00:00",
    "updated_at": "2025-07-19T19:49:17.100512+00:00",
    "provider_model_id_used": "devstral-small-2507",
    "model_name": "Devstral Small 1.1",
    "organization_id": "mistral"
  },
  {
    "model_provider_id": 415,
    "model_id": "ministral-8b-instruct-2410",
    "provider_id": "mistral",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 10,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 0.1,
    "latency": 0.18,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.113059+00:00",
    "updated_at": "2025-07-19T19:49:17.113059+00:00",
    "provider_model_id_used": "ministral-8b-instruct-2410",
    "model_name": "Ministral 8B Instruct",
    "organization_id": "mistral"
  },
  {
    "model_provider_id": 412,
    "model_id": "mistral-large-2-2407",
    "provider_id": "mistral",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 200,
    "output_cents_per_million_tokens": 600,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 0.1,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.106626+00:00",
    "updated_at": "2025-07-19T19:49:17.106626+00:00",
    "provider_model_id_used": "mistral-large-2-2407",
    "model_name": "Mistral Large 2",
    "organization_id": "mistral"
  },
  {
    "model_provider_id": 417,
    "model_id": "mistral-nemo-instruct-2407",
    "provider_id": "mistral",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 15,
    "output_cents_per_million_tokens": 15,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 0.1,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.116560+00:00",
    "updated_at": "2025-07-19T19:49:17.116560+00:00",
    "provider_model_id_used": "mistral-nemo-instruct-2407",
    "model_name": "Mistral NeMo Instruct",
    "organization_id": "mistral"
  },
  {
    "model_provider_id": 414,
    "model_id": "mistral-small-2409",
    "provider_id": "mistral",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 20,
    "output_cents_per_million_tokens": 60,
    "quantization": null,
    "max_input_tokens": 32768,
    "max_output_tokens": 32768,
    "throughput": 0.1,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.111268+00:00",
    "updated_at": "2025-07-19T19:49:17.111268+00:00",
    "provider_model_id_used": "mistral-small-2409",
    "model_name": "Mistral Small",
    "organization_id": "mistral"
  },
  {
    "model_provider_id": 419,
    "model_id": "mistral-small-24b-instruct-2501",
    "provider_id": "mistral",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 30,
    "quantization": null,
    "max_input_tokens": 32000,
    "max_output_tokens": 32000,
    "throughput": 134.0,
    "latency": 0.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.120575+00:00",
    "updated_at": "2025-07-19T19:49:17.120575+00:00",
    "provider_model_id_used": "mistral-small-24b-instruct-2501",
    "model_name": "Mistral Small 3 24B Instruct",
    "organization_id": "mistral"
  },
  {
    "model_provider_id": 410,
    "model_id": "mistral-small-3.1-24b-base-2503",
    "provider_id": "mistral",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 30,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 137.1,
    "latency": 0.23,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.102773+00:00",
    "updated_at": "2025-07-19T19:49:17.102773+00:00",
    "provider_model_id_used": "mistral-small-3.1-24b-base-2503",
    "model_name": "Mistral Small 3.1 24B Base",
    "organization_id": "mistral"
  },
  {
    "model_provider_id": 416,
    "model_id": "pixtral-12b-2409",
    "provider_id": "mistral",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 15,
    "output_cents_per_million_tokens": 15,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 8192,
    "throughput": 0.1,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.114646+00:00",
    "updated_at": "2025-07-19T19:49:17.114646+00:00",
    "provider_model_id_used": "pixtral-12b-2409",
    "model_name": "Pixtral-12B",
    "organization_id": "mistral"
  },
  {
    "model_provider_id": 413,
    "model_id": "pixtral-large",
    "provider_id": "mistral",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 200,
    "output_cents_per_million_tokens": 600,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 0.1,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.108807+00:00",
    "updated_at": "2025-07-19T19:49:17.108807+00:00",
    "provider_model_id_used": "pixtral-large",
    "model_name": "Pixtral Large",
    "organization_id": "mistral"
  }
]


================================================
FILE: data/providers/mistral/provider.json
================================================
{
  "provider_id": "mistral",
  "name": "Mistral AI",
  "website": "https://mistral.ai",
  "created_at": "2025-07-19T19:49:17.096952+00:00",
  "updated_at": "2025-07-19T19:49:17.096952+00:00"
}


================================================
FILE: data/providers/novita/models.json
================================================
[
  {
    "model_provider_id": 359,
    "model_id": "qwen3-235b-a22b-instruct-2507",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 15,
    "output_cents_per_million_tokens": 80,
    "quantization": "fp8",
    "max_input_tokens": 131072,
    "max_output_tokens": 16384,
    "throughput": null,
    "latency": null,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-14T00:00:00.000000+00:00",
    "updated_at": "2025-09-14T00:00:00.000000+00:00",
    "provider_model_id_used": "qwen/qwen3-235b-a22b-instruct-2507",
    "model_name": "Qwen3-235B-A22B-Instruct-2507",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 360,
    "model_id": "gpt-oss-20b",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 5,
    "output_cents_per_million_tokens": 20,
    "quantization": "bf16",
    "max_input_tokens": 131072,
    "max_output_tokens": 32768,
    "throughput": null,
    "latency": null,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-14T00:00:00.000000+00:00",
    "updated_at": "2025-09-14T00:00:00.000000+00:00",
    "provider_model_id_used": "openai/gpt-oss-20b",
    "model_name": "GPT-OSS-20B",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 364,
    "model_id": "gpt-oss-120b",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 50,
    "quantization": "bf16",
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": null,
    "latency": null,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-14T00:00:00.000000+00:00",
    "updated_at": "2025-09-14T00:00:00.000000+00:00",
    "provider_model_id_used": "openai/gpt-oss-120b",
    "model_name": "GPT-OSS-120B",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 361,
    "model_id": "qwen3-235b-a22b-thinking-2507",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 30,
    "output_cents_per_million_tokens": 300,
    "quantization": "fp8",
    "max_input_tokens": 256000,
    "max_output_tokens": 131072,
    "throughput": null,
    "latency": null,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-14T00:00:00.000000+00:00",
    "updated_at": "2025-09-14T00:00:00.000000+00:00",
    "provider_model_id_used": "qwen/qwen3-235b-a22b-thinking-2507",
    "model_name": "Qwen3-235B-A22B-Thinking-2507",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 362,
    "model_id": "deepseek-v3-0324",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 28,
    "output_cents_per_million_tokens": 114,
    "quantization": "fp8",
    "max_input_tokens": 163840,
    "max_output_tokens": 163840,
    "throughput": null,
    "latency": null,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-14T00:00:00.000000+00:00",
    "updated_at": "2025-09-14T00:00:00.000000+00:00",
    "provider_model_id_used": "deepseek/deepseek-v3-0324",
    "model_name": "DeepSeek-V3-0324",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 363,
    "model_id": "deepseek-v3.1",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 27,
    "output_cents_per_million_tokens": 100,
    "quantization": "fp8",
    "max_input_tokens": 163840,
    "max_output_tokens": 163840,
    "throughput": null,
    "latency": null,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-14T00:00:00.000000+00:00",
    "updated_at": "2025-09-14T00:00:00.000000+00:00",
    "provider_model_id_used": "deepseek/deepseek-v3.1",
    "model_name": "DeepSeek V3.1",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 357,
    "model_id": "deepseek-r1-0528",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 70,
    "output_cents_per_million_tokens": 250,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 37.96,
    "latency": 1.18,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.982118+00:00",
    "updated_at": "2025-07-19T19:49:16.982118+00:00",
    "provider_model_id_used": "deepseek-r1-0528",
    "model_name": "DeepSeek-R1-0528",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 351,
    "model_id": "gemma-3-27b-it",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 11,
    "output_cents_per_million_tokens": 20,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 33.0,
    "latency": 0.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.969199+00:00",
    "updated_at": "2025-07-19T19:49:16.969199+00:00",
    "provider_model_id_used": "gemma-3-27b-it",
    "model_name": "Gemma 3 27B",
    "organization_id": "google"
  },
  {
    "model_provider_id": 358,
    "model_id": "kimi-k2-instruct",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 57,
    "output_cents_per_million_tokens": 230,
    "quantization": "fp8",
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 45.0,
    "latency": 0.95,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.984536+00:00",
    "updated_at": "2025-09-14T00:00:00.000000+00:00",
    "provider_model_id_used": "moonshotai/kimi-k2-instruct",
    "model_name": "Kimi K2 Instruct",
    "organization_id": "moonshotai"
  },
  {
    "model_provider_id": 365,
    "model_id": "kimi-k2-0905",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 60,
    "output_cents_per_million_tokens": 250,
    "quantization": "fp8",
    "max_input_tokens": 262144,
    "max_output_tokens": 262144,
    "throughput": null,
    "latency": null,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-14T00:00:00.000000+00:00",
    "updated_at": "2025-09-14T00:00:00.000000+00:00",
    "provider_model_id_used": "moonshotai/kimi-k2-0905",
    "model_name": "Kimi K2 0905",
    "organization_id": "moonshotai"
  },
  {
    "model_provider_id": 366,
    "model_id": "qwen3-next-80b-a3b-thinking",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 15,
    "output_cents_per_million_tokens": 150,
    "quantization": "bf16",
    "max_input_tokens": 65536,
    "max_output_tokens": 65536,
    "throughput": null,
    "latency": null,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-14T00:00:00.000000+00:00",
    "updated_at": "2025-09-14T00:00:00.000000+00:00",
    "provider_model_id_used": "qwen/qwen3-next-80b-a3b-thinking",
    "model_name": "Qwen3 Next 80B A3B Thinking",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 367,
    "model_id": "qwen3-next-80b-a3b-instruct",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 15,
    "output_cents_per_million_tokens": 150,
    "quantization": "bf16",
    "max_input_tokens": 65536,
    "max_output_tokens": 65536,
    "throughput": null,
    "latency": null,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-14T00:00:00.000000+00:00",
    "updated_at": "2025-09-14T00:00:00.000000+00:00",
    "provider_model_id_used": "qwen/qwen3-next-80b-a3b-instruct",
    "model_name": "Qwen3 Next 80B A3B Instruct",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 352,
    "model_id": "llama-4-maverick",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 17,
    "output_cents_per_million_tokens": 85,
    "quantization": null,
    "max_input_tokens": 1000000,
    "max_output_tokens": 1000000,
    "throughput": 69.42,
    "latency": 0.62,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.970871+00:00",
    "updated_at": "2025-07-19T19:49:16.970871+00:00",
    "provider_model_id_used": "llama-4-maverick",
    "model_name": "Llama 4 Maverick",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 353,
    "model_id": "llama-4-scout",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 50,
    "quantization": null,
    "max_input_tokens": 10000000,
    "max_output_tokens": 10000000,
    "throughput": 69.82,
    "latency": 0.85,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.972719+00:00",
    "updated_at": "2025-07-19T19:49:16.972719+00:00",
    "provider_model_id_used": "llama-4-scout",
    "model_name": "Llama 4 Scout",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 354,
    "model_id": "qwen3-235b-a22b",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 20,
    "output_cents_per_million_tokens": 80,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 38.51,
    "latency": 1.02,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.975233+00:00",
    "updated_at": "2025-07-19T19:49:16.975233+00:00",
    "provider_model_id_used": "qwen3-235b-a22b",
    "model_name": "Qwen3 235B A22B",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 356,
    "model_id": "qwen3-30b-a3b",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 44,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 88.84,
    "latency": 0.73,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.980126+00:00",
    "updated_at": "2025-07-19T19:49:16.980126+00:00",
    "provider_model_id_used": "qwen3-30b-a3b",
    "model_name": "Qwen3 30B A3B",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 355,
    "model_id": "qwen3-32b",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 44,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 32.43,
    "latency": 0.93,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.977464+00:00",
    "updated_at": "2025-07-19T19:49:16.977464+00:00",
    "provider_model_id_used": "qwen3-32b",
    "model_name": "Qwen3 32B",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 368,
    "model_id": "deepseek-v3.2-exp",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 27,
    "output_cents_per_million_tokens": 41,
    "quantization": "fp8",
    "max_input_tokens": 163840,
    "max_output_tokens": 65536,
    "throughput": null,
    "latency": null,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "provider_model_id_used": "deepseek/deepseek-v3.2-exp",
    "model_name": "DeepSeek V3.2 Exp",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 369,
    "model_id": "glm-4.5",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 60,
    "output_cents_per_million_tokens": 220,
    "quantization": "fp8",
    "max_input_tokens": 131072,
    "max_output_tokens": 98304,
    "throughput": null,
    "latency": null,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "provider_model_id_used": "zai-org/glm-4.5",
    "model_name": "GLM-4.5",
    "organization_id": "zai-org"
  },
  {
    "model_provider_id": 370,
    "model_id": "glm-4.5v",
    "provider_id": "novita",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 60,
    "output_cents_per_million_tokens": 220,
    "quantization": "fp8",
    "max_input_tokens": 131072,
    "max_output_tokens": 65536,
    "throughput": null,
    "latency": null,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": true,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "provider_model_id_used": "zai-org/GLM-4.5V",
    "model_name": "GLM-4.5V",
    "organization_id": "zai-org"
  }
]


================================================
FILE: data/providers/novita/provider.json
================================================
{
  "provider_id": "novita",
  "name": "Novita",
  "website": "https://novita.ai/",
  "created_at": "2025-07-19T19:49:16.967182+00:00",
  "updated_at": "2025-07-19T19:49:16.967182+00:00"
}

================================================
FILE: data/providers/openai/models.json
================================================
[
  {
    "model_provider_id": 422,
    "model_id": "gpt-3.5-turbo-0125",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 50,
    "output_cents_per_million_tokens": 150,
    "quantization": null,
    "max_input_tokens": 16385,
    "max_output_tokens": 4096,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.128446+00:00",
    "updated_at": "2025-07-19T19:49:17.128446+00:00",
    "provider_model_id_used": "gpt-3.5-turbo-0125",
    "model_name": "GPT-3.5 Turbo",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 420,
    "model_id": "gpt-4-0613",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 3000,
    "output_cents_per_million_tokens": 6000,
    "quantization": null,
    "max_input_tokens": 32768,
    "max_output_tokens": 32768,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.123888+00:00",
    "updated_at": "2025-07-19T19:49:17.123888+00:00",
    "provider_model_id_used": "gpt-4-0613",
    "model_name": "GPT-4",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 430,
    "model_id": "gpt-4.1-2025-04-14",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 200,
    "output_cents_per_million_tokens": 800,
    "quantization": null,
    "max_input_tokens": 1047576,
    "max_output_tokens": 32768,
    "throughput": 100.0,
    "latency": 10.0,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.150851+00:00",
    "updated_at": "2025-07-19T19:49:17.150851+00:00",
    "provider_model_id_used": "gpt-4.1-2025-04-14",
    "model_name": "GPT-4.1",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 431,
    "model_id": "gpt-4.1-mini-2025-04-14",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 40,
    "output_cents_per_million_tokens": 160,
    "quantization": null,
    "max_input_tokens": 1047576,
    "max_output_tokens": 32768,
    "throughput": 150.0,
    "latency": 5.0,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.152948+00:00",
    "updated_at": "2025-07-19T19:49:17.152948+00:00",
    "provider_model_id_used": "gpt-4.1-mini-2025-04-14",
    "model_name": "GPT-4.1 mini",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 432,
    "model_id": "gpt-4.1-nano-2025-04-14",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 40,
    "quantization": null,
    "max_input_tokens": 1047576,
    "max_output_tokens": 32768,
    "throughput": 200.0,
    "latency": 2.0,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.154798+00:00",
    "updated_at": "2025-07-19T19:49:17.154798+00:00",
    "provider_model_id_used": "gpt-4.1-nano-2025-04-14",
    "model_name": "GPT-4.1 nano",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 429,
    "model_id": "gpt-4.5",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 7500,
    "output_cents_per_million_tokens": 15000,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 4096,
    "throughput": 50.0,
    "latency": 20.0,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.148982+00:00",
    "updated_at": "2025-07-19T19:49:17.148982+00:00",
    "provider_model_id_used": "gpt-4.5",
    "model_name": "GPT-4.5",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 424,
    "model_id": "gpt-4o-2024-05-13",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 250,
    "output_cents_per_million_tokens": 1000,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 4096,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.132398+00:00",
    "updated_at": "2025-07-19T19:49:17.132398+00:00",
    "provider_model_id_used": "gpt-4o-2024-05-13",
    "model_name": "GPT-4o",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 423,
    "model_id": "gpt-4o-2024-08-06",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 250,
    "output_cents_per_million_tokens": 1000,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 16384,
    "throughput": 132.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.130542+00:00",
    "updated_at": "2025-07-19T19:49:17.130542+00:00",
    "provider_model_id_used": "gpt-4o-2024-08-06",
    "model_name": "GPT-4o",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 421,
    "model_id": "gpt-4-turbo-2024-04-09",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1000,
    "output_cents_per_million_tokens": 3000,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 4096,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.126193+00:00",
    "updated_at": "2025-07-19T19:49:17.126193+00:00",
    "provider_model_id_used": "gpt-4-turbo-2024-04-09",
    "model_name": "GPT-4 Turbo",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 426,
    "model_id": "o1-2024-12-17",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1500,
    "output_cents_per_million_tokens": 6000,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 100000,
    "throughput": 66.0,
    "latency": 16.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.136375+00:00",
    "updated_at": "2025-07-19T19:49:17.136375+00:00",
    "provider_model_id_used": "o1-2024-12-17",
    "model_name": "o1",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 427,
    "model_id": "o1-mini",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1200,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 65536,
    "throughput": 115.0,
    "latency": 5.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.137957+00:00",
    "updated_at": "2025-07-19T19:49:17.137957+00:00",
    "provider_model_id_used": "o1-mini",
    "model_name": "o1-mini",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 425,
    "model_id": "o1-preview",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1500,
    "output_cents_per_million_tokens": 6000,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 32768,
    "throughput": 66.0,
    "latency": 16.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.134477+00:00",
    "updated_at": "2025-07-19T19:49:17.134477+00:00",
    "provider_model_id_used": "o1-preview",
    "model_name": "o1-preview",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 433,
    "model_id": "o3-2025-04-16",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 200,
    "output_cents_per_million_tokens": 800,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 100000,
    "throughput": 50.0,
    "latency": 20.0,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.156370+00:00",
    "updated_at": "2025-07-19T19:49:17.156370+00:00",
    "provider_model_id_used": "o3-2025-04-16",
    "model_name": "o3",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 428,
    "model_id": "o3-mini",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 110,
    "output_cents_per_million_tokens": 440,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 100000,
    "throughput": 115.0,
    "latency": 5.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.147026+00:00",
    "updated_at": "2025-07-19T19:49:17.147026+00:00",
    "provider_model_id_used": "o3-mini",
    "model_name": "o3-mini",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 435,
    "model_id": "o3-pro-2025-06-10",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 2000,
    "output_cents_per_million_tokens": 8000,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 100000,
    "throughput": 25.0,
    "latency": 30.0,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.161549+00:00",
    "updated_at": "2025-07-19T19:49:17.161549+00:00",
    "provider_model_id_used": "o3-pro-2025-06-10",
    "model_name": "o3-pro",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 434,
    "model_id": "o4-mini",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 110,
    "output_cents_per_million_tokens": 440,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 100000,
    "throughput": 115.0,
    "latency": 5.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.159618+00:00",
    "updated_at": "2025-07-19T19:49:17.159618+00:00",
    "provider_model_id_used": "o4-mini",
    "model_name": "o4-mini",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 434,
    "model_id": "gpt-oss-120b",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 50,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 115.0,
    "latency": 5.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.159618+00:00",
    "updated_at": "2025-07-19T19:49:17.159618+00:00",
    "provider_model_id_used": "gpt-oss-120b",
    "model_name": "GPT OSS 120B",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 434,
    "model_id": "gpt-oss-20b",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 50,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 115.0,
    "latency": 5.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.159618+00:00",
    "updated_at": "2025-07-19T19:49:17.159618+00:00",
    "provider_model_id_used": "gpt-oss-20b",
    "model_name": "GPT OSS 20B",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 436,
    "model_id": "gpt-5-2025-08-07",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 125,
    "output_cents_per_million_tokens": 1000,
    "quantization": null,
    "max_input_tokens": 400000,
    "max_output_tokens": 128000,
    "throughput": 100.0,
    "latency": 2.0,
    "feature_web_search": true,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": true,
    "feature_batch_inference": true,
    "feature_finetuning": true,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "provider_model_id_used": "gpt-5",
    "model_name": "GPT-5",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 437,
    "model_id": "gpt-5-mini-2025-08-07",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 25,
    "output_cents_per_million_tokens": 200,
    "quantization": null,
    "max_input_tokens": 400000,
    "max_output_tokens": 128000,
    "throughput": 200.0,
    "latency": 1.0,
    "feature_web_search": true,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": true,
    "feature_batch_inference": true,
    "feature_finetuning": true,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "provider_model_id_used": "gpt-5-mini",
    "model_name": "GPT-5 mini",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 438,
    "model_id": "gpt-5-nano-2025-08-07",
    "provider_id": "openai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 5,
    "output_cents_per_million_tokens": 40,
    "quantization": null,
    "max_input_tokens": 400000,
    "max_output_tokens": 128000,
    "throughput": 500.0,
    "latency": 0.3,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": true,
    "feature_batch_inference": true,
    "feature_finetuning": true,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "provider_model_id_used": "gpt-5-nano",
    "model_name": "GPT-5 nano",
    "organization_id": "openai"
  }
]


================================================
FILE: data/providers/openai/provider.json
================================================
{
  "provider_id": "openai",
  "name": "OpenAI",
  "website": "https://openai.com",
  "created_at": "2025-07-19T19:49:17.121876+00:00",
  "updated_at": "2025-07-19T19:49:17.121876+00:00"
}


================================================
FILE: data/providers/replicate/models.json
================================================
[
  {
    "model_provider_id": 396,
    "model_id": "deepseek-vl2",
    "provider_id": "replicate",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 950,
    "output_cents_per_million_tokens": 480000,
    "quantization": null,
    "max_input_tokens": 129280,
    "max_output_tokens": 129280,
    "throughput": 22.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.068077+00:00",
    "updated_at": "2025-07-19T19:49:17.068077+00:00",
    "provider_model_id_used": "deepseek-vl2",
    "model_name": "DeepSeek VL2",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 395,
    "model_id": "llama-3.1-405b-instruct",
    "provider_id": "replicate",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 950,
    "output_cents_per_million_tokens": 950,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 22.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.066199+00:00",
    "updated_at": "2025-07-19T19:49:17.066199+00:00",
    "provider_model_id_used": "llama-3.1-405b-instruct",
    "model_name": "Llama 3.1 405B Instruct",
    "organization_id": "meta"
  }
]

================================================
FILE: data/providers/replicate/provider.json
================================================
{
  "provider_id": "replicate",
  "name": "Replicate",
  "website": "https://replicate.com/",
  "created_at": "2025-07-19T19:49:17.064218+00:00",
  "updated_at": "2025-07-19T19:49:17.064218+00:00"
}

================================================
FILE: data/providers/sambanova/models.json
================================================
[
  {
    "model_provider_id": 240,
    "model_id": "llama-3.1-70b-instruct",
    "provider_id": "sambanova",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 500,
    "output_cents_per_million_tokens": 1000,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 74.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.702554+00:00",
    "updated_at": "2025-07-19T19:49:16.702554+00:00",
    "provider_model_id_used": "llama-3.1-70b-instruct",
    "model_name": "Llama 3.1 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 239,
    "model_id": "llama-3.1-8b-instruct",
    "provider_id": "sambanova",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 20,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 1050.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.699627+00:00",
    "updated_at": "2025-07-19T19:49:16.699627+00:00",
    "provider_model_id_used": "llama-3.1-8b-instruct",
    "model_name": "Llama 3.1 8B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 241,
    "model_id": "llama-3.2-11b-instruct",
    "provider_id": "sambanova",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 15,
    "output_cents_per_million_tokens": 30,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 100.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.705086+00:00",
    "updated_at": "2025-07-19T19:49:16.705086+00:00",
    "provider_model_id_used": "llama-3.2-11b-instruct",
    "model_name": "Llama 3.2 11B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 242,
    "model_id": "llama-3.3-70b-instruct",
    "provider_id": "sambanova",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 60,
    "output_cents_per_million_tokens": 120,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 1096.0,
    "latency": 0.65,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.707534+00:00",
    "updated_at": "2025-07-19T19:49:16.707534+00:00",
    "provider_model_id_used": "llama-3.3-70b-instruct",
    "model_name": "Llama 3.3 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 243,
    "model_id": "llama-4-maverick",
    "provider_id": "sambanova",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 63,
    "output_cents_per_million_tokens": 179,
    "quantization": null,
    "max_input_tokens": 1000000,
    "max_output_tokens": 1000000,
    "throughput": 638.7,
    "latency": 2.04,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.710100+00:00",
    "updated_at": "2025-07-19T19:49:16.710100+00:00",
    "provider_model_id_used": "llama-4-maverick",
    "model_name": "Llama 4 Maverick",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 244,
    "model_id": "qwen3-32b",
    "provider_id": "sambanova",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 40,
    "output_cents_per_million_tokens": 80,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 327.7,
    "latency": 1.08,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.712669+00:00",
    "updated_at": "2025-07-19T19:49:16.712669+00:00",
    "provider_model_id_used": "qwen3-32b",
    "model_name": "Qwen3 32B",
    "organization_id": "qwen"
  }
]

================================================
FILE: data/providers/sambanova/provider.json
================================================
{
  "provider_id": "sambanova",
  "name": "SambaNova",
  "website": "https://sambanova.ai/",
  "created_at": "2025-07-19T19:49:16.697204+00:00",
  "updated_at": "2025-07-19T19:49:16.697204+00:00"
}

================================================
FILE: data/providers/together/models.json
================================================
[
  {
    "model_provider_id": 255,
    "model_id": "deepseek-r1",
    "provider_id": "together",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 700,
    "output_cents_per_million_tokens": 700,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 4.0,
    "latency": 0.6,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.738387+00:00",
    "updated_at": "2025-07-19T19:49:16.738387+00:00",
    "provider_model_id_used": "deepseek-r1",
    "model_name": "DeepSeek-R1",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 245,
    "model_id": "gemma-3n-e4b-it",
    "provider_id": "together",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 2000,
    "output_cents_per_million_tokens": 4000,
    "quantization": null,
    "max_input_tokens": 32000,
    "max_output_tokens": 32000,
    "throughput": 42.09,
    "latency": 0.43,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.716616+00:00",
    "updated_at": "2025-07-19T19:49:16.716616+00:00",
    "provider_model_id_used": "gemma-3n-e4b-it",
    "model_name": "Gemma 3n E4B Instructed",
    "organization_id": "google"
  },
  {
    "model_provider_id": 248,
    "model_id": "llama-3.1-405b-instruct",
    "provider_id": "together",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 350,
    "output_cents_per_million_tokens": 350,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 35.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.722263+00:00",
    "updated_at": "2025-07-19T19:49:16.722263+00:00",
    "provider_model_id_used": "llama-3.1-405b-instruct",
    "model_name": "Llama 3.1 405B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 247,
    "model_id": "llama-3.1-70b-instruct",
    "provider_id": "together",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 89,
    "output_cents_per_million_tokens": 89,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 94.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.720699+00:00",
    "updated_at": "2025-07-19T19:49:16.720699+00:00",
    "provider_model_id_used": "llama-3.1-70b-instruct",
    "model_name": "Llama 3.1 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 246,
    "model_id": "llama-3.1-8b-instruct",
    "provider_id": "together",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 20,
    "output_cents_per_million_tokens": 20,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 131072,
    "throughput": 194.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.718652+00:00",
    "updated_at": "2025-07-19T19:49:16.718652+00:00",
    "provider_model_id_used": "llama-3.1-8b-instruct",
    "model_name": "Llama 3.1 8B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 249,
    "model_id": "llama-3.2-11b-instruct",
    "provider_id": "together",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 18,
    "output_cents_per_million_tokens": 18,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 168.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.724215+00:00",
    "updated_at": "2025-07-19T19:49:16.724215+00:00",
    "provider_model_id_used": "llama-3.2-11b-instruct",
    "model_name": "Llama 3.2 11B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 250,
    "model_id": "llama-3.2-90b-instruct",
    "provider_id": "together",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 120,
    "output_cents_per_million_tokens": 120,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 57.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.726568+00:00",
    "updated_at": "2025-07-19T19:49:16.726568+00:00",
    "provider_model_id_used": "llama-3.2-90b-instruct",
    "model_name": "Llama 3.2 90B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 254,
    "model_id": "llama-3.3-70b-instruct",
    "provider_id": "together",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 88,
    "output_cents_per_million_tokens": 88,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 65.0,
    "latency": 0.65,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.735754+00:00",
    "updated_at": "2025-07-19T19:49:16.735754+00:00",
    "provider_model_id_used": "llama-3.3-70b-instruct",
    "model_name": "Llama 3.3 70B Instruct",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 256,
    "model_id": "llama-4-maverick",
    "provider_id": "together",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 27,
    "output_cents_per_million_tokens": 85,
    "quantization": null,
    "max_input_tokens": 1000000,
    "max_output_tokens": 1000000,
    "throughput": 97.93,
    "latency": 0.2,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.740112+00:00",
    "updated_at": "2025-07-19T19:49:16.740112+00:00",
    "provider_model_id_used": "llama-4-maverick",
    "model_name": "Llama 4 Maverick",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 257,
    "model_id": "llama-4-scout",
    "provider_id": "together",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 18,
    "output_cents_per_million_tokens": 59,
    "quantization": null,
    "max_input_tokens": 10000000,
    "max_output_tokens": 10000000,
    "throughput": 106.9,
    "latency": 0.54,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.742126+00:00",
    "updated_at": "2025-07-19T19:49:16.742126+00:00",
    "provider_model_id_used": "llama-4-scout",
    "model_name": "Llama 4 Scout",
    "organization_id": "meta"
  },
  {
    "model_provider_id": 252,
    "model_id": "qwen-2.5-72b-instruct",
    "provider_id": "together",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 120,
    "output_cents_per_million_tokens": 120,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 8192,
    "throughput": 47.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.731610+00:00",
    "updated_at": "2025-07-19T19:49:16.731610+00:00",
    "provider_model_id_used": "qwen-2.5-72b-instruct",
    "model_name": "Qwen2.5 72B Instruct",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 251,
    "model_id": "qwen-2.5-7b-instruct",
    "provider_id": "together",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 30,
    "output_cents_per_million_tokens": 30,
    "quantization": null,
    "max_input_tokens": 131072,
    "max_output_tokens": 8192,
    "throughput": 138.0,
    "latency": 0.5,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.728846+00:00",
    "updated_at": "2025-07-19T19:49:16.728846+00:00",
    "provider_model_id_used": "qwen-2.5-7b-instruct",
    "model_name": "Qwen2.5 7B Instruct",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 258,
    "model_id": "qwen3-235b-a22b",
    "provider_id": "together",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 20,
    "output_cents_per_million_tokens": 60,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 128000,
    "throughput": 23.74,
    "latency": 0.79,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.746014+00:00",
    "updated_at": "2025-07-19T19:49:16.746014+00:00",
    "provider_model_id_used": "qwen3-235b-a22b",
    "model_name": "Qwen3 235B A22B",
    "organization_id": "qwen"
  },
  {
    "model_provider_id": 253,
    "model_id": "qwq-32b-preview",
    "provider_id": "together",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 120,
    "output_cents_per_million_tokens": 120,
    "quantization": null,
    "max_input_tokens": 32768,
    "max_output_tokens": 32768,
    "throughput": 62.14,
    "latency": 0.74,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.733822+00:00",
    "updated_at": "2025-07-19T19:49:16.733822+00:00",
    "provider_model_id_used": "qwq-32b-preview",
    "model_name": "QwQ-32B-Preview",
    "organization_id": "qwen"
  }
]

================================================
FILE: data/providers/together/provider.json
================================================
{
  "provider_id": "together",
  "name": "Together",
  "website": "https://together.ai/",
  "created_at": "2025-07-19T19:49:16.714534+00:00",
  "updated_at": "2025-07-19T19:49:16.714534+00:00"
}

================================================
FILE: data/providers/xai/models.json
================================================
[
  {
    "model_provider_id": 363,
    "model_id": "grok-2",
    "provider_id": "xai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 200,
    "output_cents_per_million_tokens": 1000,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 8000,
    "throughput": 85.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.997220+00:00",
    "updated_at": "2025-07-19T19:49:16.997220+00:00",
    "provider_model_id_used": "grok-2",
    "model_name": "Grok-2",
    "organization_id": "xai"
  },
  {
    "model_provider_id": 364,
    "model_id": "grok-3",
    "provider_id": "xai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 8000,
    "throughput": 100.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:16.998872+00:00",
    "updated_at": "2025-07-19T19:49:16.998872+00:00",
    "provider_model_id_used": "grok-3",
    "model_name": "Grok-3",
    "organization_id": "xai"
  },
  {
    "model_provider_id": 365,
    "model_id": "grok-3-mini",
    "provider_id": "xai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 30,
    "output_cents_per_million_tokens": 50,
    "quantization": null,
    "max_input_tokens": 128000,
    "max_output_tokens": 8000,
    "throughput": 100.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.000676+00:00",
    "updated_at": "2025-07-19T19:49:17.000676+00:00",
    "provider_model_id_used": "grok-3-mini",
    "model_name": "Grok-3 Mini",
    "organization_id": "xai"
  },
  {
    "model_provider_id": 366,
    "model_id": "grok-4",
    "provider_id": "xai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 256000,
    "max_output_tokens": 8000,
    "throughput": 100.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.002399+00:00",
    "updated_at": "2025-07-19T19:49:17.002399+00:00",
    "provider_model_id_used": "grok-4",
    "model_name": "Grok-4",
    "organization_id": "xai"
  },
  {
    "model_provider_id": 367,
    "model_id": "grok-code-fast-1",
    "provider_id": "xai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 20,
    "output_cents_per_million_tokens": 150,
    "quantization": null,
    "max_input_tokens": 256000,
    "max_output_tokens": 10000,
    "throughput": 76.41,
    "latency": 1.38,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-10-03T00:00:00.000000+00:00",
    "updated_at": "2025-10-03T00:00:00.000000+00:00",
    "provider_model_id_used": "grok-code-fast-1",
    "model_name": "Grok Code Fast 1",
    "organization_id": "xai"
  },
  {
    "model_provider_id": 444,
    "model_id": "grok-4-fast",
    "provider_id": "xai",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 20,
    "output_cents_per_million_tokens": 50,
    "quantization": null,
    "max_input_tokens": 2000000,
    "max_output_tokens": 30000,
    "throughput": 90,
    "latency": null,
    "feature_web_search": true,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": false,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-10-11T00:00:00.000000+00:00",
    "updated_at": "2025-10-11T00:00:00.000000+00:00",
    "provider_model_id_used": "grok-4-fast",
    "model_name": "Grok 4 Fast",
    "organization_id": "xai"
  }
]

================================================
FILE: data/providers/xai/provider.json
================================================
{
  "provider_id": "xai",
  "name": "xAI",
  "website": "https://docs.x.ai",
  "created_at": "2025-07-19T19:49:16.995303+00:00",
  "updated_at": "2025-07-19T19:49:16.995303+00:00"
}


================================================
FILE: data/providers/zeroeval/models.json
================================================
[
  {
    "model_provider_id": 441,
    "model_id": "claude-3-7-sonnet-20250219",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.176639+00:00",
    "updated_at": "2025-07-19T19:49:17.176639+00:00",
    "provider_model_id_used": "claude-3-7-sonnet-20250219",
    "model_name": "Claude 3.7 Sonnet",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 436,
    "model_id": "claude-opus-4-20250514",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1500,
    "output_cents_per_million_tokens": 7500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.165236+00:00",
    "updated_at": "2025-07-19T19:49:17.165236+00:00",
    "provider_model_id_used": "claude-opus-4-20250514",
    "model_name": "Claude Opus 4",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 437,
    "model_id": "claude-opus-4-1-20250805",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 1500,
    "output_cents_per_million_tokens": 7500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 32000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-08-05T00:00:00.000000+00:00",
    "updated_at": "2025-08-05T00:00:00.000000+00:00",
    "provider_model_id_used": "claude-opus-4-1-20250805",
    "model_name": "Claude Opus 4.1",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 438,
    "model_id": "claude-sonnet-4-20250514",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 128000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.170880+00:00",
    "updated_at": "2025-07-19T19:49:17.170880+00:00",
    "provider_model_id_used": "claude-sonnet-4-20250514",
    "model_name": "Claude Sonnet 4",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 442,
    "model_id": "gemini-2.5-flash",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 30,
    "output_cents_per_million_tokens": 250,
    "quantization": null,
    "max_input_tokens": 1048576,
    "max_output_tokens": 65536,
    "throughput": 85.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.179386+00:00",
    "updated_at": "2025-07-19T19:49:17.179386+00:00",
    "provider_model_id_used": "gemini-2.5-flash",
    "model_name": "Gemini 2.5 Flash",
    "organization_id": "google"
  },
  {
    "model_provider_id": 437,
    "model_id": "gemini-2.5-pro",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 125,
    "output_cents_per_million_tokens": 1000,
    "quantization": null,
    "max_input_tokens": 1048576,
    "max_output_tokens": 65536,
    "throughput": 85.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.168497+00:00",
    "updated_at": "2025-07-19T19:49:17.168497+00:00",
    "provider_model_id_used": "gemini-2.5-pro",
    "model_name": "Gemini 2.5 Pro",
    "organization_id": "google"
  },
  {
    "model_provider_id": 440,
    "model_id": "gpt-4.1-mini-2025-04-14",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 40,
    "output_cents_per_million_tokens": 160,
    "quantization": null,
    "max_input_tokens": 1047576,
    "max_output_tokens": 32768,
    "throughput": 150.0,
    "latency": 5.0,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.174218+00:00",
    "updated_at": "2025-07-19T19:49:17.174218+00:00",
    "provider_model_id_used": "gpt-4.1-mini-2025-04-14",
    "model_name": "GPT-4.1 mini",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 439,
    "model_id": "grok-4",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 256000,
    "max_output_tokens": 8000,
    "throughput": 100.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.172505+00:00",
    "updated_at": "2025-07-19T19:49:17.172505+00:00",
    "provider_model_id_used": "grok-4",
    "model_name": "Grok-4",
    "organization_id": "xai"
  },
  {
    "model_provider_id": 1231,
    "model_id": "gpt-oss-120b",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 15,
    "output_cents_per_million_tokens": 60,
    "quantization": null,
    "max_input_tokens": 131000,
    "max_output_tokens": 30000,
    "throughput": 500,
    "latency": 0.5,
    "feature_web_search": true,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": true,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-08-05T19:49:16.965756+00:00",
    "updated_at": "2025-08-05T19:49:16.965756+00:00",
    "provider_model_id_used": "gpt-oss-120b",
    "model_name": "OpenAI OSS 120B",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 1232,
    "model_id": "gpt-oss-20b",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 10,
    "output_cents_per_million_tokens": 50,
    "quantization": null,
    "max_input_tokens": 131000,
    "max_output_tokens": 30000,
    "throughput": 1000,
    "latency": 0.38,
    "feature_web_search": true,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": true,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-08-05T19:49:16.965756+00:00",
    "updated_at": "2025-08-05T19:49:16.965756+00:00",
    "provider_model_id_used": "gpt-oss-20b",
    "model_name": "OpenAI OSS 20B",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 1233,
    "model_id": "gpt-5-2025-08-07",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 125,
    "output_cents_per_million_tokens": 1000,
    "quantization": null,
    "max_input_tokens": 400000,
    "max_output_tokens": 128000,
    "throughput": 100.0,
    "latency": 2.0,
    "feature_web_search": true,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": true,
    "feature_batch_inference": true,
    "feature_finetuning": true,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "provider_model_id_used": "gpt-5",
    "model_name": "GPT-5",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 1234,
    "model_id": "gpt-5-mini-2025-08-07",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 25,
    "output_cents_per_million_tokens": 200,
    "quantization": null,
    "max_input_tokens": 400000,
    "max_output_tokens": 128000,
    "throughput": 200.0,
    "latency": 1.0,
    "feature_web_search": true,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": true,
    "feature_batch_inference": true,
    "feature_finetuning": true,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "provider_model_id_used": "gpt-5-mini",
    "model_name": "GPT-5 mini",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 1235,
    "model_id": "gpt-5-nano-2025-08-07",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 5,
    "output_cents_per_million_tokens": 40,
    "quantization": null,
    "max_input_tokens": 400000,
    "max_output_tokens": 128000,
    "throughput": 500.0,
    "latency": 0.3,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": true,
    "feature_batch_inference": true,
    "feature_finetuning": true,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-24T12:00:00.000000+00:00",
    "updated_at": "2025-07-24T12:00:00.000000+00:00",
    "provider_model_id_used": "gpt-5-nano",
    "model_name": "GPT-5 nano",
    "organization_id": "openai"
  },
  {
    "model_provider_id": 1236,
    "model_id": "deepseek-v3.2-exp",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 27,
    "output_cents_per_million_tokens": 41,
    "quantization": "fp8",
    "max_input_tokens": 163840,
    "max_output_tokens": 65536,
    "throughput": 100.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "provider_model_id_used": "deepseek/deepseek-v3.2-exp",
    "model_name": "DeepSeek V3.2 Exp",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 1237,
    "model_id": "glm-4.5",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 60,
    "output_cents_per_million_tokens": 220,
    "quantization": "fp8",
    "max_input_tokens": 131072,
    "max_output_tokens": 98304,
    "throughput": 85.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "provider_model_id_used": "zai-org/glm-4.5",
    "model_name": "GLM-4.5",
    "organization_id": "zai-org"
  },
  {
    "model_provider_id": 1238,
    "model_id": "glm-4.5v",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 60,
    "output_cents_per_million_tokens": 220,
    "quantization": "fp8",
    "max_input_tokens": 131072,
    "max_output_tokens": 65536,
    "throughput": 85.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": true,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "provider_model_id_used": "zai-org/GLM-4.5V",
    "model_name": "GLM-4.5V",
    "organization_id": "zai-org"
  },
  {
    "model_provider_id": 1239,
    "model_id": "kimi-k2-0905",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 60,
    "output_cents_per_million_tokens": 250,
    "quantization": "fp8",
    "max_input_tokens": 262144,
    "max_output_tokens": 262144,
    "throughput": null,
    "latency": null,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "provider_model_id_used": "moonshotai/kimi-k2-0905",
    "model_name": "Kimi K2 0905",
    "organization_id": "moonshotai"
  },
  {
    "model_provider_id": 1241,
    "model_id": "deepseek-r1",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 600,
    "quantization": null,
    "max_input_tokens": 65536,
    "max_output_tokens": 65536,
    "throughput": 189.0,
    "latency": 0.067,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": false,
    "input_modality_audio": false,
    "input_modality_video": false,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-29T00:00:00.000000+00:00",
    "updated_at": "2025-09-29T00:00:00.000000+00:00",
    "provider_model_id_used": "deepseek-r1",
    "model_name": "DeepSeek R1 671B",
    "organization_id": "deepseek"
  },
  {
    "model_provider_id": 1242,
    "model_id": "claude-sonnet-4-5-20250929",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 300,
    "output_cents_per_million_tokens": 1500,
    "quantization": null,
    "max_input_tokens": 200000,
    "max_output_tokens": 64000,
    "throughput": 42.0,
    "latency": 0.4,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": true,
    "input_modality_video": true,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-07-19T19:49:17.084616+00:00",
    "updated_at": "2025-07-19T19:49:17.084616+00:00",
    "provider_model_id_used": "claude-sonnet-4-5-20250929",
    "model_name": "Claude Sonnet 4.5",
    "organization_id": "anthropic"
  },
  {
    "model_provider_id": 1243,
    "model_id": "glm-4.6",
    "provider_id": "zeroeval",
    "deprecated_at": null,
    "input_cents_per_million_tokens": 60,
    "output_cents_per_million_tokens": 200,
    "quantization": "fp8",
    "max_input_tokens": 131072,
    "max_output_tokens": 65536,
    "throughput": 85.0,
    "latency": 0.7,
    "feature_web_search": false,
    "feature_function_calling": true,
    "feature_structured_output": true,
    "feature_code_execution": false,
    "feature_batch_inference": true,
    "feature_finetuning": false,
    "input_modality_text": true,
    "input_modality_image": true,
    "input_modality_audio": false,
    "input_modality_video": true,
    "output_modality_text": true,
    "output_modality_image": false,
    "output_modality_audio": false,
    "output_modality_video": false,
    "created_at": "2025-09-30T00:00:00.000000+00:00",
    "updated_at": "2025-09-30T00:00:00.000000+00:00",
    "provider_model_id_used": "zai-org/GLM-4.6",
    "model_name": "GLM-4.6",
    "organization_id": "zai-org"
  }
]


================================================
FILE: data/providers/zeroeval/provider.json
================================================
{
  "provider_id": "zeroeval",
  "name": "ZeroEval",
  "website": "https://zeroeval.com",
  "created_at": "2025-07-15T06:36:02.543462+00:00",
  "updated_at": "2025-07-15T06:36:02.543462+00:00"
}

================================================
FILE: package.json
================================================
{
  "scripts": {
    "validate-schemas": "node scripts/validate-schemas.js"
  },
  "devDependencies": {
    "glob": "^10.4.5",
    "tv4": "^1.3.0"
  }
}


================================================
FILE: schemas/README.md
================================================
# JSON Schemas for LLM Stats Data

This directory contains JSON Schema definitions for all data types used in the LLM Stats project. These schemas define the structure, types, and validation rules for data stored in the hierarchical file system under `data/`.

## Schema Files

### Core Entity Schemas

- **`organization.schema.json`** - Schema for AI/ML organizations (e.g., OpenAI, Anthropic)
- **`model.schema.json`** - Schema for model metadata
- **`license.schema.json`** - Schema for software licenses governing model usage
- **`benchmark.schema.json`** - Schema for evaluation benchmark definitions
- **`provider.schema.json`** - Schema for model inference providers (e.g., AWS Bedrock, Google Vertex)

### Relationship Schemas

- **`benchmark-results.schema.json`** - Schema for model performance scores on benchmarks
- **`provider-models.schema.json`** - Schema for provider-specific model configurations and pricing

## Data Structure

The schemas correspond to data organized hierarchically:

```
data/
├── organizations/
│   └── [org_id]/
│       ├── organization.json    # Validates against organization.schema.json
│       └── models/
│           └── [model_id]/
│               ├── model.json       # Validates against model.schema.json
│               └── benchmarks.json  # Array; each item validates against benchmark-results.schema.json
├── providers/
│   └── [provider_id]/
│       ├── provider.json        # Validates against provider.schema.json
│       └── models.json          # Array validating against provider-models.schema.json
├── licenses/
│   └── [license_id].json        # Validates against license.schema.json
└── benchmarks/
    └── [benchmark_id].json      # Validates against benchmark.schema.json
```

## Usage

These schemas can be used for:

1. **Data Validation** - Ensure all data files conform to expected structure
2. **Documentation** - Understand what fields are available and their meanings
3. **Code Generation** - Generate TypeScript interfaces or other language types
4. **API Contracts** - Define expected request/response formats

## Validation Example

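To validate every data file at once, run `npm run validate-schemas` from the repository root. To validate a single data file against its schema using Python (requires the `jsonschema` package):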

```python
import json
import jsonschema

# Load schema
with open('schemas/model.schema.json') as f:
    schema = json.load(f)

# Load data
with open('data/organizations/openai/models/gpt-4/model.json') as f:
    data = json.load(f)

# Validate
jsonschema.validate(instance=data, schema=schema)
```

## Schema Features

All schemas use JSON Schema Draft 7 and include:

- **Descriptions** - Every field has a human-readable description
- **Types** - Strict type definitions with null handling
- **Patterns** - Regular expressions for ID formats
- **Examples** - Real-world examples for clarity
- **Enums** - Restricted value sets where applicable
- **Format Validators** - For dates, URIs, etc.
- **Required Fields** - Clearly defined required vs optional

## Contributing

When adding new fields or modifying schemas:

1. Update the relevant schema file
2. Add clear descriptions and examples
3. Consider backward compatibility
4. Update this README if adding new schemas
5. Validate existing data against updated schemas

## Schema Versioning

Currently, all schemas target JSON Schema Draft 7. Future versions may adopt newer drafts as tooling support improves.


================================================
FILE: schemas/benchmark-results.schema.json
================================================
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "ModelBenchmark",
  "description": "Schema for model performance scores on benchmarks",
  "type": "object",
  "properties": {
    "model_benchmark_id": {
      "type": "integer",
      "description": "Unique identifier for this model-benchmark result",
      "minimum": 1
    },
    "benchmark_id": {
      "type": "string",
      "description": "ID of the benchmark"
    },
    "model_id": {
      "type": "string",
      "description": "ID of the model"
    },
    "score": {
      "type": "number",
      "description": "Raw score achieved on the benchmark",
      "minimum": 0
    },
    "normalized_score": {
      "type": ["number", "null"],
      "description": "Score normalized to 0-1 range for cross-benchmark comparison",
      "minimum": 0,
      "maximum": 1
    },
    "is_self_reported": {
      "type": "boolean",
      "description": "Whether the score was self-reported by the model creator",
      "default": true
    },
    "self_reported_source_link": {
      "type": ["string", "null"],
      "format": "uri",
      "description": "URL to the source of self-reported scores"
    },
    "verified_by_llmstats": {
      "type": "boolean",
      "description": "Whether the score has been independently verified by llm-stats",
      "default": false
    },
    "analysis_method": {
      "type": ["string", "null"],
      "description": "Method used for evaluation (e.g., '0-shot', '5-shot', 'CoT')",
      "examples": [
        "0-shot",
        "5-shot",
        "few-shot",
        "chain-of-thought",
        "zero-shot CoT"
      ]
    },
    "verification_provider_id": {
      "type": ["string", "null"],
      "description": "Provider used for independent verification"
    },
    "verification_hardware": {
      "type": ["string", "null"],
      "description": "Hardware used for verification",
      "examples": ["H100 on Modal", "A100 on AWS", "4xA100 on GCP"]
    },
    "verification_date": {
      "type": ["string", "null"],
      "format": "date",
      "description": "Date when the score was independently verified"
    },
    "verification_notes": {
      "type": ["string", "null"],
      "description": "Additional notes about the verification process"
    },
    "created_at": {
      "type": "string",
      "format": "date-time",
      "description": "Timestamp when the record was created"
    },
    "updated_at": {
      "type": "string",
      "format": "date-time",
      "description": "Timestamp when the record was last updated"
    },
    "benchmark_name": {
      "type": "string",
      "description": "Display name of the benchmark (denormalized for convenience)"
    }
  },
  "required": [
    "model_benchmark_id",
    "benchmark_id",
    "model_id",
    "score",
    "is_self_reported",
    "verified_by_llmstats",
    "created_at",
    "updated_at",
    "benchmark_name"
  ],
  "additionalProperties": false
}


================================================
FILE: schemas/benchmark.schema.json
================================================
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Benchmark",
  "description": "Schema for AI/ML evaluation benchmark definitions",
  "type": "object",
  "properties": {
    "benchmark_id": {
      "type": "string",
      "description": "Unique identifier for the benchmark",
      "examples": [
        "mmlu",
        "humaneval",
        "arc-c",
        "gsm8k",
        "mbpp-pass@1",
        "humanity's-last-exam"
      ]
    },
    "name": {
      "type": "string",
      "description": "Display name of the benchmark",
      "examples": ["MMLU", "HumanEval", "ARC-Challenge", "GSM8K"]
    },
    "parent_benchmark_id": {
      "type": ["string", "null"],
      "description": "ID of parent benchmark if this is a subset or variant"
    },
    "categories": {
      "type": "array",
      "description": "Array of categories that this benchmark belongs to",
      "items": {
        "type": "string",
        "enum": [
          "general",
          "code",
          "math",
          "reasoning",
          "language",
          "multimodal",
          "safety",
          "long_context",
          "roleplay",
          "agents",
          "factuality",
          "vision",
          "audio",
          "video",
          "text-to-image",
          "image-to-text",
          "text-to-speech",
          "speech-to-text",
          "text-to-video",
          "video-to-text",
          "legal",
          "healthcare",
          "finance",
          "chemistry",
          "economics",
          "coding",
          "creativity",
          "psychology",
          "games",
          "communication",
          "physics",
          "spatial_reasoning",
          "summarization",
          "frontend_development",
          "writing",
          "search"
        ]
      },
      "minItems": 1,
      "uniqueItems": true,
      "examples": [
        ["general"],
        ["code", "reasoning"],
        ["math", "reasoning"],
        ["vision", "multimodal"]
      ]
    },
    "modality": {
      "type": "string",
      "description": "Primary modality of the benchmark",
      "enum": ["text", "image", "audio", "video", "multimodal"]
    },
    "multilingual": {
      "type": "boolean",
      "description": "Whether the benchmark tests multiple languages",
      "default": false
    },
    "max_score": {
      "type": "number",
      "description": "Maximum possible score on the benchmark",
      "minimum": 0,
      "default": 1.0,
      "examples": [1.0, 100.0]
    },
    "language": {
      "type": "string",
      "description": "Primary language of the benchmark (ISO 639-1 code)",
      "default": "en",
      "examples": ["en", "zh", "es", "fr"]
    },
    "description": {
      "type": ["string", "null"],
      "description": "Detailed description of what the benchmark measures"
    },
    "paper_link": {
      "type": ["string", "null"],
      "format": "uri",
      "description": "URL to the research paper introducing the benchmark"
    },
    "implementation_link": {
      "type": ["string", "null"],
      "format": "uri",
      "description": "URL to the official implementation or dataset"
    },
    "verified": {
      "type": "boolean",
      "description": "Whether the benchmark has been verified by llm-stats maintainers",
      "default": false
    },
    "created_at": {
      "type": "string",
      "format": "date-time",
      "description": "Timestamp when the record was created"
    },
    "updated_at": {
      "type": "string",
      "format": "date-time",
      "description": "Timestamp when the record was last updated"
    }
  },
  "required": [
    "benchmark_id",
    "name",
    "categories",
    "modality",
    "multilingual",
    "max_score",
    "language",
    "verified",
    "created_at",
    "updated_at"
  ],
  "additionalProperties": false
}


================================================
FILE: schemas/integrity-validator.js
================================================
const fs = require("fs");
const path = require("path");
const glob = require("glob");

/**
 * Cross-file integrity checks for the data directory.
 *
 * Loads every organization, model, benchmark, license and provider record,
 * then verifies ID/name uniqueness, that references between records resolve,
 * and reports unused ("orphaned") records. Problems are accumulated in
 * `errors` (fail the run) and `warnings` (informational only).
 */
class IntegrityValidator {
  /**
   * @param {string} [dataDir] - Root of the data tree. Defaults to the
   *   `data` directory that sits next to this script's parent folder.
   */
  constructor(dataDir) {
    this.dataDir = dataDir || path.join(__dirname, "..", "data");
    this.errors = [];
    this.warnings = [];

    // Collections to store all entities, keyed by their ID
    this.models = new Map();
    this.benchmarks = new Map();
    this.organizations = new Map();
    this.licenses = new Map();
    this.providers = new Map();

    // Maps to check for duplicates
    // Note: Model names can be duplicated (e.g., different versions), only IDs must be unique
    this.benchmarkNames = new Map();

    // Parsed benchmarks.json files; filled lazily by loadBenchmarkResults()
    this.benchmarkResults = null;
  }

  /**
   * Read and parse a JSON file.
   *
   * @param {string} filePath - File to read.
   * @returns {*} The parsed value, or `null` when reading/parsing failed
   *   (in which case an entry is appended to `errors`).
   */
  loadJSON(filePath) {
    try {
      const content = fs.readFileSync(filePath, "utf8");
      return JSON.parse(content);
    } catch (error) {
      this.errors.push(`Failed to load ${filePath}: ${error.message}`);
      return null;
    }
  }

  /**
   * Expand a glob pattern given relative to the data directory.
   *
   * @param {...string} segments - Pattern segments appended to `dataDir`.
   * @returns {string[]} Matching file paths as returned by glob.
   */
  findFiles(...segments) {
    // glob treats "\" as an escape character, so patterns must use "/" even
    // on Windows, where path.join produces backslashes. No-op on POSIX.
    const pattern = path
      .join(this.dataDir, ...segments)
      .split(path.sep)
      .join("/");
    return glob.sync(pattern);
  }

  /**
   * Load every per-model benchmarks.json exactly once and cache the result,
   * so a broken file is reported a single time no matter how many checks
   * consume the data.
   *
   * @returns {{file: string, results: Array}[]} One entry per file whose
   *   content parsed to an array; other files are skipped.
   */
  loadBenchmarkResults() {
    if (this.benchmarkResults === null) {
      this.benchmarkResults = [];
      const files = this.findFiles("organizations/*/models/*/benchmarks.json");
      for (const file of files) {
        const results = this.loadJSON(file);
        if (results && Array.isArray(results)) {
          this.benchmarkResults.push({ file, results });
        }
      }
    }
    return this.benchmarkResults;
  }

  /**
   * Record a duplicate-ID error naming both offending files.
   *
   * @param {string} kind - Entity kind used in the message ("model", "benchmark").
   * @param {string} id - The duplicated ID.
   * @param {string} firstFile - File of the record already registered under `id`.
   * @param {string} duplicateFile - File that reuses `id`.
   */
  reportDuplicateId(kind, id, firstFile, duplicateFile) {
    this.errors.push(
      `❌ Duplicate ${kind} ID "${id}" found:\n` +
        `   - First occurrence: ${path.relative(this.dataDir, firstFile)}\n` +
        `   - Duplicate found: ${path.relative(this.dataDir, duplicateFile)}`
    );
  }

  /**
   * Load all entity files into the in-memory maps, reporting duplicate
   * model and benchmark IDs as they are encountered.
   *
   * @returns {Promise<void>}
   */
  async loadAllData() {
    console.log("\n📂 Loading all data files...\n");

    // Start each load with a fresh benchmark-results cache
    this.benchmarkResults = null;

    // Load organizations
    for (const file of this.findFiles("organizations/*/organization.json")) {
      const data = this.loadJSON(file);
      if (data) {
        this.organizations.set(data.organization_id, data);
      }
    }
    console.log(`✅ Loaded ${this.organizations.size} organizations`);

    // Load models
    for (const file of this.findFiles("organizations/*/models/*/model.json")) {
      const data = this.loadJSON(file);
      if (data) {
        // Check for duplicate model IDs
        if (this.models.has(data.model_id)) {
          const existing = this.models.get(data.model_id);
          this.reportDuplicateId("model", data.model_id, existing.file, file);
        }
        this.models.set(data.model_id, { ...data, file });
      }
    }
    console.log(`✅ Loaded ${this.models.size} models`);

    // Load benchmarks
    for (const file of this.findFiles("benchmarks/*.json")) {
      const data = this.loadJSON(file);
      if (data) {
        // Check for duplicate benchmark IDs
        if (this.benchmarks.has(data.benchmark_id)) {
          const existing = this.benchmarks.get(data.benchmark_id);
          this.reportDuplicateId(
            "benchmark",
            data.benchmark_id,
            existing.file,
            file
          );
        }
        this.benchmarks.set(data.benchmark_id, { ...data, file });

        // Group benchmarks by display name; checkDuplicates() reports groups > 1
        if (this.benchmarkNames.has(data.name)) {
          this.benchmarkNames
            .get(data.name)
            .push({ id: data.benchmark_id, file });
        } else {
          this.benchmarkNames.set(data.name, [{ id: data.benchmark_id, file }]);
        }
      }
    }
    console.log(`✅ Loaded ${this.benchmarks.size} benchmarks`);

    // Load licenses
    for (const file of this.findFiles("licenses/*.json")) {
      const data = this.loadJSON(file);
      if (data) {
        this.licenses.set(data.license_id, data);
      }
    }
    console.log(`✅ Loaded ${this.licenses.size} licenses`);

    // Load providers
    for (const file of this.findFiles("providers/*/provider.json")) {
      const data = this.loadJSON(file);
      if (data) {
        this.providers.set(data.provider_id, data);
      }
    }
    console.log(`✅ Loaded ${this.providers.size} providers`);
  }

  /**
   * Report benchmark display names shared by more than one benchmark.
   * Model names are intentionally not checked (see note at the end).
   */
  checkDuplicates() {
    console.log("\n🔍 Checking for duplicate names...\n");

    let duplicatesFound = false;

    // Check duplicate benchmark names (benchmark names should be unique)
    for (const [name, instances] of this.benchmarkNames.entries()) {
      if (instances.length > 1) {
        duplicatesFound = true;
        this.errors.push(
          `❌ Duplicate benchmark name "${name}" found in ${instances.length} benchmarks:\n` +
            instances
              .map(
                (i) => `   - ${i.id} in ${path.relative(this.dataDir, i.file)}`
              )
              .join("\n")
        );
      }
    }

    if (!duplicatesFound) {
      console.log("✅ No duplicate benchmark names found");
    }

    // Note: Model names can be duplicated (e.g., different versions of the same model)
    // IDs are checked during loading and must be unique
  }

  /**
   * Verify that every ID reference between records points at a loaded record.
   * Missing organizations, licenses, base models, models, benchmarks,
   * providers and parent benchmarks are errors; missing model families and
   * verification providers are warnings.
   */
  checkReferences() {
    console.log("\n🔗 Checking references...\n");

    // Remember the totals so the success message reflects only this check,
    // not unrelated problems recorded earlier (e.g. while loading).
    const errorsBefore = this.errors.length;
    const warningsBefore = this.warnings.length;

    // Check model references
    for (const [modelId, model] of this.models.entries()) {
      const relPath = path.relative(this.dataDir, model.file);

      // Check organization reference
      if (
        model.organization_id &&
        !this.organizations.has(model.organization_id)
      ) {
        this.errors.push(
          `❌ Model "${modelId}" references non-existent organization "${model.organization_id}"\n` +
            `   in ${relPath}`
        );
      }

      // Check license reference
      if (model.license_id && !this.licenses.has(model.license_id)) {
        this.errors.push(
          `❌ Model "${modelId}" references non-existent license "${model.license_id}"\n` +
            `   in ${relPath}`
        );
      }

      // Check fine-tuned from reference
      if (
        model.fine_tuned_from_model_id &&
        !this.models.has(model.fine_tuned_from_model_id)
      ) {
        this.errors.push(
          `❌ Model "${modelId}" references non-existent base model "${model.fine_tuned_from_model_id}"\n` +
            `   in ${relPath}`
        );
      }

      // Check model family reference
      if (model.model_family_id && !this.models.has(model.model_family_id)) {
        this.warnings.push(
          `⚠️  Model "${modelId}" references model family "${model.model_family_id}" which doesn't exist as a model\n` +
            `   in ${relPath}`
        );
      }
    }

    // Check benchmark results references
    for (const { file, results } of this.loadBenchmarkResults()) {
      const relPath = path.relative(this.dataDir, file);

      for (let i = 0; i < results.length; i++) {
        const result = results[i];

        // Check model_id reference
        if (result.model_id && !this.models.has(result.model_id)) {
          this.errors.push(
            `❌ Benchmark result [${i}] references non-existent model "${result.model_id}"\n` +
              `   in ${relPath}`
          );
        }

        // Check benchmark_id reference
        if (result.benchmark_id && !this.benchmarks.has(result.benchmark_id)) {
          this.errors.push(
            `❌ Benchmark result [${i}] references non-existent benchmark "${result.benchmark_id}"\n` +
              `   in ${relPath}`
          );
        }

        // Check verification_provider_id reference
        if (
          result.verification_provider_id &&
          !this.providers.has(result.verification_provider_id)
        ) {
          this.warnings.push(
            `⚠️  Benchmark result [${i}] references non-existent verification provider "${result.verification_provider_id}"\n` +
              `   in ${relPath}`
          );
        }
      }
    }

    // Check provider models references
    for (const file of this.findFiles("providers/*/models.json")) {
      const models = this.loadJSON(file);
      if (models && Array.isArray(models)) {
        const relPath = path.relative(this.dataDir, file);

        for (let i = 0; i < models.length; i++) {
          const providerModel = models[i];

          // Check model_id reference
          if (
            providerModel.model_id &&
            !this.models.has(providerModel.model_id)
          ) {
            this.errors.push(
              `❌ Provider model [${i}] references non-existent model "${providerModel.model_id}"\n` +
                `   in ${relPath}`
            );
          }

          // Check provider_id reference
          if (
            providerModel.provider_id &&
            !this.providers.has(providerModel.provider_id)
          ) {
            this.errors.push(
              `❌ Provider model [${i}] references non-existent provider "${providerModel.provider_id}"\n` +
                `   in ${relPath}`
            );
          }
        }
      }
    }

    // Check benchmark parent references
    for (const [benchmarkId, benchmark] of this.benchmarks.entries()) {
      if (
        benchmark.parent_benchmark_id &&
        !this.benchmarks.has(benchmark.parent_benchmark_id)
      ) {
        const relPath = path.relative(this.dataDir, benchmark.file);
        this.errors.push(
          `❌ Benchmark "${benchmarkId}" references non-existent parent benchmark "${benchmark.parent_benchmark_id}"\n` +
            `   in ${relPath}`
        );
      }
    }

    if (
      this.errors.length === errorsBefore &&
      this.warnings.length === warningsBefore
    ) {
      console.log("✅ All references are valid");
    }
  }

  /**
   * Warn (with counts only) about models that have no benchmark results,
   * benchmarks no result refers to, and licenses no model refers to.
   */
  checkOrphans() {
    console.log("\n👻 Checking for orphaned data...\n");

    // One pass over the benchmark results collects both ID sets
    const modelsWithBenchmarks = new Set();
    const usedBenchmarks = new Set();
    for (const { results } of this.loadBenchmarkResults()) {
      for (const result of results) {
        modelsWithBenchmarks.add(result.model_id);
        usedBenchmarks.add(result.benchmark_id);
      }
    }

    // Check for models without benchmark results
    let modelsWithoutBenchmarks = 0;
    for (const modelId of this.models.keys()) {
      if (!modelsWithBenchmarks.has(modelId)) {
        modelsWithoutBenchmarks++;
      }
    }

    if (modelsWithoutBenchmarks > 0) {
      this.warnings.push(
        `⚠️  ${modelsWithoutBenchmarks} models have no benchmark results`
      );
    }

    // Check for unused benchmarks
    let unusedBenchmarks = 0;
    for (const benchmarkId of this.benchmarks.keys()) {
      if (!usedBenchmarks.has(benchmarkId)) {
        unusedBenchmarks++;
      }
    }

    if (unusedBenchmarks > 0) {
      this.warnings.push(
        `⚠️  ${unusedBenchmarks} benchmarks are not used by any model`
      );
    }

    // Check for unused licenses
    const usedLicenses = new Set();
    for (const model of this.models.values()) {
      if (model.license_id) {
        usedLicenses.add(model.license_id);
      }
    }

    let unusedLicenses = 0;
    for (const licenseId of this.licenses.keys()) {
      if (!usedLicenses.has(licenseId)) {
        unusedLicenses++;
      }
    }

    if (unusedLicenses > 0) {
      this.warnings.push(
        `⚠️  ${unusedLicenses} licenses are not used by any model`
      );
    }
  }

  /**
   * Run every check and print a summary.
   *
   * @returns {Promise<boolean>} `true` when no errors were recorded
   *   (warnings alone do not fail the run).
   */
  async validate() {
    console.log("🔍 Running Data Integrity Validation...\n");
    console.log(`Data directory: ${this.dataDir}\n`);

    await this.loadAllData();
    this.checkDuplicates();
    this.checkReferences();
    this.checkOrphans();

    // Print summary
    console.log("\n" + "=".repeat(60));
    console.log("📊 Validation Summary");
    console.log("=".repeat(60));

    if (this.errors.length > 0) {
      console.log(`\n❌ Found ${this.errors.length} errors:\n`);
      this.errors.forEach((error) => console.log(error));
    }

    if (this.warnings.length > 0) {
      console.log(`\n⚠️  Found ${this.warnings.length} warnings:\n`);
      this.warnings.forEach((warning) => console.log(warning));
    }

    if (this.errors.length === 0 && this.warnings.length === 0) {
      console.log("\n✅ All integrity checks passed! 🎉");
      return true;
    }

    console.log("\n" + "=".repeat(60));

    return this.errors.length === 0;
  }
}

// Run validation if called directly (not when this file is require()d).
// The exit code mirrors the result: 0 when validate() resolves to true,
// 1 when it resolves to false or rejects.
if (require.main === module) {
  const validator = new IntegrityValidator();
  validator
    .validate()
    .then((success) => {
      process.exit(success ? 0 : 1);
    })
    .catch((error) => {
      // Without this handler a rejected validate() would surface only as an
      // unhandled rejection; fail the run explicitly instead.
      console.error("\n❌ Integrity validation failed unexpectedly.");
      console.error(error);
      process.exit(1);
    });
}

module.exports = IntegrityValidator;


================================================
FILE: schemas/license.schema.json
================================================
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "License",
  "description": "Schema for model license definitions",
  "type": "object",
  "properties": {
    "license_id": {
      "type": "string",
      "description": "Unique identifier for the license",
      "examples": ["apache_2_0", "mit", "proprietary", "cc_by_nc"]
    },
    "name": {
      "type": "string",
      "description": "Display name of the license",
      "examples": ["Apache 2.0", "MIT License", "Proprietary", "CC BY-NC 4.0"]
    },
    "allow_commercial": {
      "type": "boolean",
      "description": "Whether the license allows commercial use of the model"
    },
    "description": {
      "type": "string",
      "description": "Brief description of the license terms and restrictions",
      "examples": [
        "Apache License 2.0 - allows commercial use",
        "Non-commercial research use only",
        "Proprietary license - contact vendor for terms"
      ]
    },
    "created_at": {
      "type": "string",
      "format": "date-time",
      "description": "Timestamp when the record was created"
    },
    "updated_at": {
      "type": "string",
      "format": "date-time",
      "description": "Timestamp when the record was last updated"
    }
  },
  "required": [
    "license_id",
    "name",
    "allow_commercial",
    "description",
    "created_at",
    "updated_at"
  ],
  "additionalProperties": false
}


================================================
FILE: schemas/model.schema.json
================================================
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Model",
  "description": "Schema for AI/ML model metadata",
  "type": "object",
  "properties": {
    "model_id": {
      "type": "string",
      "description": "Unique identifier for the model",
      "examples": ["gpt-4", "claude-3-opus", "llama-3.1-405b"]
    },
    "name": {
      "type": "string",
      "description": "Display name of the model",
      "examples": ["GPT-4", "Claude 3 Opus", "Llama 3.1 405B"]
    },
    "organization_id": {
      "type": "string",
      "description": "ID of the organization that created the model"
    },
    "model_family_id": {
      "type": ["string", "null"],
      "description": "ID of the model family this model belongs to",
      "examples": ["gpt-4", "claude-3", "llama-3-1"]
    },
    "fine_tuned_from_model_id": {
      "type": ["string", "null"],
      "description": "ID of the base model this was fine-tuned from"
    },
    "description": {
      "type": ["string", "null"],
      "description": "Detailed description of the model's capabilities and use cases"
    },
    "release_date": {
      "type": ["string", "null"],
      "format": "date",
      "description": "Date when the model was released (YYYY-MM-DD)",
      "examples": ["2024-11-20", "2023-03-14"]
    },
    "announcement_date": {
      "type": ["string", "null"],
      "format": "date",
      "description": "Date when the model was first announced (YYYY-MM-DD)"
    },
    "license_id": {
      "type": ["string", "null"],
      "description": "ID of the license governing the model's use"
    },
    "multimodal": {
      "type": "boolean",
      "description": "Whether the model supports multiple input/output modalities",
      "default": false
    },
    "knowledge_cutoff": {
      "type": ["string", "null"],
      "format": "date",
      "description": "Date up to which the model has training data (YYYY-MM-DD)"
    },
    "param_count": {
      "type": ["number", "null"],
      "description": "Number of parameters in the model (in billions)",
      "minimum": 0,
      "examples": [175, 405, 1.8]
    },
    "training_tokens": {
      "type": ["number", "null"],
      "description": "Number of tokens the model was trained on (in trillions)",
      "minimum": 0
    },
    "available_in_zeroeval": {
      "type": "boolean",
      "description": "Whether the model is available for evaluation in ZeroEval",
      "default": true
    },
    "source_api_ref": {
      "type": ["string", "null"],
      "format": "uri",
      "description": "URL to the official API documentation"
    },
    "source_playground": {
      "type": ["string", "null"],
      "format": "uri",
      "description": "URL to an interactive playground or demo"
    },
    "source_paper": {
      "type": ["string", "null"],
      "format": "uri",
      "description": "URL to the research paper or technical report"
    },
    "source_scorecard_blog_link": {
      "type": ["string", "null"],
      "format": "uri",
      "description": "URL to scorecard or evaluation blog post"
    },
    "source_repo_link": {
      "type": ["string", "null"],
      "format": "uri",
      "description": "URL to the model's code repository"
    },
    "source_weights_link": {
      "type": ["string", "null"],
      "format": "uri",
      "description": "URL to download model weights"
    },
    "created_at": {
      "type": "string",
      "format": "date-time",
      "description": "Timestamp when the record was created"
    },
    "updated_at": {
      "type": "string",
      "format": "date-time",
      "description": "Timestamp when the record was last updated"
    }
  },
  "required": [
    "model_id",
    "name",
    "organization_id",
    "multimodal",
    "available_in_zeroeval",
    "created_at",
    "updated_at"
  ],
  "additionalProperties": false
}


================================================
FILE: schemas/organization.schema.json
================================================
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Organization",
  "description": "Schema for AI/ML organization data",
  "type": "object",
  "properties": {
    "organization_id": {
      "type": "string",
      "description": "Unique identifier for the organization",
      "examples": ["openai", "anthropic", "google", "amazon"]
    },
    "name": {
      "type": "string",
      "description": "Display name of the organization",
      "examples": ["OpenAI", "Anthropic", "Google", "Amazon"]
    },
    "website": {
      "type": "string",
      "format": "uri",
      "description": "Official website URL of the organization",
      "examples": ["https://openai.com", "https://anthropic.com"]
    },
    "description": {
      "type": ["string", "null"],
      "description": "Brief description of the organization and its focus areas",
      "examples": ["Cloud and AI services", "AI safety and research company"]
    },
    "country": {
      "type": ["string", "null"],
      "description": "Country where the organization is headquartered (ISO 3166-1 alpha-2 code)",
      "examples": ["US", "GB", "CN"]
    },
    "created_at": {
      "type": "string",
      "format": "date-time",
      "description": "Timestamp when the record was created in the database"
    },
    "updated_at": {
      "type": "string",
      "format": "date-time",
      "description": "Timestamp when the record was last updated in the database"
    }
  },
  "required": [
    "organization_id",
    "name",
    "website",
    "created_at",
    "updated_at"
  ],
  "additionalProperties": false
}


================================================
FILE: schemas/provider-models.schema.json
================================================
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "ProviderModel",
  "description": "Schema for provider-specific model configurations and pricing",
  "type": "object",
  "properties": {
    "model_provider_id": {
      "type": "integer",
      "description": "Unique identifier for this provider-model configuration",
      "minimum": 1
    },
    "model_id": {
      "type": "string",
      "description": "ID of the model"
    },
    "provider_id": {
      "type": "string",
      "description": "ID of the provider offering this model"
    },
    "provider_model_id_used": {
      "type": ["string", "null"],
      "description": "Model ID as used by the provider's API",
      "examples": ["gpt-4-turbo", "claude-3-opus-20240229"]
    },
    "deprecated_at": {
      "type": ["string", "null"],
      "format": "date-time",
      "description": "Timestamp when this model configuration was deprecated"
    },
    "input_cents_per_million_tokens": {
      "type": ["number", "null"],
      "description": "Cost in cents per million input tokens",
      "minimum": 0,
      "examples": [1000, 300, 80]
    },
    "output_cents_per_million_tokens": {
      "type": ["number", "null"],
      "description": "Cost in cents per million output tokens",
      "minimum": 0,
      "examples": [3000, 1500, 400]
    },
    "quantization": {
      "type": ["string", "null"],
      "description": "Quantization method applied to the model",
      "examples": ["int8", "int4", "fp16", "bf16"]
    },
    "max_input_tokens": {
      "type": ["integer", "null"],
      "description": "Maximum number of input tokens supported",
      "minimum": 1,
      "examples": [128000, 200000, 32000]
    },
    "max_output_tokens": {
      "type": ["integer", "null"],
      "description": "Maximum number of output tokens supported",
      "minimum": 1,
      "examples": [4096, 8192, 200000]
    },
    "throughput": {
      "type": ["number", "null"],
      "description": "Tokens per second throughput",
      "minimum": 0,
      "examples": [42.0, 150.5, 200.0]
    },
    "latency": {
      "type": ["number", "null"],
      "description": "Time to first token in seconds",
      "minimum": 0,
      "examples": [0.4, 0.2, 1.5]
    },
    "feature_web_search": {
      "type": ["boolean", "null"],
      "description": "Whether web search is available",
      "default": false
    },
    "feature_function_calling": {
      "type": ["boolean", "null"],
      "description": "Whether function/tool calling is supported",
      "default": false
    },
    "feature_structured_output": {
      "type": ["boolean", "null"],
      "description": "Whether structured output (JSON mode) is supported",
      "default": false
    },
    "feature_code_execution": {
      "type": ["boolean", "null"],
      "description": "Whether code execution is supported",
      "default": false
    },
    "feature_batch_inference": {
      "type": ["boolean", "null"],
      "description": "Whether batch inference is available",
      "default": false
    },
    "feature_finetuning": {
      "type": ["boolean", "null"],
      "description": "Whether fine-tuning is available",
      "default": false
    },
    "input_modality_text": {
      "type": ["boolean", "null"],
      "description": "Whether text input is supported",
      "default": true
    },
    "input_modality_image": {
      "type": ["boolean", "null"],
      "description": "Whether image input is supported",
      "default": false
    },
    "input_modality_audio": {
      "type": ["boolean", "null"],
      "description": "Whether audio input is supported",
      "default": false
    },
    "input_modality_video": {
      "type": ["boolean", "null"],
      "description": "Whether video input is supported",
      "default": false
    },
    "output_modality_text": {
      "type": ["boolean", "null"],
      "description": "Whether text output is supported",
      "default": true
    },
    "output_modality_image": {
      "type": ["boolean", "null"],
      "description": "Whether image output is supported",
      "default": false
    },
    "output_modality_audio": {
      "type": ["boolean", "null"],
      "description": "Whether audio output is supported",
      "default": false
    },
    "output_modality_video": {
      "type": ["boolean", "null"],
      "description": "Whether video output is supported",
      "default": false
    },
    "created_at": {
      "type": "string",
      "format": "date-time",
      "description": "Timestamp when the record was created"
    },
    "updated_at": {
      "type": "string",
      "format": "date-time",
      "description": "Timestamp when the record was last updated"
    },
    "model_name": {
      "type": "string",
      "description": "Display name of the model (denormalized for convenience)"
    },
    "organization_id": {
      "type": "string",
      "description": "ID of the organization that created the model (denormalized)"
    }
  },
  "required": [
    "model_provider_id",
    "model_id",
    "provider_id",
    "created_at",
    "updated_at",
    "model_name",
    "organization_id"
  ],
  "additionalProperties": false
}


================================================
FILE: schemas/provider.schema.json
================================================
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Provider",
  "description": "Schema for AI model inference providers",
  "type": "object",
  "properties": {
    "provider_id": {
      "type": "string",
      "description": "Unique identifier for the provider",
      "examples": ["openai", "anthropic", "google", "aws-bedrock", "azure"]
    },
    "name": {
      "type": "string",
      "description": "Display name of the provider",
      "examples": [
        "OpenAI",
        "Anthropic",
        "Google",
        "AWS Bedrock",
        "Azure OpenAI"
      ]
    },
    "website": {
      "type": "string",
      "format": "uri",
      "description": "Official website or API documentation URL",
      "examples": ["https://openai.com/api", "https://docs.anthropic.com"]
    },
    "created_at": {
      "type": "string",
      "format": "date-time",
      "description": "Timestamp when the record was created"
    },
    "updated_at": {
      "type": "string",
      "format": "date-time",
      "description": "Timestamp when the record was last updated"
    }
  },
  "required": ["provider_id", "name", "website", "created_at", "updated_at"],
  "additionalProperties": false
}


================================================
FILE: schemas/validator.js
================================================
const fs = require("fs");
const path = require("path");
const tv4 = require("tv4");
const glob = require("glob");

/**
 * Print tv4 validation errors as indented bullet lines on stderr.
 *
 * @param {Array<{message: string, dataPath: string}>} errors - The `errors`
 *   array of a `tv4.validateMultiple` result.
 */
function logSchemaErrors(errors) {
  errors.forEach((error) =>
    console.error(`  - ${error.message} at ${error.dataPath}`)
  );
}

/**
 * Validate every data file matching a glob pattern against one JSON schema.
 *
 * The schema is read from `<schemaName>.schema.json` next to this script and
 * the pattern is resolved relative to this script's parent directory. A line
 * is logged for each file, plus one line per schema violation.
 *
 * @param {string} schemaName - Schema file name without the `.schema.json` suffix.
 * @param {string} filePattern - Forward-slash glob pattern, relative to the
 *   parent directory of this script.
 * @param {boolean} [isArray=false] - When true, each file must contain a JSON
 *   array and every element is validated against the schema individually.
 * @returns {boolean} `true` when every matched file is valid, or when no file
 *   matches (a warning is printed); `false` when the schema cannot be read or
 *   any file fails to parse or validate.
 */
function validateSchema(schemaName, filePattern, isArray = false) {
  console.log(`\nValidating ${schemaName}...`);
  const schemaPath = path.join(__dirname, `${schemaName}.schema.json`);

  let schema;
  try {
    schema = JSON.parse(fs.readFileSync(schemaPath, "utf8"));
  } catch (error) {
    console.error(`Error reading schema file: ${schemaPath}`);
    console.error(error);
    return false;
  }

  // Pass the pattern through untouched and give glob the base directory via
  // `cwd`. Building the pattern with path.join() inserts backslashes on
  // Windows, which glob treats as escape characters, so nothing would match.
  const files = glob.sync(filePattern, {
    cwd: path.join(__dirname, ".."),
    absolute: true,
  });

  if (files.length === 0) {
    console.warn(`⚠️ No files found matching pattern: ${filePattern}`);
    return true;
  }

  let isValid = true;

  for (const file of files) {
    try {
      const data = JSON.parse(fs.readFileSync(file, "utf8"));

      if (!isArray) {
        // Single object validation
        const result = tv4.validateMultiple(data, schema);

        if (result.valid) {
          console.log(`✅ Valid: ${file}`);
        } else {
          console.error(`❌ Invalid: ${file}`);
          logSchemaErrors(result.errors);
          isValid = false;
        }
        continue;
      }

      if (!Array.isArray(data)) {
        // typeof null is "object", which would make the message misleading.
        const actualType = data === null ? "null" : typeof data;
        console.error(
          `❌ Invalid: ${file} - Expected array but got ${actualType}`
        );
        isValid = false;
        continue;
      }

      // Validate each item so every failing index is reported, not just the first.
      let allItemsValid = true;
      data.forEach((item, index) => {
        const result = tv4.validateMultiple(item, schema);
        if (!result.valid) {
          console.error(`❌ Invalid item [${index}] in: ${file}`);
          logSchemaErrors(result.errors);
          allItemsValid = false;
        }
      });

      if (allItemsValid) {
        console.log(`✅ Valid: ${file} (${data.length} items)`);
      } else {
        isValid = false;
      }
    } catch (error) {
      // Unreadable file or malformed JSON: report it and keep checking the rest.
      console.error(`Error processing file: ${file}`);
      console.error(error);
      isValid = false;
    }
  }

  return isValid;
}

// Script entry point. Phase 1 validates every data file against its JSON
// schema; Phase 2 (data integrity) runs only when Phase 1 passes. The process
// exits 0 when everything passes and 1 otherwise.
console.log("🔍 Validating LLM Stats Data Structure...\n");
console.log("=".repeat(60));
console.log("Phase 1: Schema Validation");
console.log("=".repeat(60));

// Validate all data types
const validations = [
  // Core entities
  {
    schema: "organization",
    pattern: "data/organizations/*/organization.json",
  },
  {
    schema: "model",
    pattern: "data/organizations/*/models/*/model.json",
  },
  { schema: "license", pattern: "data/licenses/*.json" },
  { schema: "benchmark", pattern: "data/benchmarks/*.json" },
  { schema: "provider", pattern: "data/providers/*/provider.json" },

  // Arrays
  {
    schema: "benchmark-results",
    pattern: "data/organizations/*/models/*/benchmarks.json",
    isArray: true,
  },
  {
    schema: "provider-models",
    pattern: "data/providers/*/models.json",
    isArray: true,
  },
];

let allValid = true;

// Every schema is validated even after a failure so all problems are reported
// in a single run.
for (const { schema, pattern, isArray } of validations) {
  const isValid = validateSchema(schema, pattern, isArray);
  allValid = allValid && isValid;
}

if (allValid) {
  console.log("\n✅ All schemas are valid! 🎉");

  // Run integrity validation
  console.log("\n" + "=".repeat(60));
  console.log("Phase 2: Data Integrity Validation");
  console.log("=".repeat(60));

  const IntegrityValidator = require("./integrity-validator.js");
  const integrityValidator = new IntegrityValidator();

  integrityValidator
    .validate()
    .then((integrityValid) => {
      if (integrityValid) {
        console.log("\n🎉 All validations passed successfully!");
        process.exit(0);
      } else {
        console.error("\n❌ Data integrity validation failed.");
        process.exit(1);
      }
    })
    .catch((error) => {
      // Without this handler a rejected validate() would surface only as an
      // unhandled rejection; fail the run explicitly instead.
      console.error("\n❌ Data integrity validation crashed.");
      console.error(error);
      process.exit(1);
    });
} else {
  console.error("\n❌ Schema validation failed.");
  process.exit(1);
}