gitextract_itkaqbxy/

├── .coderabbit.yaml
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   └── config.yml
│   └── workflows/
│       ├── copyright-check.yml
│       ├── docs.yml
│       ├── gpu_tests.yml
│       ├── lint.yml
│       └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── __init__.py
├── cluster_configs/
│   ├── example-local.yaml
│   ├── example-ray.yaml
│   └── example-slurm.yaml
├── core/
│   ├── README.md
│   ├── pyproject.toml
│   └── requirements.txt
├── dataset_explorer_demo/
│   ├── README.md
│   └── visualize_similar.py
├── dockerfiles/
│   ├── Dockerfile.megatron
│   ├── Dockerfile.nemo-rl
│   ├── Dockerfile.nemo-skills
│   ├── Dockerfile.sandbox
│   ├── Dockerfile.verl
│   ├── Dockerfile.vllm
│   ├── README.md
│   ├── build.sh
│   ├── ifbench.patch
│   ├── sandbox/
│   │   ├── block_network.c
│   │   ├── nginx-worker-proxy.conf.template
│   │   ├── nginx.conf.template
│   │   └── start-with-nginx.sh
│   └── swe-bench/
│       ├── Dockerfile.nemo-skills.alpine
│       └── Dockerfile.swe-zero
├── docs/
│   ├── agentic_inference/
│   │   ├── parallel_thinking.md
│   │   └── tool_calling.md
│   ├── basics/
│   │   ├── chat_interface.md
│   │   ├── cluster-configs.md
│   │   ├── code-packaging.md
│   │   ├── index.md
│   │   ├── inference.md
│   │   ├── installation.md
│   │   ├── prompt-format.md
│   │   └── sandbox.md
│   ├── css/
│   │   └── extra.css
│   ├── evaluation/
│   │   ├── code.md
│   │   ├── external-benchmarks.md
│   │   ├── formal-math.md
│   │   ├── index.md
│   │   ├── instruction-following.md
│   │   ├── long-context.md
│   │   ├── multilingual.md
│   │   ├── natural-math.md
│   │   ├── other-benchmarks.md
│   │   ├── robustness.md
│   │   ├── scientific-knowledge.md
│   │   ├── speculative-decoding.md
│   │   ├── speech-audio.md
│   │   ├── tool-calling.md
│   │   └── vlm.md
│   ├── index.md
│   ├── pipelines/
│   │   ├── decontamination.md
│   │   ├── evaluation.md
│   │   ├── generation.md
│   │   ├── index.md
│   │   ├── llm-as-a-judge.md
│   │   ├── run-cmd.md
│   │   ├── start-server.md
│   │   ├── training-verl.md
│   │   └── training.md
│   ├── recipes/
│   │   └── libtrace.md
│   ├── releases/
│   │   ├── index.md
│   │   ├── nemotron-math-v2/
│   │   │   ├── dataset.md
│   │   │   ├── evaluation.md
│   │   │   ├── index.md
│   │   │   └── training.md
│   │   ├── nemotronmathproofs/
│   │   │   └── index.md
│   │   ├── opencodereasoning/
│   │   │   ├── dataset.md
│   │   │   ├── evaluation.md
│   │   │   └── index.md
│   │   ├── openmathinstruct2/
│   │   │   ├── dataset.md
│   │   │   ├── evaluation.md
│   │   │   ├── index.md
│   │   │   └── training.md
│   │   ├── openmathreasoning/
│   │   │   ├── dataset.md
│   │   │   ├── evaluation.md
│   │   │   ├── index.md
│   │   │   └── training.md
│   │   └── openreasoning/
│   │       ├── dataset.md
│   │       ├── evaluation.md
│   │       ├── index.md
│   │       └── training.md
│   └── tutorials/
│       ├── index.md
│       ├── notebooks/
│       │   ├── demo_aimo_inference.ipynb
│       │   └── prepare_calibration_data.py
│       └── posts/
│           ├── gpt-oss-python.md
│           ├── llama-nemotron-super-v1.5-evals.md
│           ├── nemotron-nano-v2-evals.md
│           ├── noc-reasoning-agent.md
│           └── omr-simple-recipe.md
├── greptile.json
├── mkdocs.yml
├── nemo_skills/
│   ├── __init__.py
│   ├── _cli_stub.py
│   ├── code_execution/
│   │   ├── __init__.py
│   │   ├── local_sandbox/
│   │   │   ├── __init__.py
│   │   │   ├── local_sandbox_server.py
│   │   │   └── start_local_sandbox.sh
│   │   ├── proof_utils.py
│   │   ├── sandbox.py
│   │   └── utils.py
│   ├── conversion/
│   │   ├── __init__.py
│   │   ├── hf_to_nemo_llama.py
│   │   ├── hf_to_nemo_qwen.py
│   │   ├── hf_to_trtllm_quantize.py
│   │   ├── nemo_config_llama.yaml
│   │   ├── nemo_config_qwen.yaml
│   │   ├── nemo_to_hf_llama.py
│   │   └── nemo_to_hf_qwen.py
│   ├── dataset/
│   │   ├── __init__.py
│   │   ├── aai/
│   │   │   ├── __init__.py
│   │   │   ├── aai_score.py
│   │   │   └── prepare.py
│   │   ├── aalcr/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── aime24/
│   │   │   ├── __init__.py
│   │   │   ├── prepare.py
│   │   │   └── test.txt
│   │   ├── aime24-x/
│   │   │   ├── __init__.py
│   │   │   ├── aime24_x_utils.py
│   │   │   └── prepare.py
│   │   ├── aime25/
│   │   │   ├── __init__.py
│   │   │   ├── prepare.py
│   │   │   └── test.txt
│   │   ├── aime25-x/
│   │   │   ├── __init__.py
│   │   │   ├── aime25_x_utils.py
│   │   │   └── prepare.py
│   │   ├── aime26/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── algebra222/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── amc23/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── answer-judge/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── apex-shortlist/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── arena-hard/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── arena-hard-v2/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── asdiv/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── asr-leaderboard/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── audiobench/
│   │   │   ├── __init__.py
│   │   │   ├── judge/
│   │   │   │   └── __init__.py
│   │   │   ├── nonjudge/
│   │   │   │   └── __init__.py
│   │   │   └── prepare.py
│   │   ├── beyond-aime/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── bfcl_v3/
│   │   │   ├── __init__.py
│   │   │   ├── bfcl_score.py
│   │   │   ├── constants.py
│   │   │   ├── irrelevance/
│   │   │   │   └── __init__.py
│   │   │   ├── java/
│   │   │   │   └── __init__.py
│   │   │   ├── javascript/
│   │   │   │   └── __init__.py
│   │   │   ├── live_irrelevance/
│   │   │   │   └── __init__.py
│   │   │   ├── live_multiple/
│   │   │   │   └── __init__.py
│   │   │   ├── live_parallel/
│   │   │   │   └── __init__.py
│   │   │   ├── live_parallel_multiple/
│   │   │   │   └── __init__.py
│   │   │   ├── live_relevance/
│   │   │   │   └── __init__.py
│   │   │   ├── live_simple/
│   │   │   │   └── __init__.py
│   │   │   ├── multi_turn_base/
│   │   │   │   └── __init__.py
│   │   │   ├── multi_turn_long_context/
│   │   │   │   └── __init__.py
│   │   │   ├── multi_turn_miss_func/
│   │   │   │   └── __init__.py
│   │   │   ├── multi_turn_miss_param/
│   │   │   │   └── __init__.py
│   │   │   ├── multiple/
│   │   │   │   └── __init__.py
│   │   │   ├── parallel/
│   │   │   │   └── __init__.py
│   │   │   ├── parallel_multiple/
│   │   │   │   └── __init__.py
│   │   │   ├── prepare.py
│   │   │   ├── simple/
│   │   │   │   └── __init__.py
│   │   │   ├── simple_java/
│   │   │   │   └── __init__.py
│   │   │   ├── simple_javascript/
│   │   │   │   └── __init__.py
│   │   │   ├── simple_python/
│   │   │   │   └── __init__.py
│   │   │   └── utils.py
│   │   ├── bfcl_v4/
│   │   │   ├── __init__.py
│   │   │   ├── bfcl_score.py
│   │   │   ├── irrelevance/
│   │   │   │   └── __init__.py
│   │   │   ├── live_irrelevance/
│   │   │   │   └── __init__.py
│   │   │   ├── live_multiple/
│   │   │   │   └── __init__.py
│   │   │   ├── live_parallel/
│   │   │   │   └── __init__.py
│   │   │   ├── live_parallel_multiple/
│   │   │   │   └── __init__.py
│   │   │   ├── live_relevance/
│   │   │   │   └── __init__.py
│   │   │   ├── live_simple/
│   │   │   │   └── __init__.py
│   │   │   ├── memory_kv/
│   │   │   │   └── __init__.py
│   │   │   ├── memory_rec_sum/
│   │   │   │   └── __init__.py
│   │   │   ├── memory_vector/
│   │   │   │   └── __init__.py
│   │   │   ├── multi_turn_base/
│   │   │   │   └── __init__.py
│   │   │   ├── multi_turn_long_context/
│   │   │   │   └── __init__.py
│   │   │   ├── multi_turn_miss_func/
│   │   │   │   └── __init__.py
│   │   │   ├── multi_turn_miss_param/
│   │   │   │   └── __init__.py
│   │   │   ├── multiple/
│   │   │   │   └── __init__.py
│   │   │   ├── parallel/
│   │   │   │   └── __init__.py
│   │   │   ├── parallel_multiple/
│   │   │   │   └── __init__.py
│   │   │   ├── prepare.py
│   │   │   ├── simple_java/
│   │   │   │   └── __init__.py
│   │   │   ├── simple_javascript/
│   │   │   │   └── __init__.py
│   │   │   ├── simple_python/
│   │   │   │   └── __init__.py
│   │   │   ├── web_search_base/
│   │   │   │   └── __init__.py
│   │   │   └── web_search_no_snippet/
│   │   │       └── __init__.py
│   │   ├── bigcodebench/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── birdbench/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── brumo25/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── ccc/
│   │   │   └── __init__.py
│   │   ├── challenge19/
│   │   │   ├── __init__.py
│   │   │   ├── prepare.py
│   │   │   └── test.txt
│   │   ├── college_math/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── comp-math-24-25/
│   │   │   ├── __init__.py
│   │   │   ├── prepare.py
│   │   │   └── test.txt
│   │   ├── compute-eval/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── contextasr-bench/
│   │   │   ├── __init__.py
│   │   │   ├── coarse/
│   │   │   │   └── __init__.py
│   │   │   ├── contextasr_score.py
│   │   │   ├── contextless/
│   │   │   │   └── __init__.py
│   │   │   ├── fine/
│   │   │   │   └── __init__.py
│   │   │   └── prepare.py
│   │   ├── covost2/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── critpt/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── dsbench_da/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── fleurs/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── flores200/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── frontierscience-olympiad/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── gaokao2023en/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── global_piqa/
│   │   │   ├── __init__.py
│   │   │   ├── global_piqa_utils.py
│   │   │   └── prepare.py
│   │   ├── gpqa/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── gpqa-x/
│   │   │   ├── __init__.py
│   │   │   ├── gpqa_x_utils.py
│   │   │   └── prepare.py
│   │   ├── gsm-plus/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── gsm8k/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── hendrycks_math/
│   │   │   ├── __init__.py
│   │   │   ├── fix_ref_solns.py
│   │   │   └── prepare.py
│   │   ├── hle/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── hle_verified/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── hmmt_feb25/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── hmmt_nov25/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── hotpotqa/
│   │   │   ├── __init__.py
│   │   │   ├── prepare.py
│   │   │   └── prepare_utils.py
│   │   ├── hotpotqa_closedbook/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── human-eval/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── human-eval-infilling/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── icpc/
│   │   │   └── __init__.py
│   │   ├── ifbench/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── ifeval/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── imo-answerbench/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── imo-gradingbench/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── imo-proofbench/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── ioi/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── librispeech-pc/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── livebench-coding/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── livecodebench/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── livecodebench-cpp/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── livecodebench-pro/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── livecodebench-x/
│   │   │   ├── __init__.py
│   │   │   ├── livecodebench_x_utils.py
│   │   │   └── prepare.py
│   │   ├── longbench-v2/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── longcodebench/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── m-arena-hard/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── m-arena-hard-v2/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── math-500/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── math-odyssey/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── mawps/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── mbpp/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── minerva_math/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── minif2f/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── mmau-pro/
│   │   │   ├── __init__.py
│   │   │   ├── closed_form/
│   │   │   │   └── __init__.py
│   │   │   ├── instruction_following/
│   │   │   │   └── __init__.py
│   │   │   ├── mmau_pro_score.py
│   │   │   ├── open_ended/
│   │   │   │   └── __init__.py
│   │   │   └── prepare.py
│   │   ├── mmlu/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── mmlu-pro/
│   │   │   ├── __init__.py
│   │   │   ├── prepare.py
│   │   │   └── subsets/
│   │   │       └── 10pct_opt_v1.txt
│   │   ├── mmlu-prox/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── mmlu-redux/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── mmmlu/
│   │   │   ├── __init__.py
│   │   │   ├── mmmlu_utils.py
│   │   │   └── prepare.py
│   │   ├── mmmu-pro/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── mobench/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── mrcr/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── musan/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── numb3rs/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── olympiadbench/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── omni-math/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── omniscience/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── open-proof-corpus-judge/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── physics/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── polymath/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── prepare.py
│   │   ├── proof-arena-judge/
│   │   │   ├── __init__.py
│   │   │   ├── gemini_imo_2025/
│   │   │   │   ├── 1.txt
│   │   │   │   ├── 2.txt
│   │   │   │   ├── 3.txt
│   │   │   │   ├── 4.txt
│   │   │   │   └── 5.txt
│   │   │   └── prepare.py
│   │   ├── proof-bench-judge/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── proofnet/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── putnam-bench/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── ruler/
│   │   │   ├── __init__.py
│   │   │   ├── prepare.py
│   │   │   └── ruler_score.py
│   │   ├── ruler2/
│   │   │   ├── __init__.py
│   │   │   ├── prepare.py
│   │   │   ├── prepare_mmlu.py
│   │   │   ├── prepare_niah.py
│   │   │   ├── prepare_qa.py
│   │   │   ├── ruler2_score.py
│   │   │   └── tokenizer.py
│   │   ├── scicode/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── simpleqa/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── speed-bench/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── supergpqa/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── svamp/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── swe-bench/
│   │   │   ├── __init__.py
│   │   │   ├── dump_images.py
│   │   │   ├── dump_repos.py
│   │   │   └── prepare.py
│   │   ├── swe-bench-multilingual/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── swe-bench-pro/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── swe-rebench/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── ugphysics/
│   │   │   ├── __init__.py
│   │   │   └── prepare.py
│   │   ├── utils.py
│   │   └── wmt24pp/
│   │       ├── __init__.py
│   │       └── prepare.py
│   ├── evaluation/
│   │   ├── __init__.py
│   │   ├── aggregate_answers.py
│   │   ├── compute_group_score.py
│   │   ├── evaluator/
│   │   │   ├── __init__.py
│   │   │   ├── arena.py
│   │   │   ├── audio.py
│   │   │   ├── base.py
│   │   │   ├── bfcl.py
│   │   │   ├── bird.py
│   │   │   ├── ccc.py
│   │   │   ├── code.py
│   │   │   ├── comet.py
│   │   │   ├── compute_eval.py
│   │   │   ├── contextasr.py
│   │   │   ├── critpt.py
│   │   │   ├── dsbench.py
│   │   │   ├── icpc.py
│   │   │   ├── ifbench.py
│   │   │   ├── ifeval.py
│   │   │   ├── ioi.py
│   │   │   ├── livecodebench.py
│   │   │   ├── math.py
│   │   │   ├── mcq.py
│   │   │   ├── mmau_pro.py
│   │   │   ├── mrcr.py
│   │   │   ├── nvembed_judge.py
│   │   │   ├── ruler.py
│   │   │   ├── scicode.py
│   │   │   └── specdec.py
│   │   ├── math_grader.py
│   │   ├── metrics/
│   │   │   ├── __init__.py
│   │   │   ├── aalcr_metrics.py
│   │   │   ├── answer_judgement_metrics.py
│   │   │   ├── arena_metrics.py
│   │   │   ├── audio_metrics.py
│   │   │   ├── base.py
│   │   │   ├── bfcl_metrics.py
│   │   │   ├── bird_metrics.py
│   │   │   ├── ccc_metrics.py
│   │   │   ├── code_metrics.py
│   │   │   ├── compute_metrics.py
│   │   │   ├── contextasr_metrics.py
│   │   │   ├── critpt_metrics.py
│   │   │   ├── gradingbench_metrics.py
│   │   │   ├── hleaa_metrics.py
│   │   │   ├── hotpotqa_filtering.py
│   │   │   ├── hotpotqa_metrics.py
│   │   │   ├── icpc_metrics.py
│   │   │   ├── if_metrics.py
│   │   │   ├── ioi_metrics.py
│   │   │   ├── lean4_metrics.py
│   │   │   ├── map_metrics.py
│   │   │   ├── math_metrics.py
│   │   │   ├── mcq_multilingual_metrics.py
│   │   │   ├── mmau_pro_metrics.py
│   │   │   ├── mrcr_metrics.py
│   │   │   ├── omni_metrics.py
│   │   │   ├── physics_metrics.py
│   │   │   ├── ruler2_metrics.py
│   │   │   ├── ruler_metrics.py
│   │   │   ├── simpleqa_metrics.py
│   │   │   ├── specdec_metrics.py
│   │   │   ├── translation_metrics.py
│   │   │   ├── ugphysics_metrics.py
│   │   │   ├── utils.py
│   │   │   └── weighted_math_metrics.py
│   │   └── utils.py
│   ├── file_utils.py
│   ├── inference/
│   │   ├── __init__.py
│   │   ├── autoformalize.py
│   │   ├── chat_interface/
│   │   │   ├── __init__.py
│   │   │   ├── chat_service.py
│   │   │   ├── core.py
│   │   │   ├── launch.py
│   │   │   └── ui.py
│   │   ├── check_contamination.py
│   │   ├── eval/
│   │   │   ├── __init__.py
│   │   │   ├── arena_judge.py
│   │   │   ├── bfcl.py
│   │   │   ├── bfcl_utils.py
│   │   │   ├── bfcl_web_search.py
│   │   │   ├── compute_eval.py
│   │   │   ├── critpt.py
│   │   │   ├── scicode.py
│   │   │   ├── scicode_utils.py
│   │   │   ├── specdec.py
│   │   │   └── swebench.py
│   │   ├── factory.py
│   │   ├── generate.py
│   │   ├── litellm_hybrid_cache.py
│   │   ├── llm_math_judge.py
│   │   ├── log_samples_wandb.py
│   │   ├── merge_chunks.py
│   │   ├── model/
│   │   │   ├── __init__.py
│   │   │   ├── asr_nim.py
│   │   │   ├── audio_utils.py
│   │   │   ├── azure.py
│   │   │   ├── base.py
│   │   │   ├── code_execution.py
│   │   │   ├── context_retry.py
│   │   │   ├── gemini.py
│   │   │   ├── megatron.py
│   │   │   ├── nim_utils.py
│   │   │   ├── openai.py
│   │   │   ├── parallel_thinking.py
│   │   │   ├── sglang.py
│   │   │   ├── tool_call.py
│   │   │   ├── tts_nim.py
│   │   │   ├── utils.py
│   │   │   ├── vllm.py
│   │   │   └── vllm_multimodal.py
│   │   ├── patch_litellm_logging.py
│   │   ├── prover.py
│   │   ├── retrieve_similar.py
│   │   ├── server/
│   │   │   ├── __init__.py
│   │   │   ├── serve_riva_nim.py
│   │   │   ├── serve_sglang.py
│   │   │   ├── serve_unified.py
│   │   │   ├── serve_vllm.py
│   │   │   └── serve_vllm_dp_ray.py
│   │   ├── structured_outputs.py
│   │   └── tournament_utils.py
│   ├── mcp/
│   │   ├── __init__.py
│   │   ├── adapters.py
│   │   ├── clients.py
│   │   ├── config.py
│   │   ├── servers/
│   │   │   ├── __init__.py
│   │   │   ├── chemistry/
│   │   │   │   ├── __init__.py
│   │   │   │   └── periodictable_tool.py
│   │   │   ├── exa_tool.py
│   │   │   ├── physics/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── coolprop_tool.py
│   │   │   │   ├── particle_tool.py
│   │   │   │   └── radioactivedecay_tool.py
│   │   │   ├── python_tool.py
│   │   │   ├── tavily_search_tool.py
│   │   │   └── web/
│   │   │       ├── __init__.py
│   │   │       ├── arxiv_tool.py
│   │   │       └── wikipedia_tool.py
│   │   ├── tool_manager.py
│   │   ├── tool_providers.py
│   │   └── utils.py
│   ├── pipeline/
│   │   ├── __init__.py
│   │   ├── app.py
│   │   ├── cli.py
│   │   ├── convert.py
│   │   ├── dataset.py
│   │   ├── eval.py
│   │   ├── generate.py
│   │   ├── judges/
│   │   │   ├── __init__.py
│   │   │   ├── comet_judge.py
│   │   │   └── nvembed_judge.py
│   │   ├── megatron_lm/
│   │   │   ├── __init__.py
│   │   │   └── train.py
│   │   ├── nemo_evaluator.py
│   │   ├── nemo_gym_rollouts.py
│   │   ├── nemo_rl/
│   │   │   ├── __init__.py
│   │   │   ├── average_checkpoints.py
│   │   │   ├── grpo.py
│   │   │   └── sft.py
│   │   ├── prepare_data.py
│   │   ├── robust_eval.py
│   │   ├── run_cmd.py
│   │   ├── setup.py
│   │   ├── start_server.py
│   │   ├── summarize_results.py
│   │   ├── summarize_robustness.py
│   │   ├── utils/
│   │   │   ├── __init__.py
│   │   │   ├── cluster.py
│   │   │   ├── commands.py
│   │   │   ├── declarative.py
│   │   │   ├── docker_images.py
│   │   │   ├── eval.py
│   │   │   ├── exp.py
│   │   │   ├── generation.py
│   │   │   ├── mounts.py
│   │   │   ├── packager.py
│   │   │   ├── ray_executor.py
│   │   │   ├── scripts/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── eval.py
│   │   │   │   ├── generation.py
│   │   │   │   ├── nemo_gym.py
│   │   │   │   └── server.py
│   │   │   └── server.py
│   │   └── verl/
│   │       ├── __init__.py
│   │       └── ppo.py
│   ├── prompt/
│   │   ├── __init__.py
│   │   ├── code_tags/
│   │   │   ├── __init__.py
│   │   │   ├── gpt-oss.yaml
│   │   │   ├── llama3.yaml
│   │   │   ├── nemotron.yaml
│   │   │   ├── openmath.yaml
│   │   │   ├── qwen-lean.yaml
│   │   │   └── qwen.yaml
│   │   ├── config/
│   │   │   ├── __init__.py
│   │   │   ├── compute-eval/
│   │   │   │   └── baseline.yaml
│   │   │   ├── eval/
│   │   │   │   ├── aai/
│   │   │   │   │   ├── livecodebench.yaml
│   │   │   │   │   ├── math.yaml
│   │   │   │   │   ├── mcq-10choices-boxed.yaml
│   │   │   │   │   ├── mcq-10choices.yaml
│   │   │   │   │   ├── mcq-4choices-boxed.yaml
│   │   │   │   │   ├── mcq-4choices.yaml
│   │   │   │   │   ├── omni.yaml
│   │   │   │   │   ├── search-mcq-10choices.yaml
│   │   │   │   │   └── search-mcq-4choices.yaml
│   │   │   │   ├── bigcodebench/
│   │   │   │   │   └── codegen.yaml
│   │   │   │   ├── critpt/
│   │   │   │   │   ├── code_output.yaml
│   │   │   │   │   └── solve_problem.yaml
│   │   │   │   ├── hotpotqa.yaml
│   │   │   │   ├── hotpotqa_closedbook.yaml
│   │   │   │   ├── livecodebench/
│   │   │   │   │   ├── aa_index.yaml
│   │   │   │   │   ├── default.yaml
│   │   │   │   │   └── default_reasoning.yaml
│   │   │   │   ├── longbench/
│   │   │   │   │   └── default.yaml
│   │   │   │   ├── matharena/
│   │   │   │   │   └── aime.yaml
│   │   │   │   ├── scicode/
│   │   │   │   │   ├── background.yaml
│   │   │   │   │   └── default.yaml
│   │   │   │   └── swe-bench/
│   │   │   │       ├── mini-swe-agent/
│   │   │   │       │   ├── swebench.yaml
│   │   │   │       │   ├── swebench_backticks.yaml
│   │   │   │       │   └── swebench_xml.yaml
│   │   │   │       ├── openhands/
│   │   │   │       │   ├── default.toml
│   │   │   │       │   └── no-native-tool-calling.toml
│   │   │   │       └── swe-agent/
│   │   │   │           ├── default.yaml
│   │   │   │           ├── multilingual.yaml
│   │   │   │           └── swe-agent-lm-32b.yaml
│   │   │   ├── generic/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── codegen.yaml
│   │   │   │   ├── codegen_system.yaml
│   │   │   │   ├── default.yaml
│   │   │   │   ├── dsbench-da-incontext.yaml
│   │   │   │   ├── dsbench-da.yaml
│   │   │   │   ├── fim.yaml
│   │   │   │   ├── general-boxed.yaml
│   │   │   │   ├── genselect.yaml
│   │   │   │   ├── gensynthesis.yaml
│   │   │   │   ├── hle.yaml
│   │   │   │   ├── math-base.yaml
│   │   │   │   ├── math.yaml
│   │   │   │   ├── matharena.yaml
│   │   │   │   ├── physics.yaml
│   │   │   │   ├── problem-augmentation-similar.yaml
│   │   │   │   ├── problem-augmentation.yaml
│   │   │   │   ├── search-boxed.yaml
│   │   │   │   ├── text_to_sql.yaml
│   │   │   │   └── ugphysics.yaml
│   │   │   ├── gpt-oss/
│   │   │   │   ├── livecodebench.yaml
│   │   │   │   └── math.yaml
│   │   │   ├── judge/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── aa-omni-judge.yaml
│   │   │   │   ├── aalcr.yaml
│   │   │   │   ├── arena.yaml
│   │   │   │   ├── arena_creative.yaml
│   │   │   │   ├── audiobench.yaml
│   │   │   │   ├── audiobench_binary.yaml
│   │   │   │   ├── check-contamination.yaml
│   │   │   │   ├── code.yaml
│   │   │   │   ├── frontierscience-olympiad.yaml
│   │   │   │   ├── general-judge.yaml
│   │   │   │   ├── hle.yaml
│   │   │   │   ├── imo_answerbench.yaml
│   │   │   │   ├── imo_gradingbench.yaml
│   │   │   │   ├── imo_proofbench.yaml
│   │   │   │   ├── math-code.yaml
│   │   │   │   ├── math-proof-judge.yaml
│   │   │   │   ├── math.yaml
│   │   │   │   ├── mmau-pro.yaml
│   │   │   │   ├── mt-bench/
│   │   │   │   │   ├── turn1.yaml
│   │   │   │   │   ├── turn1_with_ref.yaml
│   │   │   │   │   ├── turn2.yaml
│   │   │   │   │   └── turn2_with_ref.yaml
│   │   │   │   ├── physics.yaml
│   │   │   │   ├── simpleqa.yaml
│   │   │   │   └── ugphysics.yaml
│   │   │   ├── lean4/
│   │   │   │   ├── autoformalization.yaml
│   │   │   │   ├── backtranslation.yaml
│   │   │   │   ├── formal-proof-deepseek-prover-v2-nemotron.yaml
│   │   │   │   ├── formal-proof-deepseek-prover-v2.yaml
│   │   │   │   ├── formal-proof-reasoning-execution.yaml
│   │   │   │   ├── formal-proof-reasoning.yaml
│   │   │   │   ├── formal-proof.yaml
│   │   │   │   ├── goedel-prover-v2-nemotron.yaml
│   │   │   │   ├── goedel-prover-v2-refinement-nemotron.yaml
│   │   │   │   ├── goedel-prover-v2-refinement.yaml
│   │   │   │   ├── goedel-prover-v2.yaml
│   │   │   │   ├── judge-backtranslation.yaml
│   │   │   │   ├── nat-to-lean4.yaml
│   │   │   │   ├── refinement_code_error.yaml
│   │   │   │   ├── refinement_consistent_error.yaml
│   │   │   │   └── refinement_parsing_error.yaml
│   │   │   ├── llama3-instruct/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── math.yaml
│   │   │   │   └── mmlu.yaml
│   │   │   ├── multilingual/
│   │   │   │   ├── __init__.py
│   │   │   │   └── segment-translation.yaml
│   │   │   ├── openmath/
│   │   │   │   ├── genselect.yaml
│   │   │   │   └── tir.yaml
│   │   │   ├── qwen/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── math-cot.yaml
│   │   │   │   ├── math-tir.yaml
│   │   │   │   └── qwq.yaml
│   │   │   ├── qwen3/
│   │   │   │   ├── math-cot-non-think.yaml
│   │   │   │   └── math-cot-think.yaml
│   │   │   ├── robustness/
│   │   │   │   ├── code_prompts/
│   │   │   │   │   ├── aai_prompt.yaml
│   │   │   │   │   ├── code_1.yaml
│   │   │   │   │   ├── code_2.yaml
│   │   │   │   │   ├── code_3.yaml
│   │   │   │   │   ├── code_4.yaml
│   │   │   │   │   ├── ns_gen_codegen.yaml
│   │   │   │   │   └── ns_python_codegen.yaml
│   │   │   │   ├── math_prompts/
│   │   │   │   │   ├── boxed_1.yaml
│   │   │   │   │   ├── boxed_2.yaml
│   │   │   │   │   ├── boxed_3.yaml
│   │   │   │   │   ├── boxed_4.yaml
│   │   │   │   │   ├── boxed_5.yaml
│   │   │   │   │   ├── boxed_6.yaml
│   │   │   │   │   ├── boxed_7.yaml
│   │   │   │   │   ├── boxed_8.yaml
│   │   │   │   │   ├── boxed_aai.yaml
│   │   │   │   │   └── boxed_general.yaml
│   │   │   │   ├── mcq_prompts/
│   │   │   │   │   ├── aai_1.yaml
│   │   │   │   │   ├── aai_2.yaml
│   │   │   │   │   ├── angle_brackets_1.yaml
│   │   │   │   │   ├── angle_brackets_2.yaml
│   │   │   │   │   ├── boxed_1.yaml
│   │   │   │   │   ├── boxed_2.yaml
│   │   │   │   │   ├── correct_1.yaml
│   │   │   │   │   ├── correct_2.yaml
│   │   │   │   │   ├── final_answer_1.yaml
│   │   │   │   │   └── final_answer_2.yaml
│   │   │   │   └── prompt_set_config.yaml
│   │   │   ├── unit_test/
│   │   │   │   └── code.yaml
│   │   │   └── vlm/
│   │   │       ├── __init__.py
│   │   │       └── mmmu-pro.yaml
│   │   ├── few_shot_examples/
│   │   │   ├── __init__.py
│   │   │   ├── gsm8k.py
│   │   │   ├── lean4.py
│   │   │   ├── math.py
│   │   │   ├── mmlu.py
│   │   │   ├── mmlu_pro.py
│   │   │   └── open_science.py
│   │   └── utils.py
│   ├── training/
│   │   ├── __init__.py
│   │   ├── data_preparation_utils/
│   │   │   ├── __init__.py
│   │   │   ├── arithmetic_utils.py
│   │   │   ├── config/
│   │   │   │   ├── code_sft.yaml
│   │   │   │   ├── math_rl.yaml
│   │   │   │   ├── math_sft.yaml
│   │   │   │   └── stem_sft.yaml
│   │   │   ├── filters.py
│   │   │   ├── merge_processor.py
│   │   │   └── preprocessing.py
│   │   ├── nemo_rl/
│   │   │   ├── __init__.py
│   │   │   ├── configs/
│   │   │   │   ├── grpo.yaml
│   │   │   │   └── sft.yaml
│   │   │   ├── convert_dcp_to_hf.py
│   │   │   ├── convert_megatron_to_hf.py
│   │   │   ├── environments/
│   │   │   │   ├── __init__.py
│   │   │   │   └── math_environment.py
│   │   │   ├── offline_hf_consolidation.py
│   │   │   ├── prompts/
│   │   │   │   ├── cot.txt
│   │   │   │   └── math.txt
│   │   │   ├── start_grpo.py
│   │   │   └── start_sft.py
│   │   ├── prepare_data.py
│   │   ├── train_redrafter.py
│   │   └── verl/
│   │       ├── __init__.py
│   │       └── prepare_data.py
│   ├── utils.py
│   └── version.py
├── pyproject.toml
├── recipes/
│   ├── README.md
│   ├── asr_tts/
│   │   ├── README.md
│   │   ├── nim_configurations.py
│   │   ├── riva_generate.py
│   │   └── scripts/
│   │       ├── run_asr_nim_cluster.sh
│   │       └── run_tts_nim_cluster.sh
│   ├── data-integrity/
│   │   ├── README.md
│   │   ├── model_comparison/
│   │   │   ├── __init__.py
│   │   │   ├── analyses/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── length_analysis.py
│   │   │   │   ├── similarity_analysis.py
│   │   │   │   ├── umap_analysis.py
│   │   │   │   └── vocabulary_analysis.py
│   │   │   ├── analyzer.py
│   │   │   ├── data_loader.py
│   │   │   ├── main.py
│   │   │   ├── report_generator.py
│   │   │   ├── requirements.txt
│   │   │   ├── setup.py
│   │   │   ├── utils/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── file_utils.py
│   │   │   │   ├── model_utils.py
│   │   │   │   └── text_utils.py
│   │   │   └── visualization/
│   │   │       ├── __init__.py
│   │   │       ├── interactive_plots.py
│   │   │       └── static_plots.py
│   │   ├── postprocess_data.py
│   │   ├── prepare_data.py
│   │   └── run_integrity_pipeline.py
│   ├── gencluster/
│   │   ├── pipeline/
│   │   │   ├── run_inter_tournament.py
│   │   │   ├── run_intra_tournament.py
│   │   │   ├── solution_generation.py
│   │   │   └── test_case_generation.py
│   │   ├── prompts/
│   │   │   ├── generator.yaml
│   │   │   ├── selector.yaml
│   │   │   └── validator.yaml
│   │   └── scripts/
│   │       ├── compute_tournament_score.py
│   │       ├── extract_cpp_code.py
│   │       ├── filter_clusters.py
│   │       ├── generate_datasets_json.py
│   │       ├── generate_test_cases.py
│   │       ├── merge_tournament_scores.py
│   │       ├── run_tournament_all.py
│   │       ├── submission_ICPC.py
│   │       ├── submission_IOI.py
│   │       └── tournament_schedule.py
│   ├── libtrace/
│   │   ├── README.md
│   │   ├── dockerfiles/
│   │   │   ├── Dockerfile.sandbox
│   │   │   ├── environment.yml
│   │   │   └── start-with-nginx.sh
│   │   ├── prompts/
│   │   │   ├── applicability-relevance.yaml
│   │   │   └── problem-generation.yaml
│   │   └── scripts/
│   │       ├── collect_generated_problems.py
│   │       ├── filter_applicability_relevance.py
│   │       ├── gather_solutions.py
│   │       ├── harvest_docs.py
│   │       └── prepare_inference_jsonl.py
│   ├── multimodal/
│   │   ├── __init__.py
│   │   └── server/
│   │       ├── README.md
│   │       ├── __init__.py
│   │       ├── backends/
│   │       │   ├── __init__.py
│   │       │   ├── base.py
│   │       │   ├── magpie_tts_backend.py
│   │       │   └── nemo_asr_backend.py
│   │       └── unified_server.py
│   ├── noc-reasoning-agent/
│   │   ├── configs/
│   │   │   ├── config.ini
│   │   │   ├── noc_reasoning_sft.yaml
│   │   │   └── noc_reasoning_sft_6.yaml
│   │   ├── prompts/
│   │   │   ├── formatting_prompt.yaml
│   │   │   ├── prompt_incident.yaml
│   │   │   ├── prompt_reasoning.yaml
│   │   │   └── shortened_prompt_reasoning.yaml
│   │   └── scripts/
│   │       ├── create_agent_with_tools.py
│   │       ├── create_agent_with_tools_batch.py
│   │       ├── evaluation/
│   │       │   ├── evaluation_with_judge.py
│   │       │   ├── problem_code_evaluation.py
│   │       │   └── score.py
│   │       ├── filtering/
│   │       │   ├── filter_rows.py
│   │       │   └── match_keywords.py
│   │       ├── ns_pipelines/
│   │       │   ├── generate_synthetic_data.py
│   │       │   └── prepare_react_agent.py
│   │       ├── tools.py
│   │       ├── utils/
│   │       │   ├── create_input_jsonl_from_incidents.py
│   │       │   ├── format_reasoning_json.py
│   │       │   ├── reasoning_processes.py
│   │       │   ├── schema_columns.py
│   │       │   ├── split_incident_data.py
│   │       │   ├── split_mocktools_answers.py
│   │       │   └── token_usage.py
│   │       └── visualization/
│   │           ├── extract_representation_columns.py
│   │           ├── extract_scores.py
│   │           └── generate_trace_visualization.py
│   ├── opencodereasoning/
│   │   ├── configs/
│   │   │   └── solution_sdg/
│   │   │       ├── demo.yaml
│   │   │       └── r1.yaml
│   │   ├── pipeline/
│   │   │   ├── prepare_questions.py
│   │   │   └── prepare_solutions.py
│   │   ├── prompts/
│   │   │   ├── generate_cpp_soln.yaml
│   │   │   └── generate_python_soln.yaml
│   │   └── scripts/
│   │       ├── filter_questions.py
│   │       ├── functional_helpers.py
│   │       ├── output_processing.py
│   │       └── prepare_questions.py
│   ├── openmathreasoning/
│   │   ├── configs/
│   │   │   ├── genselect_sdg/
│   │   │   │   └── qwq.yaml
│   │   │   ├── problem_sdg/
│   │   │   │   ├── demo.yaml
│   │   │   │   ├── example-data.txt
│   │   │   │   └── qwen-instruct.yaml
│   │   │   └── solution_sdg/
│   │   │       ├── demo.yaml
│   │   │       ├── qwq.yaml
│   │   │       ├── r1.yaml
│   │   │       ├── tir-limo.yaml
│   │   │       └── tir-openmath.yaml
│   │   ├── pipeline/
│   │   │   ├── genselect_generation.py
│   │   │   ├── problem_generation.py
│   │   │   └── solution_generation.py
│   │   ├── prompts/
│   │   │   ├── classify-if-binary.yaml
│   │   │   ├── classify-if-invalid.yaml
│   │   │   ├── classify-if-mcq.yaml
│   │   │   ├── classify-if-proof.yaml
│   │   │   ├── classify-tir-novelty.yaml
│   │   │   ├── classify-tir-significance.yaml
│   │   │   ├── convert-proofs.yaml
│   │   │   ├── extract-answers.yaml
│   │   │   ├── extract-problems.yaml
│   │   │   ├── math-tir-detailed.yaml
│   │   │   ├── summarize-genselect.yaml
│   │   │   └── summarize-solution.yaml
│   │   └── scripts/
│   │       ├── extract_python_fragments.py
│   │       ├── filter_novelty_significance.py
│   │       ├── genselect/
│   │       │   ├── extract_judgment.py
│   │       │   ├── merge_new_summary.py
│   │       │   ├── prepare_labeling_data.py
│   │       │   └── utils.py
│   │       ├── merge_new_summary.py
│   │       ├── postprocess_answer_extraction.py
│   │       ├── postprocess_classification.py
│   │       ├── postprocess_problem_extraction.py
│   │       ├── postprocess_proof_conversion.py
│   │       ├── postprocess_tir_generations.py
│   │       ├── prepare_raw_data.py
│   │       └── simplified_recipe.py
│   ├── openreasoning/
│   │   ├── eval.py
│   │   ├── prompts/
│   │   │   ├── science_question_augmentation_prompt.yaml
│   │   │   └── science_question_generation_prompt.yaml
│   │   └── scripts/
│   │       └── use_majority_if_no_answer.py
│   ├── opensciencereasoning/
│   │   ├── openscience_dataset_collection/
│   │   │   ├── README.md
│   │   │   ├── prompts/
│   │   │   │   ├── mcq_augment_inspired_by.yaml
│   │   │   │   ├── mcq_augment_similar.yaml
│   │   │   │   ├── mcq_four_options.yaml
│   │   │   │   ├── mcq_ten_options.yaml
│   │   │   │   └── subtopic_expansion.yaml
│   │   │   └── scripts/
│   │   │       └── filter_mcq_solutions.py
│   │   └── sdg_pipeline/
│   │       ├── README.md
│   │       ├── configs/
│   │       │   ├── pipelines/
│   │       │   │   └── base.yaml
│   │       │   └── settings/
│   │       │       ├── kimi_k2.yaml
│   │       │       ├── mcq_10_options.yaml
│   │       │       ├── mcq_4_options.yaml
│   │       │       ├── multiple_prompts.yaml
│   │       │       ├── python_enabled.yaml
│   │       │       ├── seed_data.yaml
│   │       │       ├── seed_data_postprocess.yaml
│   │       │       └── without_gt.yaml
│   │       ├── prompt/
│   │       │   ├── __init__.py
│   │       │   ├── configs/
│   │       │   │   ├── default_problem.yaml
│   │       │   │   └── topics_labeling.yaml
│   │       │   └── few_shots/
│   │       │       ├── __init__.py
│   │       │       └── topics.py
│   │       ├── run_pipeline.py
│   │       └── scripts/
│   │           ├── aggregate_difficulty.py
│   │           ├── aggregate_metadata.py
│   │           ├── aggregate_solutions.py
│   │           ├── aggregate_topics.py
│   │           ├── decontaminate.py
│   │           ├── extract_predictions.py
│   │           ├── filter_problems.py
│   │           ├── filter_solutions.py
│   │           ├── map_diversity_prompts.py
│   │           ├── prepare_topics.py
│   │           ├── process_messages_and_bucket.py
│   │           ├── remove_redundant_fields.py
│   │           ├── utils/
│   │           │   ├── constants.py
│   │           │   └── regex_constants.py
│   │           └── validate_pipeline.py
│   ├── proof-gen-verification/
│   │   ├── README.md
│   │   ├── configs/
│   │   │   └── judge-eval.yaml
│   │   ├── pipeline/
│   │   │   └── eval_judge.py
│   │   ├── prompts/
│   │   │   ├── genselect/
│   │   │   │   ├── default.yaml
│   │   │   │   ├── opc_instructions.yaml
│   │   │   │   └── proof_genselect_default.yaml
│   │   │   ├── math_judge/
│   │   │   │   ├── gemini_imo_judge_summary.yaml
│   │   │   │   ├── general.yaml
│   │   │   │   ├── general_summary.yaml
│   │   │   │   ├── general_summary_rubric.yaml
│   │   │   │   ├── judge_prompt_ablation/
│   │   │   │   │   ├── gemini1.yaml
│   │   │   │   │   ├── gemini2.yaml
│   │   │   │   │   ├── prompt1.yaml
│   │   │   │   │   ├── prompt2.yaml
│   │   │   │   │   ├── prompt3.yaml
│   │   │   │   │   ├── prompt4.yaml
│   │   │   │   │   ├── prompt5.yaml
│   │   │   │   │   ├── prompt5_rubric.yaml
│   │   │   │   │   └── prompt6_rubric.yaml
│   │   │   │   ├── lemma_break.yaml
│   │   │   │   ├── opc_judge.yaml
│   │   │   │   ├── opc_judge_summary.yaml
│   │   │   │   ├── opc_judge_summary_gt_proof.yaml
│   │   │   │   ├── opc_judge_summary_rubric.yaml
│   │   │   │   ├── proofbench_ms_ref.yaml
│   │   │   │   ├── proofbench_none.yaml
│   │   │   │   ├── proofbench_none_binary.yaml
│   │   │   │   ├── step_break.yaml
│   │   │   │   ├── step_judge_v2.yaml
│   │   │   │   ├── true_false_break.yaml
│   │   │   │   └── true_false_judge.yaml
│   │   │   ├── prover.yaml
│   │   │   └── prover_final_ans.yaml
│   │   └── scripts/
│   │       ├── build_final_ans_dataset.py
│   │       ├── combine_judgements.py
│   │       ├── final_answer_qs.py
│   │       ├── generate_generic_bon_dspy.py
│   │       ├── generate_generic_bon_generation.py
│   │       ├── generic_eval_bon.py
│   │       ├── genselect_judge_generation.py
│   │       ├── make_metrics_fa_qs.py
│   │       ├── make_rubric_generation.py
│   │       ├── script_generation.py
│   │       ├── sol_selection_generation.py
│   │       └── step_judgement_generation.py
│   └── translation/
│       ├── config/
│       │   └── qwen25.yaml
│       └── translate_jsonl.py
├── requirements/
│   ├── audio.txt
│   ├── code_execution.txt
│   ├── common-dev.txt
│   ├── common-tests.txt
│   ├── docs.txt
│   ├── pipeline.txt
│   └── stem.txt
├── tests/
│   ├── __init__.py
│   ├── conftest.py
│   ├── data/
│   │   ├── code-output.test
│   │   ├── contamination-example.test
│   │   ├── dummy_external_benchmark/
│   │   │   ├── benchmark_map.json
│   │   │   ├── my_benchmarks/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── dataset/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── my_simple_bench/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── prepare.py
│   │   │   │   │   └── word_count/
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       └── prepare.py
│   │   │   │   ├── evaluation/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── word_count.py
│   │   │   │   ├── inference/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── word_count.py
│   │   │   │   ├── metrics/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── word_count.py
│   │   │   │   └── prompt/
│   │   │   │       └── eval/
│   │   │   │           └── word_count/
│   │   │   │               └── default.yaml
│   │   │   └── pyproject.toml
│   │   ├── eval_outputs/
│   │   │   ├── eval-results/
│   │   │   │   ├── answer-judge/
│   │   │   │   │   ├── output-rs0.jsonl-test
│   │   │   │   │   ├── output-rs1.jsonl-test
│   │   │   │   │   ├── output-rs2.jsonl-test
│   │   │   │   │   └── output-rs3.jsonl-test
│   │   │   │   ├── arena-hard/
│   │   │   │   │   └── output.jsonl-test
│   │   │   │   ├── gpqa/
│   │   │   │   │   ├── output-rs0.jsonl-test
│   │   │   │   │   ├── output-rs1.jsonl-test
│   │   │   │   │   ├── output-rs2.jsonl-test
│   │   │   │   │   └── output-rs3.jsonl-test
│   │   │   │   ├── hendrycks_math/
│   │   │   │   │   ├── output-rs0.jsonl-test
│   │   │   │   │   ├── output-rs1.jsonl-test
│   │   │   │   │   └── output-rs2.jsonl-test
│   │   │   │   ├── human-eval/
│   │   │   │   │   ├── output-rs0.jsonl-test
│   │   │   │   │   └── output-rs1.jsonl-test
│   │   │   │   ├── ifeval/
│   │   │   │   │   ├── output-rs0.jsonl-test
│   │   │   │   │   ├── output-rs1.jsonl-test
│   │   │   │   │   └── output-rs2.jsonl-test
│   │   │   │   ├── metrics-ms8192.json-test
│   │   │   │   ├── metrics.json-test
│   │   │   │   └── minif2f/
│   │   │   │       ├── output-rs0.jsonl-test
│   │   │   │       ├── output-rs1.jsonl-test
│   │   │   │       ├── output-rs2.jsonl-test
│   │   │   │       └── output-rs3.jsonl-test
│   │   │   ├── summarize_results_output-ms8192.txt
│   │   │   └── summarize_results_output.txt
│   │   ├── multi_model_eval_smoke.py
│   │   ├── nemo_evaluator/
│   │   │   ├── example-eval-config.yaml
│   │   │   └── example-gpu-test-config.yaml
│   │   ├── openai-input-dict.test
│   │   ├── openai-input-list.test
│   │   ├── openmathinstruct2.test
│   │   ├── output-rs0.test
│   │   ├── output-rs1.test
│   │   ├── output-rs2.test
│   │   ├── small-grpo-data.test
│   │   ├── small-sft-data-messages.test
│   │   └── small-sft-data.test
│   ├── gpu-tests/
│   │   ├── __init__.py
│   │   ├── make_tiny_llm.py
│   │   ├── run_qwen.sh
│   │   ├── test-local.yaml
│   │   ├── test_contamination.py
│   │   ├── test_context_retry.py
│   │   ├── test_eval.py
│   │   ├── test_external_benchmark_eval.py
│   │   ├── test_generate.py
│   │   ├── test_judge.py
│   │   ├── test_nemo_evaluator.py
│   │   ├── test_nemo_gym_rollouts.py
│   │   ├── test_run_cmd_llm_infer.py
│   │   ├── test_sandbox_mounts.py
│   │   ├── test_tool_calling.py
│   │   ├── test_train.py
│   │   ├── test_vllm_audio.py
│   │   └── utils.py
│   ├── scripts/
│   │   └── run_cmd_llm_infer_check.py
│   ├── slurm-tests/
│   │   ├── README.md
│   │   ├── asr_nim/
│   │   │   ├── README.md
│   │   │   ├── asr.test
│   │   │   ├── check_results.py
│   │   │   └── run_test.py
│   │   ├── clone_and_run.sh
│   │   ├── gpt_oss_python_aime25/
│   │   │   ├── check_results.py
│   │   │   └── run_test.py
│   │   ├── nano_30b_tool_calling/
│   │   │   ├── check_results.py
│   │   │   └── run_test.py
│   │   ├── omr_simple_recipe/
│   │   │   ├── check_results.py
│   │   │   └── run_test.py
│   │   ├── qwen3_4b_evals/
│   │   │   ├── check_results.py
│   │   │   └── run_test.py
│   │   ├── qwen3_4b_ray_executor/
│   │   │   ├── check_results.py
│   │   │   └── run_test.py
│   │   ├── qwen3coder_30b_swebench/
│   │   │   ├── check_results.py
│   │   │   └── run_test.py
│   │   ├── run_all.sh
│   │   ├── stem_sdg_pipeline/
│   │   │   └── run_test.py
│   │   ├── super_120b_aime25/
│   │   │   ├── check_results.py
│   │   │   ├── run_test.py
│   │   │   └── trtllm-extra-llm-api-config.yml
│   │   ├── super_49b_evals/
│   │   │   ├── check_results.py
│   │   │   └── run_test.py
│   │   ├── tts_nim/
│   │   │   ├── README.md
│   │   │   ├── check_results.py
│   │   │   ├── run_test.py
│   │   │   └── tts.test
│   │   ├── unified_asr/
│   │   │   ├── asr_openai.test
│   │   │   ├── check_results.py
│   │   │   └── run_test.py
│   │   ├── unified_tts/
│   │   │   ├── README.md
│   │   │   ├── check_results.py
│   │   │   ├── run_test.py
│   │   │   └── tts_openai.test
│   │   ├── utils.py
│   │   └── wmt24pp_gym_topology/
│   │       ├── README.md
│   │       ├── check_results.py
│   │       └── run_test.py
│   ├── test_arena_metrics.py
│   ├── test_base_metrics.py
│   ├── test_code_execution.py
│   ├── test_configs.py
│   ├── test_data_preparation.py
│   ├── test_datasets.py
│   ├── test_declarative_pipeline.py
│   ├── test_default_args.py
│   ├── test_dependency_isolation.py
│   ├── test_eval.py
│   ├── test_external_benchmarks.py
│   ├── test_generation.py
│   ├── test_magpie_tts_backend.py
│   ├── test_math_equal.py
│   ├── test_mcp_clients.py
│   ├── test_metrics.py
│   ├── test_nemo_asr_backend.py
│   ├── test_nemo_evaluator_pipeline.py
│   ├── test_nvidia_inference_api.py
│   ├── test_pipeline_utils.py
│   ├── test_prompts.py
│   ├── test_prover.py
│   ├── test_ray_executor.py
│   ├── test_sandbox_fork_exc_leak.py
│   ├── test_sandbox_network_blocking.py
│   ├── test_session_affinity.py
│   ├── test_streaming_tool_calling.py
│   ├── test_unified_server_audio_parser.py
│   ├── test_unified_server_batcher.py
│   ├── test_unified_server_error_handling.py
│   ├── test_vllm_audio.py
│   └── test_vlm.py
└── tools/
    ├── pyproject.toml
    └── requirements.txt